00001
00018
00019
00020
00021
00022
00023 #include "HuffwordLib.h"
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
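/*
 * Huffw_compress: word-based (Huffword) compression of an in-memory text.
 * Pipeline: tokenize the text and count token frequencies in a hash table
 * (HParse_text), build a canonical 128-ary Huffman code plus the token
 * dictionary, encode the body and the jumpers (HCompress_getBodyAndConsole),
 * then serialize everything into a single byte string (HCompress_getString).
 * The first byte of the output is a flag: 0 for this variant, 1 for the
 * spaceless variant below.
 *
 * Minimal usage sketch (illustrative only; text/tlen come from the caller and
 * the jump_value of 1024 is an arbitrary example):
 *
 *   char *ctext; int clen;
 *   Huffw_compress(text, tlen, 1024, &ctext, &clen, 0);
 *
 *   char *back; int backlen;
 *   Huffw_decompress(ctext, clen, &back, &backlen, 0);
 */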
00050 void Huffw_compress(char *text, int tlen, int jump_value, char **cstring, int *clen, int Verbose)
00051 {
00052
00053 char *body;
00054 int blen;
00055 Console c;
00056
00057
00058 if ((tlen <= 0) || (text == NULL)){
00059 fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (Huffw_compress)\n");
00060 exit(-1); }
00061
00062
00063
00064 HHashtable_init(&(c.hashtable),tlen/1000+5);
00065
00066 HParse_text(text,tlen,&(c.hashtable));
00067
00068 HCompress_getBodyAndConsole(text,tlen,jump_value,&body,&blen,&c);
00069
00070 HCompress_getString(cstring,clen,tlen,body,blen,&c);
00071
00072 free(body);
00073
00074
00075 **cstring = 0;
00076
00077 if(Verbose)
00078 Huffw_PrintInfo(&c,blen,tlen,*clen,0,Verbose);
00079
00080 }
00081
00082
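/*
 * Huffw_decompress: inverse of Huffw_compress. Checks the flag byte (must be
 * 0), rebuilds the Console (canonical code, dictionary, jumpers, hash table)
 * from the header, then decodes the body token by token.
 */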
00093 void Huffw_decompress(char *cstring, int clen, char **text, int *tlen, int Verbose)
00094 {
00095 Console c;
00096 char *body;
00097 int blen;
00098
00099 if ((clen <= 0) || (cstring == NULL)){
00100 fprintf(stderr,"Fatal Error: You're trying to decompress an empty file! (Huffw_decompress)\n");
00101 exit(-1); }
00102
00103
00104 if(*cstring != 0){
00105 fprintf(stderr,"Fatal Error: First byte of compressed file not 0, actually %d\n", *cstring);
00106 exit(-1); }
00107
00108
00109 HDecompress_getBodyAndConsole(cstring,clen,tlen,&body,&blen,&c);
00110
00111 HDecompress_nextBlock_bytes(body,blen,text,tlen, &c);
00112
00113 if(Verbose)
00114 Huffw_PrintInfo(&c,blen,*tlen,clen,0,Verbose);
00115
00116 }
00117
00118
00119
00120
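/*
 * Huffw_spaceless_compress: like Huffw_compress, but after encoding the body
 * it drops the codeword of a single blank that separates two alphanumeric
 * tokens (the "spaceless" model); such blanks are implicit and re-inserted at
 * decompression time. The flag byte of the output is set to 1, and no jumpers
 * are built (jump_value is forced to 0).
 */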
00134 void Huffw_spaceless_compress(char *text, int tlen, char **cstring, int *clen, int Verbose)
00135 {
00136
00137 Console c;
00138 char *body, *body_spaceless;
00139 int blen, blen_spaceless;
00140 int jump_value = 0;
00141
00142 if ((tlen <= 0) || (text == NULL)){
00143 fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (Huffw_spaceless_compress)\n");
00144 exit(-1); }
00145
00146
00147 HHashtable_init(&(c.hashtable),tlen/1000+5);
00148
00149 HParse_text(text,tlen,&(c.hashtable));
00150
00151 HCompress_getBodyAndConsole(text,tlen,jump_value,&body,&blen,&c);
00152
00153 HCompress_contractSpaces(body,blen,&body_spaceless, &blen_spaceless, &c);
00154
00155 free(body);
00156
00157 HCompress_getString(cstring,clen,tlen,body_spaceless,blen_spaceless,&c);
00158
00159 free(body_spaceless);
00160
00161
00162 **cstring = 1;
00163
00164 if(Verbose)
00165 Huffw_PrintInfo(&c,blen_spaceless,tlen,*clen,1,Verbose);
00166
00167 }
00168
00169
00170
00171
00183 void Huffw_spaceless_decompress(char *cstring, int clen, char **text, int *tlen, int Verbose)
00184 {
00185 Console c;
00186 char *body, *body_spaceless;
00187 int blen, blen_spaceless;
00188
00189 if ((clen <= 0) || (cstring == NULL)){
00190 fprintf(stderr,"Fatal Error: You're trying to decompress an empty file! (Huffw_spaceless_decompress)\n");
00191 exit(-1); }
00192
00193
00194
00195 if(*cstring != 1){
00196 fprintf(stderr,"Fatal Error: First byte of compressed file not 1, actually %d\n", *cstring);
00197 exit(-1); }
00198
00199
00200 HDecompress_getBodyAndConsole(cstring,clen,tlen,&body_spaceless,&blen_spaceless,&c);
00201
00202 HDecompress_expandSpaces(body_spaceless, blen_spaceless, &body, &blen,&c);
00203
00204 HDecompress_nextBlock_bytes(body,blen,text,tlen,&c);
00205
00206 if(Verbose)
00207 Huffw_PrintInfo(&c,blen_spaceless,*tlen,clen,1,Verbose);
00208
00209 }
00210
00211
00212
00225 void Huffw_PrintInfo(Console *c, int ctext_len, int text_len, int cstring_len, int rule, int Verbose)
00226 {
00227 int i,j;
00228 float f = ((float)cstring_len)/((float)text_len);
00229
00230 if (c->jumpers.number){
00231 j = text_len/c->jumpers.number;
00232 } else { j=0; }
printf("\n------------------- Global info --------------------\n\n");
00234 printf("Parsing rule = %d\n", rule);
00235 printf("Text length = %d\n", text_len);
00236 printf("Compressed body length = %d\n", ctext_len);
00237 printf("Dictionary size = %d, Dictionary length = %d\n",c->dictionary.num_tokens,c->dictionary.length);
00238 printf("Max codeword length = %d\n",c->canoinfo.max_cwlen);
00239 printf("Number of jumpers = %d, average jump size = %d chars\n\n", c->jumpers.number,j);
00240 for(i=c->canoinfo.max_cwlen; i>0;i--)
00241 printf("Offset[%d] = %d, First_cw[%d] = %x\n",i,c->canoinfo.offsetcw[i],i,c->canoinfo.firstcw[i]);
00242 printf("\n Overall compressed file length = %d, compression ratio (percentage) = %.1f \n\n",
00243 cstring_len, f * 100);
00244
00245 if (Verbose > 1)
00246 HDictionary_print(&(c->dictionary), &(c->hashtable), 1);
00247
00248 }
00249
00250
00251
00252
00253
00254
00255
00256
00257
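/*
 * HParse_text: scans the text once, splitting it into tokens with
 * HToken_getLengthNext (maximal runs of alphanumeric characters, or runs of
 * separator characters, with special handling of newlines) and inserts every
 * occurrence into the hash table so that token frequencies are available for
 * the Huffman construction.
 */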
00273 void HParse_text(char *text, int text_len, HHash_table *ht)
00274 {
00275 int token_len,i;
00276 char *token;
00277
00278
00279 if ((text_len <= 0) || (text == NULL)){
00280 fprintf(stderr,"Fatal Error: You're trying to parse an empty file! (HParse_text)\n");
00281 exit(-1); }
00282
00283 for(i=0; i<text_len; i+=token_len){
00284
00285
00286 HToken_getLengthNext(text+i,text_len-i,&token_len);
00287 token = text+i;
00288
00289
00290 HHashtable_insert(token,token_len,ht);
00291 }
00292
00293 }
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
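/*
 * HCompress_getBody: re-scans the text token by token, looks up each token's
 * tagged codeword in the hash table and appends it to *ctext. The output
 * buffer starts at half the text length and grows geometrically
 * (resize_factor) whenever it would overflow.
 */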
00317 void HCompress_getBody(char *text, int text_len, HHash_table *ht, char **ctext, int *ctext_len)
00318
00319 {
00320 int i;
00321 int token_len, ctext_allocated;
00322 char *token;
00323 int taggedcw;
00324 int taggedcwlen;
00325 char *s;
00326 float resize_factor = 1.4;
00327
00328
00329 if ((text_len <= 0) || (text == NULL)){
00330 fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_getBody)\n");
00331 exit(-1); }
00332
00333
00334 *ctext_len=0;
00335 ctext_allocated = text_len/2;
00336 *ctext = (char *) malloc((ctext_allocated+1) * sizeof(char));
00337
00338 if((*ctext == NULL) || (text_len == 0)){
00339 fprintf(stderr,"Error: memory allocation (HCompress_getBody)\n");
00340 exit(-1); }
00341
00342 for(i=0; i<text_len; i+=token_len){
00343
00344
00345 token = text+i;
00346 HToken_getLengthNext(text+i,text_len-i,&token_len);
00347
00348
00349 HCodeword_TaggedFromToken(token,token_len,ht,&taggedcw,&taggedcwlen);
00350
00351
00352 HCodeword_tostring(taggedcw,taggedcwlen,&s);
00353
00354
00355 if( (*ctext_len+taggedcwlen) >= ctext_allocated ) {
00356 *ctext = (char*) realloc(*ctext,(resize_factor*ctext_allocated + taggedcwlen + 1)*sizeof(char));
00357 if ( !(*ctext) ) { fprintf(stderr, "memory exhausted in HCompress_getBody\n"); exit(-1); }
00358 ctext_allocated = ctext_allocated * resize_factor + taggedcwlen + 1;
00359 }
00360
00361
00362 memcpy((*ctext) + (*ctext_len),s,taggedcwlen);
00363 (*ctext_len) += taggedcwlen;
00364
00365 free(s);
00366 }
00367 }
00368
00369
00370
00371
00372
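/*
 * HCompress_getBodyAndConsole: fills the Console for compression.
 * Steps: build the 128-ary Huffman tree from the hash table, derive the
 * canonical code information, build the token dictionary, encode the body,
 * and finally compute the jumpers used for random access.
 */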
00387 void HCompress_getBodyAndConsole(char *text, int tlen, int jump_value, char **body, int *blen,
00388 Console *console)
00389 {
00390
00391 Hash_nodeptr_array tree;
00392 Canonical *cano = &(console->canoinfo);
00393 Dictionary *dict = &(console->dictionary);
00394 Jumpers *jump = &(console->jumpers);
00395 HHash_table *ht = &(console->hashtable);
00396
00397
00398 if ((tlen <= 0) || (text == NULL)){
00399 fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_getBodyAndConsole)\n");
00400 exit(-1); }
00401
00402
00403
00404 Hufftree_build(&tree,ht);
00405
00406
00407 HCanonical_fromtree(tree,cano,ht);
00408
00409
00410 HDictionary_fromtree(tree,cano->num_tokens,dict);
00411
00412
00413 HCompress_getBody(text,tlen,ht,body,blen);
00414
00415
00416 HJumpers_fromds(text,tlen,ht,jump_value,jump);
00417
00418 }
00419
00420
00421
00422
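/*
 * HCompress_getString: serializes the compressed file. Layout (sizes in
 * bytes, integers written with HInt_tostring):
 *   1                flag byte (0 = plain, 1 = spaceless; set by the caller)
 *   4                original text length
 *   4                number of distinct tokens
 *   36               canonical code info (max_cwlen + firstcw[1..4] + offsetcw[1..4])
 *   4 + 8*jumpers    jumper table
 *   4 + dict length  dictionary (newline-terminated tokens)
 *   4 + body length  encoded body
 * hence global_length = 57 + 8 * jump->number + dict->length + blen.
 */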
00449 void HCompress_getString(char **ctext, int *ctext_len, int text_len,char *body, int blen,
00450 Console *console)
00451 {
00452 int global_length;
00453 Canonical *cano = &(console->canoinfo);
00454 Dictionary *dict = &(console->dictionary);
00455 Jumpers *jump = &(console->jumpers);
00456
00457
00458 if ((text_len <= 0) || (body == NULL) || (blen <=0)){
00459 fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_getString)\n");
00460 exit(-1); }
00461
00462
00463 global_length = 57 + 8 * jump->number + dict->length + blen;
00464
00465 *ctext = (char *) malloc(global_length * sizeof(char));
00466 *ctext_len=0;
00467
00468
00469 *ctext_len += 1;
00470
00471 HInt_tostring(*ctext + *ctext_len, text_len);
00472 *ctext_len += 4;
00473
00474 HInt_tostring(*ctext + *ctext_len, dict->num_tokens);
00475 *ctext_len += 4;
00476
00477 HCanonical_tostring(*ctext + *ctext_len, cano);
00478 *ctext_len += 36;
00479
00480 HJumpers_tostring(*ctext + *ctext_len, jump);
00481 *ctext_len += 8 * jump->number + 4;
00482
00483 HInt_tostring(*ctext + *ctext_len, dict->length);
00484 *ctext_len += 4;
00485 memcpy(*ctext + *ctext_len, dict->content, dict->length);
00486 *ctext_len += dict->length;
00487
00488 HInt_tostring(*ctext + *ctext_len, blen);
00489 *ctext_len += 4;
00490 memcpy(*ctext + *ctext_len, body, blen);
00491 *ctext_len += blen;
00492
00493 if (global_length != *ctext_len){
00494 fprintf(stderr,"Error: computing the compressed file length (HCompress_getString)\n");
00495 exit(-1); }
00496
00497
00498 }
00499
00500
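/*
 * HCompress_contractSpaces: removes the codeword of a single blank that lies
 * between two alphanumeric tokens. A small automaton tracks the context:
 *   state 0 = separator seen, 1 = alphanumeric token seen,
 *   state 2 = single blank right after an alphanumeric token,
 *   state 3 = that blank is followed by another alphanumeric token (drop it).
 * While in state 2 the blank codeword is deferred; if the next token is not
 * alphanumeric (or the body ends there) the deferred blank is written out.
 */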
00511 void HCompress_contractSpaces(char *body, int blen, char **body_spaceless, int *blen_spaceless, Console *console)
00512 {
00513 char *token, *sstring;
00514 int token_len, tcw_len, i;
00515 int spacecw, spacecw_len;
00516 char spacecw_string[4];
00517 int state, next_state;
00518
00519
00520 if ((body == NULL) || (blen <=0)){
00521 fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_contractSpaces)\n");
00522 exit(-1); }
00523
00524
00525 if(HHashtable_search(" ",1,&(console->hashtable)) == NULL){
00526 *body_spaceless = body;
00527 *blen_spaceless = blen;
00528 } else {
00529
00530 *body_spaceless = (char *) malloc(blen * sizeof(char));
00531 if (*body_spaceless == NULL) {
fprintf(stderr,"Fatal Error: No memory for spaceless body (HCompress_contractSpaces)\n");
00533 exit(-1); }
00534
00535
00536
00537 HCodeword_TaggedFromToken(" ", 1, &(console->hashtable), &spacecw, &spacecw_len);
00538 HInt_tostring(spacecw_string,spacecw);
00539 sstring = spacecw_string + (4-spacecw_len);
00540
00541 *blen_spaceless = 0;
00542
00543
00544
00545
00546
00547
00548 state = 0;
00549
00550 for(i=0; i<blen; i += tcw_len) {
00551
00552 HToken_decompressNext(body+i,blen-i,&token,&token_len,&tcw_len,console);
00553
00554
00555 if ((state == 0) && (isalnum(*token)))
00556 next_state = 1;
else if ((state == 1) && (memcmp(token," ",1) == 0) && (token_len == 1))
00558 next_state = 2;
00559 else if ((state == 1) && (isalnum(*token)))
00560 next_state = 1;
00561 else if ((state == 2) && (isalnum(*token)))
00562 next_state = 3;
00563 else next_state = 0;
00564
00565 free(token);
00566
00567
00568 if ((state == 2) && (next_state != 3)) {
00569 memcpy(*body_spaceless + *blen_spaceless, sstring, spacecw_len);
00570 *blen_spaceless += spacecw_len;
00571 }
00572 if (next_state != 2){
00573 memcpy(*body_spaceless + *blen_spaceless, body+i, tcw_len);
00574 *blen_spaceless += tcw_len;
00575 }
00576
00577
00578 if (next_state == 3)
00579 next_state = 1;
00580 state = next_state;
00581
00582 }
00583
00584
00585 if (state == 2) {
00586 memcpy(*body_spaceless + *blen_spaceless, sstring, spacecw_len);
00587 *blen_spaceless += spacecw_len;
00588 }
00589 }
00590
00591 }
00592
00593
00594
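/*
 * HDecompress_getBodyAndConsole: parses the header written by
 * HCompress_getString (same fields, same order) and rebuilds the Console.
 * Note that *body is not a copy: it points directly into the ctext buffer.
 */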
00609 void HDecompress_getBodyAndConsole(char *ctext, int ctext_len, int *text_len, char **body, int *blen,
00610 Console *console)
00611 {
00612 int i;
00613 int len;
00614 Canonical *cano = &(console->canoinfo);
00615 Dictionary *dict = &(console->dictionary);
00616 Jumpers *jump = &(console->jumpers);
00617 HHash_table *hashtable = &(console->hashtable);
00618
00619
00620
00621 if ((ctext == NULL) || (ctext_len <=0)){
00622 fprintf(stderr,"Fatal Error: You're trying to decompress an empty file! (HDecompress_getBodyAndConsole)\n");
00623 exit(-1); }
00624
00625
00626 i=0;
00627
00628
00629 i += 1;
00630
00631 *text_len = HInt_fromstring(ctext + i);
00632 i += 4;
00633
00634 cano->num_tokens = HInt_fromstring(ctext + i);
00635 i += 4;
00636
00637 HCanonical_fromstring(ctext + i, cano);
00638 i += 36;
00639
00640 HJumpers_fromstring(ctext + i, jump);
00641 i += 8 * jump->number + 4;
00642
00643 len = HInt_fromstring(ctext + i);
00644 i += 4;
00645 HDictionary_fromstring(ctext+i,len,cano->num_tokens,dict);
00646 i += dict->length;
00647
00648 HHashtable_fromdict(dict,cano,hashtable);
00649
00650 *blen = HInt_fromstring(ctext + i);
00651 i += 4;
00652 *body = ctext + i;
00653 i += *blen;
00654
00655 if (i != ctext_len){
00656 fprintf(stderr,"Error: computing the compressed file length (HDecompress_getBodyAndConsole)\n");
00657 exit(-1); }
00658
00659 }
00660
00661
00662
00663
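/*
 * HDecompress_expandSpaces: inverse of HCompress_contractSpaces. Whenever two
 * consecutive decoded tokens are both alphanumeric, the codeword of a blank
 * is re-inserted between them.
 */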
00674 void HDecompress_expandSpaces(char *body_spaceless, int blen_spaceless, char **body, int *blen, Console *console)
00675 {
00676
00677 char *token, *sstring;
00678 int token_len, tcw_len, i;
00679 int spacecw, spacecw_len;
00680 char spacecw_string[4];
00681 int state, next_state;
00682
00683
00684 if ( (body_spaceless == NULL) || (blen_spaceless <= 0) ) {
00685 *body = NULL;
00686 *blen = 0;
00687 return ;
00688 }
00689
00690
/* If the blank token is not in the dictionary, no space was ever contracted:
   the spaceless body can be returned as is (no extra buffer is needed). */
if(HHashtable_search(" ",1,&(console->hashtable)) == NULL){
*body = body_spaceless;
*blen = blen_spaceless;
} else {

/* The expanded body is written into a buffer of twice the spaceless length. */
*body = (char *) malloc(blen_spaceless * 2 * sizeof(char));
*blen = 0;
if (*body == NULL) {
fprintf(stderr,"Fatal Error: No memory for body (HDecompress_expandSpaces)\n");
exit(-1); }
00703
00704 HCodeword_TaggedFromToken(" ", 1, &(console->hashtable), &spacecw, &spacecw_len);
00705 HInt_tostring(spacecw_string,spacecw);
00706 sstring = spacecw_string + (4-spacecw_len);
00707
00708
00709
00710
00711
00712
00713 state = 0;
00714
00715 for(i=0; i<blen_spaceless; i += tcw_len) {
00716
00717 HToken_decompressNext(body_spaceless+i,blen_spaceless-i,&token,&token_len,&tcw_len,console);
00718
00719
00720 if ((state == 0) && (isalnum(*token)))
00721 next_state = 1;
00722 else if ((state == 1) && (isalnum(*token)))
00723 next_state = 3;
00724 else next_state = 0;
00725
00726 free(token);
00727
00728
00729 if (next_state == 3) {
00730 memcpy(*body + *blen, sstring, spacecw_len);
00731 *blen += spacecw_len;
00732 }
00733 memcpy(*body + *blen, body_spaceless+i, tcw_len);
00734 *blen += tcw_len;
00735
00736
00737 if (next_state == 3)
00738 next_state = 1;
00739 state = next_state;
00740 }
00741 }
00742 }
00743
00744
00745
00746
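/*
 * HDecompress_nextBlock_bytes: decodes num_bytes of compressed body starting
 * at a tagged byte (the first byte of a codeword), appending the decoded
 * tokens to a geometrically growing output buffer.
 */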
00761 void HDecompress_nextBlock_bytes(char *ctext, int num_bytes, char **text, int *text_len, Console *console)
00762 {
00763 int i;
00764 int token_len, allocated_text;
00765 char *token;
00766 int taggedcwlen;
00767 float resize_factor = 1.4;
00768
00769
00770 if ( (ctext == NULL) || (num_bytes <= 0) ) {
00771 *text = NULL;
00772 *text_len = 0;
00773 return ;
00774 }
00775
00776
00777 if (((*ctext) & (0x80)) == 0){
00778 fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_bytes)\n");
00779 exit(-1); }
00780
00781 *text_len=0;
00782 allocated_text = 2 * num_bytes;
00783 *text = (char *) malloc(allocated_text * sizeof(char));
00784
00785 if((*text == NULL) || (num_bytes == 0)){
00786 fprintf(stderr,"Error: memory allocation (HDecompress_nextBlock_bytes)\n");
00787 exit(-1); }
00788
00789 for(i=0; i<num_bytes; i+=taggedcwlen){
00790
00791
00792 HToken_decompressNext(ctext+i,num_bytes-i,&token,&token_len,&taggedcwlen,console);
00793
00794
00795 if( (*text_len+token_len) >= allocated_text) {
00796 *text = (char*)realloc(*text,(resize_factor*allocated_text+1+token_len)*sizeof(char));
00797 if ( !(*text) ) { fprintf(stderr, "memory exhausted in HDecompress_nextBlock_bytes\n"); exit(-1); }
00798 allocated_text = allocated_text * resize_factor + 1 + token_len;
00799 }
00800
00801
00802 memcpy((*text) + (*text_len),token,token_len);
00803 (*text_len) += token_len;
00804
00805 free(token);
00806 }
00807
00808 }
00809
00810
00811
00812
00813
00827 void HDecompress_nextBlock_bytes_spaceless(char *ctext, int num_bytes, char **text,
00828 int *text_len, Console *console)
00829 {
00830 int exp_len;
00831 char *exp_text;
00832
00833
00834
00835 if ( (ctext == NULL) || (num_bytes <= 0) ) {
00836 *text = NULL;
00837 *text_len = 0;
00838 return ;
00839 }
00840
00841
00842 if (((*ctext) & (0x80)) == 0){
00843 fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_bytes_spaceless)\n");
00844 exit(-1); }
00845
00846
00847
00848 HDecompress_expandSpaces(ctext, num_bytes, &exp_text, &exp_len,console);
00849
00850
00851 HDecompress_nextBlock_bytes(exp_text, exp_len, text, text_len, console);
00852
00853 }
00854
00855
00856
00857
00858
00859
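/*
 * HDecompress_previousBlock_bytes: decodes the block that ends at ctext,
 * scanning backwards. The tag bit on the first byte of every codeword allows
 * codeword boundaries to be recovered while moving backwards; the tokens are
 * then re-decoded in forward order into *text.
 */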
00875 void HDecompress_previousBlock_bytes(char *ctext, int num_bytes, char **text, int *text_len,
00876 Console *console)
00877 {
00878 int i;
00879 int token_len, allocated_text;
00880 char *token;
00881 int taggedcwlen;
00882 float resize_factor = 1.4;
00883
00884
00885
00886 if ( (ctext == NULL) || (num_bytes <= 0) ) {
00887 *text = NULL;
00888 *text_len = 0;
00889 return ;
00890 }
00891
00892
00893 if (((*ctext) & (0x80)) == 0){
00894 fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_bytes)\n");
00895 exit(-1); }
00896
00897 *text_len=0;
00898 allocated_text = 2 * num_bytes;
00899 *text = (char *) malloc(allocated_text * sizeof(char));
00900
00901 if((*text == NULL) || (num_bytes == 0)){
00902 fprintf(stderr,"Error: memory allocation (HDecompress_previousBlock_bytes)\n");
00903 exit(-1); }
00904
00905
00906 for(i=0; i<num_bytes; i+=taggedcwlen){
00907
00908
00909 HToken_decompressPrevious(ctext-i,num_bytes-i,&token,&token_len,&taggedcwlen,console);
00910
00911 free(token);
00912
00913 }
00914
00915 i -= taggedcwlen;
00916
00917 for(; i>=0; i-=taggedcwlen){
00918
00919
00920 HToken_decompressNext(ctext-i,num_bytes-i,&token,&token_len,&taggedcwlen,console);
00921
00922
00923 if( (*text_len+token_len) >= allocated_text) {
00924 *text = (char*)realloc(*text,(resize_factor*allocated_text+1+token_len)*sizeof(char));
00925 if ( !(*text) ) { fprintf(stderr, "memory exhausted in HDecompress_previousBlock_bytes\n"); exit(-1); }
00926 allocated_text = allocated_text * resize_factor + token_len + 1;
00927 }
00928
00929
00930 memcpy((*text) + (*text_len),token,token_len);
00931 (*text_len) += token_len;
00932
00933 free(token);
00934 }
00935
00936 }
00937
00938
00952 void HDecompress_previousBlock_bytes_spaceless(char *ctext, int num_bytes, char **text,
00953 int *text_len, Console *console)
00954 {
00955 int exp_len;
00956 char *exp_text;
00957
00958
00959
00960 if ( (ctext == NULL) || (num_bytes <= 0) ) {
00961 *text = NULL;
00962 *text_len = 0;
00963 return ;
00964 }
00965
00966 if (((*ctext) & (0x80)) == 0){
00967 fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_bytes_spaceless)\n");
00968 exit(-1); }
00969
00970
00971
00972 while( ( (*(ctext-num_bytes)) & (0x80) ) == 0){
00973 num_bytes++;
00974 }
00975
00976
00977
00978 HDecompress_expandSpaces(ctext-num_bytes, num_bytes, &exp_text, &exp_len,console);
00979
00980
00981 HDecompress_nextBlock_bytes(exp_text, exp_len, text, text_len, console);
00982
00983 }
00984
00985
00986
00987
01004 void HDecompress_nextBlock_tokens(char *ctext, int bytes_left, int num_obj,
01005 char **text, int *text_len, int *decoded,
01006 Console *console)
01007 {
01008 int token_len;
01009 char *token;
01010 int taggedcwlen;
01011
01012
01013
01014 if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01015 *text = NULL;
01016 *text_len = 0;
01017 *decoded = 0;
01018 return ;
01019 }
01020
01021
01022 if (((*ctext) & (0x80)) == 0){
01023 fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_tokens)\n");
01024 exit(-1); }
01025
01026 for(*decoded=0; (*decoded < bytes_left) && (num_obj > 0); *decoded += taggedcwlen){
01027
01028
01029 HToken_decompressNext(ctext + *decoded,bytes_left-*decoded,
01030 &token,&token_len,&taggedcwlen,console);
01031
01032 if (isalnum(token[0])){
01033 num_obj--;
01034 }
01035
01036 free(token);
01037 }
01038
01039
01040 HDecompress_nextBlock_bytes(ctext,*decoded,text,text_len,console);
01041
01042 }
01043
01044
01045
01046
01061 void HDecompress_nextBlock_tokens_spaceless(char *ctext, int bytes_left, int num_obj,
01062 char **text, int *text_len, int *decoded,
01063 Console *console)
01064 {
01065 int text_tmp_len,exp_len;
01066 char *text_tmp, *exp_text;
01067
01068
01069
01070 if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01071 *text = NULL;
01072 *text_len = 0;
01073 *decoded = 0;
01074 return ;
01075 }
01076
01077
01078 if (((*ctext) & (0x80)) == 0){
01079 fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_tokens_spaceless)\n");
01080 exit(-1); }
01081
01082
01083 HDecompress_nextBlock_tokens(ctext,bytes_left,num_obj,
01084 &text_tmp,&text_tmp_len,decoded,console);
01085
01086
01087 HDecompress_expandSpaces(ctext,*decoded, &exp_text, &exp_len,console);
01088
01089
01090
01091 HDecompress_nextBlock_bytes(exp_text,exp_len,text,text_len,console);
01092
01093 }
01094
01095
01096
01097
01098
01115 void HDecompress_previousBlock_tokens(char *ctext, int bytes_left, int num_obj,
01116 char **text, int *text_len, int *decoded,
01117 Console *console)
01118 {
01119 int token_len;
01120 char *token;
01121 int taggedcwlen;
01122
01123
01124
01125 if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01126 *text = NULL;
01127 *text_len = 0;
01128 *decoded = 0;
01129 return ;
01130 }
01131
01132
01133 if (((*ctext) & (0x80)) == 0){
01134 fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_tokens)\n");
01135 exit(-1); }
01136
01137 for(*decoded=0; (*decoded < bytes_left) && (num_obj > 0); *decoded += taggedcwlen){
01138
01139
01140 HToken_decompressPrevious(ctext - *decoded, bytes_left - *decoded,
01141 &token,&token_len,&taggedcwlen,console);
01142 if (isalnum(token[0])){
01143 num_obj--;
01144 }
01145
01146 free(token);
01147 }
01148
01149 if (((*(ctext - *decoded)) & (0x80)) ==0){
fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_tokens)\n");
01151 exit(-1); }
01152
01153
01154 HDecompress_nextBlock_bytes(ctext - *decoded, *decoded,text,text_len,console);
01155
01156 }
01157
01158
01159
01160
01176 void HDecompress_previousBlock_tokens_spaceless(char *ctext, int bytes_left, int num_obj,
01177 char **text, int *text_len, int *decoded,
01178 Console *console)
01179 {
01180 int text_tmp_len, exp_len;
01181 char *text_tmp, *exp_text;
01182
01183
01184
01185 if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01186 *text = NULL;
01187 *text_len = 0;
01188 *decoded = 0;
01189 return ;
01190 }
01191
01192
01193 if (((*ctext) & (0x80)) == 0){
01194 fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_tokens_spaceless)\n");
01195 exit(-1); }
01196
01197
01198
01199 HDecompress_previousBlock_tokens(ctext,bytes_left,num_obj,
01200 &text_tmp,&text_tmp_len,decoded,console);
01201
01202
01203 HDecompress_expandSpaces(ctext-*decoded, *decoded, &exp_text, &exp_len, console);
01204
01205
01206
01207 HDecompress_nextBlock_bytes(exp_text,exp_len,text,text_len,console);
01208
01209 }
01210
01211
01212
01213
01214
01215
01216
01217
01218
01219
01220
01221
01222
01223
01224
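/*
 * HToken_RankFromPlainCw: canonical-code decoding step. For a plain codeword
 * of len = plaincwlen/7 symbols (7 code bits per byte), the rank of the token
 * in the dictionary is
 *     rank = offsetcw[len] + (plaincw - firstcw[len]).
 */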
01235 void HToken_RankFromPlainCw(int plaincw, int plaincwlen, Canonical *cano,
01236 int *token_rank)
01237 {
01238 int len = plaincwlen / 7;
01239
01240 if((plaincwlen % 7) != 0){
fprintf(stderr,"Error: HToken_RankFromPlainCw - length not a multiple of 7\n");
01242 exit(-1); }
01243
01244 *token_rank = cano->offsetcw[len] + plaincw - cano->firstcw[len];
01245
01246 }
01247
01248
01260 void HToken_RankFromTaggedCw(int taggedcw,int taggedcwlen,Canonical *cano,int *token_rank)
01261 {
01262 int plaincw,plaincwlen;
01263
01264 if((taggedcwlen > 4) || (taggedcwlen < 1)){
fprintf(stderr,"Error: HToken_RankFromTaggedCw - length not in [1,4]\n");
01266 exit(-1); }
01267
01268 HCodeword_PlainFromTagged(taggedcw,taggedcwlen,&plaincw,&plaincwlen);
01269
01270 if((plaincwlen % 7) != 0){
fprintf(stderr,"Error: plain length not a multiple of 7 (HToken_RankFromTaggedCw)\n");
01272 exit(-1); }
01273
01274 HToken_RankFromPlainCw(plaincw,plaincwlen,cano,token_rank);
01275
01276 }
01277
01278
01279
01290 void HToken_fromTaggedCw(int taggedcw, int taggedcwlen,
01291 Canonical *cano, Dictionary *dict,
01292 char **token, int *token_len)
01293 {
01294 int token_rank;
01295 int token_startpos;
01296
01297 HToken_RankFromTaggedCw(taggedcw,taggedcwlen,cano,&token_rank);
01298
01299 token_startpos = dict->start_pos[token_rank];
01300 if ((token_startpos < 0) || (token_startpos >= dict->length)){
01301 fprintf(stderr,"Error: token_startpos computation (HToken_fromTaggedCw)\n");
01302 exit(-1); }
01303
01304 *token_len=1;
01305
01306 while(dict->content[token_startpos + (*token_len)] != '\n')
01307 (*token_len)++;
01308
*token = (char *) malloc(*token_len);

if(*token == NULL) {
fprintf(stderr,"Error: memory allocation for token (HToken_fromTaggedCw)\n");
exit(-1); }

memcpy(*token,dict->content + token_startpos, *token_len);
01315
01316 }
01317
01318
01333 int HToken_decompressNext(char *s, int num_byte_left, char **token, int *lentoken, int *lencw,
01334 Console *console)
01335 {
01336
01337 int taggedcw;
01338
01339 if(num_byte_left <= 0) return 0;
01340
01341
01342 HCodeword_TaggedGetNext(s,num_byte_left,&taggedcw,lencw);
01343
01344
01345 HToken_fromTaggedCw(taggedcw,*lencw,&(console->canoinfo),&(console->dictionary),token,lentoken);
01346
01347 return 1;
01348
01349 }
01350
01351
01366 int HToken_decompressPrevious(char *s, int num_byte_left, char **token, int *lentoken, int *lencw,
01367 Console *console)
01368 {
01369
01370 int taggedcw;
01371
01372 if(num_byte_left <= 0) return 0;
01373
01374
01375 HCodeword_TaggedGetPrevious(s,num_byte_left,&taggedcw,lencw);
01376
01377
01378 HToken_fromTaggedCw(taggedcw,*lencw,&(console->canoinfo),&(console->dictionary),token,lentoken);
01379
01380 return 1;
01381
01382 }
01383
01384
01385
01397 void HToken_getLengthNext(char *s, int num_char_left, int *len)
01398 {
01399
01400 *len = 0;
01401
01402 if (num_char_left <= 0) {
01403 fprintf(stderr,"Error: HToken_getLengthNext\n");
01404 exit(-1); }
01405
01406
01407 if (!isalnum(*s)) {
01408
01409 while ( (*len < num_char_left) && (!isalnum(*s)) && ((*s) != '\n')) {
01410 (*len)++; s++; }
01411
01412
01413 if (*len == 0) *len = 1;
01414
01415 } else {
01416
01417 while ( (*len < num_char_left) && isalnum(*s) ){
01418 (*len)++; s++; }
01419 }
01420
01421 if(*len <= 0){
01422 fprintf(stderr,"Error: HToken_getLengthNext\n");
01423 exit(-1); }
01424 }
01425
01426
01427
01428
01429
01430
01431
01432
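/*
 * Codeword representations. A "plain" codeword is the canonical Huffman code,
 * always a multiple of 7 bits long. A "tagged" codeword stores those bits in
 * whole bytes, 7 bits per byte, and sets the most significant bit (0x80) of
 * the first byte only; this tag marks codeword boundaries, which is what makes
 * byte-wise random access and backward decoding possible.
 */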
01443 void HCodeword_PlainFromTagged(int taggedcw, int taggedcwlen, int *plaincw, int *plaincwlen)
01444 {
01445 *plaincw=0;
01446 *plaincwlen=0;
01447
01448 if((taggedcwlen > 4) || (taggedcwlen < 1)){
fprintf(stderr,"Error: HCodeword_PlainFromTagged - length not in [1,4]\n");
01450 exit(-1); }
01451
01452 while (taggedcwlen > 0) {
01453 *plaincw += (taggedcw & 0x7F) << (*plaincwlen);
01454 taggedcw >>= 8;
01455 *plaincwlen += 7;
01456 taggedcwlen --;
01457 }
01458
01459 }
01460
01461
01472 void HCodeword_PlainFromTokenrank(int token_rank,Canonical *cano,int *plaincw,int *plaincwlen)
01473 {
01474 int i;
01475
01476
01477 for(i=1; (i < 5) && (cano->offsetcw[i] > token_rank); i++)
01478 ;
01479
01480 *plaincw = cano->firstcw[i] + (token_rank - cano->offsetcw[i]);
01481 *plaincwlen= 7 * i;
01482
01483 }
01484
01485
01486
01487
01497 void HCodeword_TaggedFromPlain(int plaincw, int plaincwlen, int *taggedcw, int *taggedcwlen)
01498 {
01499 *taggedcw=0;
01500 *taggedcwlen=0;
01501
01502 if((plaincwlen % 7) != 0){
01503 fprintf(stderr,"Error: HCodeword_TaggedFromPlain, len %d\n",plaincwlen);
01504 exit(-1); }
01505
01506 while (plaincwlen > 0) {
01507 *taggedcw |= ((plaincw & 0x7F) << (8 * (*taggedcwlen)));
01508 (*taggedcwlen)++;
01509 plaincw >>= 7;
01510 plaincwlen -=7;
01511 }
01512
01513 *taggedcw |= (0x80 << (8 * (*taggedcwlen - 1)));
01514 }
01515
01525 void HCodeword_TaggedFromTokenrank(int token_rank,Canonical *cano, int *taggedcw, int *taggedcwlen)
01526 {
01527 int plaincw,plaincwlen;
01528
01529 HCodeword_PlainFromTokenrank(token_rank,cano,&plaincw,&plaincwlen);
01530
01531 if((plaincwlen % 7) != 0){
fprintf(stderr,"Error: HCodeword_TaggedFromTokenrank - plain length not a multiple of 7\n");
01533 exit(-1); }
01534
01535 HCodeword_TaggedFromPlain(plaincw,plaincwlen,taggedcw,taggedcwlen);
01536
01537 if((*taggedcwlen > 4) || (*taggedcwlen < 1)){
fprintf(stderr,"Error: HCodeword_TaggedFromTokenrank - tagged length not in [1,4]\n");
01539 exit(-1); }
01540
01541 }
01542
01543
01544
01553 void HCodeword_TaggedFromToken(char *token,int token_len,HHash_table *ht,int *taggedcw, int *taggedcwlen)
01554 {
01555 Hash_node *p;
01556
01557 if((p=HHashtable_search(token,token_len,ht)) == NULL){
01558 fprintf(stderr,"Error: token not found (HCodeword_TaggedFromToken)\n");
01559 exit(-1); }
01560
01561 *taggedcw = p->codeword;
01562 *taggedcwlen = p->cw_len;
01563
01564 }
01565
01566
01578 void HCodeword_TaggedGetNext(char *s, int num_byte_left, int *taggedcw, int *lencw)
01579 {
01580
01581 if (((*s) & 0x80) == 0) {
fprintf(stderr,"Error: not a tagged byte (HCodeword_TaggedGetNext)\n");
01583 exit(-1); }
01584
01585 if (num_byte_left <= 0) {
01586 fprintf(stderr,"Error: no byte left (HCodeword_TaggedGetNext)\n");
01587 exit(-1); }
01588
01589 *taggedcw = 0;
01590 *taggedcw |= *s;
01591 (*lencw)=1;
01592 s++;
01593
01594 while ( (*lencw < num_byte_left) && ( ((*s) & 0x80) == 0 )) {
01595 *taggedcw = ((*taggedcw) << 8) | (*s);
01596 (*lencw)++;
01597 s++;
01598 }
01599
01600 }
01601
01602
01609 int HCodeword_TaggedGetNextLength(char *s, int num_byte_left)
01610 {
01611
01612 int lencw;
01613
01614 if (((*s) & 0x80) == 0) {
fprintf(stderr,"Error: not a tagged byte (HCodeword_TaggedGetNextLength)\n");
01616 exit(-1); }
01617
01618 if (num_byte_left <= 0) {
01619 fprintf(stderr,"Error: no byte left (HCodeword_TaggedGetNextLength)\n");
01620 exit(-1); }
01621
01622 lencw=1;
01623 s++;
01624
01625 while ( (lencw < num_byte_left) && ( ((*s) & 0x80) == 0 )) {
01626 lencw++;
01627 s++;
01628 }
01629
01630 return lencw;
01631 }
01632
01633
01646 void HCodeword_TaggedGetPrevious(char *s, int num_byte_left, int *taggedcw, int *lencw)
01647 {
01648
01649 if (((*s) & 0x80) == 0) {
fprintf(stderr,"Error: not a tagged byte (HCodeword_TaggedGetPrevious)\n");
01651 exit(-1); }
01652
01653 if (num_byte_left <= 0) {
01654 fprintf(stderr,"Error: no byte left (HCodeword_TaggedGetPrevious)\n");
01655 exit(-1); }
01656
01657 *taggedcw = 0;
01658 *lencw = 0;
01659 s--;
01660
01661 while ( (*lencw < num_byte_left) && ( ((*s) & 0x80) == 0 )) {
01662 *taggedcw = (*taggedcw) | ((*s) << 8 * (*lencw));
01663 (*lencw)++;
01664 s--;
01665 }
01666
01667
01668 *taggedcw = (*taggedcw) | ((*s) << 8 * (*lencw));
01669 (*lencw)++;
01670
01671 }
01672
01673
01674
01675
01682 void HCodeword_tostring(int taggedcw, int taggedcwlen, char **s)
01683 {
01684
01685 int i = taggedcwlen;
01686 *s = (char *) malloc((taggedcwlen+1) * sizeof(char));
01687
01688
01689 if((*s == NULL) || (taggedcwlen < 1) || (taggedcwlen > 4)){
01690 fprintf(stderr,"Error: HCodeword_tostring\n");
01691 exit(-1); }
01692
01693
01694
01695
while (i > 0) {
01697 (*s)[taggedcwlen-i]=(unsigned char) ((taggedcw >> (8 * (i-1))) & 0xFF);
01698 i--;
01699 }
01700
01701
01702 (*s)[taggedcwlen]='\0';
01703
01704 }
01705
01706
01707
01708
01709
01710
01711
01712
01713
01722 void HHashtable_init(HHash_table *ht, int n)
01723 {
01724
01725 int i;
01726
ht->size = (int) (1.2 * n) + 13;
01728
01729 ht->table = (Hash_nodeptr_array) malloc(ht->size * sizeof(Hash_node *));
01730
01731 ht->card = 0;
01732 if (ht->table == NULL) {
01733 fprintf(stderr,"Fatal Error: Hash table allocation\n");
01734 exit(-1); }
01735
01736 for (i=0; i < ht->size; i++)
01737 ht->table[i] = NULL;
01738 }
01739
01740
01747 void HHashtable_print(HHash_table *ht)
01748 {
01749 int i,plaincw,lcw;
01750 Hash_node *p;
01751
01752 printf("\n\n================== Hash Table ====================\n");
01753 printf("Table size = %d, number of stored objects = %d\n\n",ht->size,ht->card);
01754
01755 for(i=0; i<ht->size;i++){
01756 for(p = ht->table[i]; p ; p = p->next){
01757 printf("token = \"");
01758 HPrint_string(p->str,p->len_str);
01759 printf("\"");
01760 HCodeword_PlainFromTagged(p->codeword,p->cw_len,&plaincw,&lcw);
printf(" tokenlen = %d count = %d plain cw = %x cw_len %d\n",
p->len_str, p->count_occ,plaincw,lcw);
01763 }
01764 }
01765 }
01766
01767
01768
01776 int HHashtable_func(char *s, int len, HHash_table *ht)
01777 {
01778 register int hfn;
01779 int hfi;
01780 int table_size = ht->size;
01781
01782 hfn = 11;
01783 for (hfi=0; hfi<len ; hfi++)
01784 hfn = hfn ^ ((hfn<<5) + (hfn>>2) + (unsigned char) *s++);
01785 hfn = abs(hfn % table_size);
01786 return(hfn);
01787 }
01788
01789
01795 Hash_node *HHashtable_search(char *s, int slen, HHash_table *ht)
01796 {
01797 Hash_node *hsp;
01798
01799 for (hsp = ht->table[HHashtable_func(s,slen,ht)]; hsp; hsp = hsp->next)
01800 if ((slen == hsp->len_str) && (memcmp(s,hsp->str,slen) == 0))
01801 return(hsp);
01802 return((Hash_node *)0);
01803 }
01804
01805
01806
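/*
 * HHashtable_insert: chained hashing with a move-to-front heuristic. When a
 * token is found again, its node is moved to the head of its bucket so that
 * frequent tokens are located quickly during the encoding pass.
 */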
01815 int HHashtable_insert(char *s, int slen, HHash_table *ht)
01816 {
01817 int hiv;
01818 Hash_node *hip;
01819 Hash_node *table_head;
01820 Hash_node *hip_pred;
01821 char *token;
01822
01823 hiv = HHashtable_func(s,slen,ht);
01824 hip = HHashtable_search(s,slen,ht);
01825
01826 if (hip) {
01827 hip->count_occ += 1;
01828
01829
01830 if (ht->table[hiv] != hip){
01831
01832
01833
01834 for (hip_pred = ht->table[hiv]; hip_pred->next != hip;
01835 hip_pred = hip_pred->next) ;
01836
01837 hip_pred->next = hip->next;
01838 table_head = ht->table[hiv];
01839 ht->table[hiv] = hip;
01840 hip->next = table_head;
01841 }
01842
01843 return(0);
01844
01845 } else {
hip = (Hash_node *) malloc(sizeof(Hash_node));
token = malloc(slen);

if ((hip == NULL) || (token == NULL)){
fprintf(stderr,"Error: Insert hash table\n");
exit(-1); }

memcpy(token,s,slen);
01854
01855 hip->len_str = slen;
01856 hip->count_occ = 1;
01857 hip->str = token;
01858 hip->next = ht->table[hiv];
01859 ht->table[hiv] = hip;
01860 ht->card += 1;
01861 return(1);
01862 }
01863 }
01864
01865
01871 void HHashtable_clear(HHash_table *ht)
01872 {
01873 int i;
01874
01875 for ( i = 0; i < ht->size; i++ ) {
01876 Hash_node *hn = ht->table[i];
01877 while ( hn ) {
01878 Hash_node *toFree = hn;
01879 hn = hn->next;
01880 free(toFree->str);
01881 free(toFree);
01882 }
01883 }
01884 ht->card = 0;
01885 ht->size = 0;
01886 free(ht->table);
01887 }
01888
01889
01900 void HHashtable_fromdict(Dictionary *dict, Canonical *cano, HHash_table *ht)
01901 {
01902
01903 int token_len,i,j,l;
01904 char *token;
01905 Hash_node *p;
01906
01907
01908
01909 HHashtable_init(ht,dict->num_tokens);
01910
01911
01912 for(i=0,j=0; i<dict->length; j++){
01913
01914 i++;
01915 while (dict->content[i] != '\n')
01916 i++;
01917
01918 token_len = i - dict->start_pos[j];
01919 token = dict->content + dict->start_pos[j];
01920
01921
01922 HHashtable_insert(token,token_len,ht);
01923 if((p=HHashtable_search(token,token_len,ht)) == NULL){
01924 fprintf(stderr,"Error: token not found (HHashtable_fromdict)\n");
01925 exit(-1); }
01926
01927 HCodeword_TaggedFromTokenrank(j,cano,&(p->codeword),&l);
01928 p->cw_len = l;
01929
01930 i++;
01931 }
01932
01933 if(j != dict->num_tokens) {
01934 fprintf(stderr,"Error: num tokens in dictionary (HHashtable_fromdict)\n");
01935 exit(-1); }
01936
01937 }
01938
01939
01940
01941
01942
01943
01954 void HDictionary_fromstring(char *s, int slen, int stokens, Dictionary *dict)
01955 {
01956
01957 int i,j;
01958
01959
01960 dict->length = slen;
01961 dict->num_tokens=stokens;
01962
01963 dict->content = s;
01964 if(dict->content == NULL) {
01965 fprintf(stderr,"Error: memory allocation (HDictionary_fromstring)\n");
01966 exit(-1); }
01967
01968
01969 dict->start_pos = (int *) malloc(dict->num_tokens * sizeof(int));
01970 if(dict->start_pos == NULL) {
01971 fprintf(stderr,"Error: memory allocation (HDictionary_fromstring)\n");
01972 exit(-1); }
01973
01974
01975
01976 for(i=0,j=0; i<dict->length; j++){
01977 dict->start_pos[j] = i;
01978 i++;
01979 while (dict->content[i] != '\n')
01980 i++;
01981 i++;
01982 }
01983
01984 if(j != dict->num_tokens) {
01985 fprintf(stderr,"Error: num tokens in dictionary (HDictionary_fromstring)\n");
01986 exit(-1); }
01987
01988 }
01989
01998 void HDictionary_fromtree(Hash_nodeptr_array tree, int num_tokens, Dictionary *dict)
01999 {
02000
02001 int i,j,k;
02002
02003
02004 dict->length = 0;
02005 dict->num_tokens=num_tokens;
02006
02007 for(i=0; i<dict->num_tokens; i++)
02008 dict->length += tree[i]->len_str + 1;
02009
02010 dict->content = (char *) malloc( (dict->length + 1) * sizeof(char));
02011
02012 if(dict->content == NULL) {
02013 fprintf(stderr,"Error: memory allocation (HDictionary_fromtree)\n");
02014 exit(-1); }
02015
02016 for(i=0,j=0; i<dict->num_tokens; i++){
02017 for(k=0; k < tree[i]->len_str; k++)
02018 dict->content[j++] = tree[i]->str[k];
02019 dict->content[j++] = '\n';
02020 }
02021 dict->content[j]='\0';
02022
02023 if(j != dict->length) {
02024 fprintf(stderr,"Error: dictionary length (HDictionary_fromtree)\n");
02025 exit(-1); }
02026
02027
02028 dict->start_pos = (int *) malloc(dict->num_tokens * sizeof(int));
02029
02030 if(dict->start_pos == NULL) {
02031 fprintf(stderr,"Error: memory allocation (HDictionary_fromtree)\n");
02032 exit(-1); }
02033
02034
02035 for(i=0,j=0; i<dict->length; j++){
02036 dict->start_pos[j] = i;
02037 i++;
02038 while (dict->content[i] != '\n')
02039 i++;
02040 i++;
02041 }
02042
02043 if(j != dict->num_tokens) {
02044 fprintf(stderr,"Error: num tokens in dictionary (HDictionary_fromtree)\n");
02045 exit(-1); }
02046
02047 }
02048
02049
02058 void HDictionary_print(Dictionary *dict, HHash_table *ht, int Verbose)
02059 {
02060
02061 int j,plaincw,lcw;
02062 Hash_node *p;
02063
02064 printf("\n\n================== Dictionary ====================\n");
02065 printf("Number of tokens = %d, string length = %d\n\n",dict->num_tokens,dict->length);
02066
02067 for(j=0;j<dict->num_tokens - 1;j++){
02068 printf("\n");
02069 HPrint_string(dict->content+dict->start_pos[j],dict->start_pos[j+1]-dict->start_pos[j]-1);
02070
02071 if(Verbose) {
02072 if((p=HHashtable_search(dict->content+dict->start_pos[j],dict->start_pos[j+1]-dict->start_pos[j]-1,ht))
02073 == NULL){
02074 fprintf(stderr,"Error: token not found (HDictionary_print)\n");
02075 exit(-1); }
02076 HCodeword_PlainFromTagged(p->codeword,p->cw_len,&plaincw,&lcw);
02077 printf(":plain_cw = %x cwlen %d (bits)",plaincw,lcw);
02078 }
02079 }
02080
02081 printf("\n");
02082 HPrint_string(dict->content+dict->start_pos[j],dict->length - dict->start_pos[j]-1);
02083
02084 if(Verbose) {
02085 if((p=HHashtable_search(dict->content+dict->start_pos[j],dict->length - dict->start_pos[j]-1,ht)) == NULL){
02086 fprintf(stderr,"Error: token not found (HDictionary_print)\n");
02087 exit(-1); }
02088 HCodeword_PlainFromTagged(p->codeword,p->cw_len,&plaincw,&lcw);
02089 printf(":plain_cw = %x cwlen %d (bits)\n\n",plaincw,lcw);
02090 }
02091 }
02092
02093
02094
02095
02096
02097
02098
02099
02100
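/*
 * HJumpers_fromds: builds the jumper table used for random access. Roughly
 * every jump_value characters of text a pair (text offset, compressed offset)
 * is recorded at a token boundary; Get_charpos_from_bytepos and
 * Get_bytepos_from_charpos start from the nearest preceding jumper and decode
 * sequentially from there.
 */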
02112 void HJumpers_fromds(char *text, int tlen, HHash_table *ht,
02113 int jump_value, Jumpers *jumpers)
02114 {
02115 int text_pos,ctext_pos,i;
02116 int token_len;
02117 int taggedcw;
02118 int taggedcwlen;
02119 int current_jump;
02120
02121 jumpers->number = 0;
02122
02123 if (jump_value != 0) {
02124 jumpers->number = tlen / jump_value + 1;
02125
02126 jumpers->text_offsets = (int *) malloc(jumpers->number * sizeof(int));
02127 jumpers->ctext_offsets = (int *) malloc(jumpers->number * sizeof(int));
02128
02129 if((jumpers->text_offsets == NULL) || (jumpers->ctext_offsets == NULL)){
02130 fprintf(stderr,"Error: memory allocation (HJumpers_fromds)\n");
02131 exit(-1); }
02132
02133 current_jump = 0;
02134
02135 for(text_pos=0,ctext_pos=0,i=0; (text_pos<tlen) && (i < jumpers->number); ){
02136
02137 if (text_pos >= current_jump){
02138 if ((i==0) || (jumpers->text_offsets[i-1] != text_pos)) {
02139 jumpers->text_offsets[i] = text_pos;
02140 jumpers->ctext_offsets[i] = ctext_pos;
02141 i++;
02142 }
02143 current_jump += jump_value;
02144 }
02145
02146
02147 HToken_getLengthNext(text+text_pos,tlen-text_pos,&token_len);
02148 HCodeword_TaggedFromToken(text+text_pos,token_len,ht,&taggedcw,&taggedcwlen);
02149
02150 text_pos += token_len;
02151 ctext_pos += taggedcwlen;
02152 }
02153
02154 jumpers->number = i;
02155
02156 }
02157
02158 }
02159
02160
02161
02170 void HJumpers_tostring(char *s, Jumpers *jumpers)
02171 {
02172
02173 int i,j;
02174
02175 if(s == NULL) {
02176 fprintf(stderr,"Error: memory allocation (HJumpers_tostring)\n");
02177 exit(-1); }
02178
02179 HInt_tostring(s,jumpers->number);
02180
02181 for(i=0,j=0;i<jumpers->number;i++){
02182 HInt_tostring(s + 4 + j,jumpers->text_offsets[i]);
02183 j += 4;
02184 HInt_tostring(s + 4 + j,jumpers->ctext_offsets[i]);
02185 j += 4;
02186 }
02187
02188 }
02189
02190
02199 void HJumpers_fromstring(char *s, Jumpers *jumpers)
02200 {
02201
02202 int i;
02203
02204 jumpers->number = HInt_fromstring(s);
02205 s += 4;
02206
02207 if (jumpers->number) {
02208 jumpers->text_offsets = (int *) malloc(jumpers->number * sizeof(int));
02209 jumpers->ctext_offsets = (int *) malloc(jumpers->number * sizeof(int));
02210
02211 if((jumpers->text_offsets == NULL) || (jumpers->ctext_offsets == NULL)){
02212 fprintf(stderr,"Error: memory allocation (HJumpers_fromstring)\n");
02213 exit(-1); }
02214
02215 for(i=0;i<jumpers->number;i++){
02216 jumpers->text_offsets[i]=HInt_fromstring(s);
02217 s += 4;
02218 jumpers->ctext_offsets[i]=HInt_fromstring(s);
02219 s += 4;
02220 }
02221 } else {
02222 jumpers->text_offsets = NULL;
02223 jumpers->ctext_offsets = NULL;
02224 }
02225 }
02226
02227
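/*
 * Get_charpos_from_bytepos: maps a position in the compressed body (which must
 * be the first byte of a codeword) to the corresponding character position in
 * the original text, starting from the closest preceding jumper and decoding
 * forward.
 */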
02239 void Get_charpos_from_bytepos(char *ctext, int ctext_len, int bytepos, int *textpos, Console *console)
02240 {
02241
02242 int i;
02243 int current_ctextpos;
02244 char *token;
02245 int ltoken,lencw;
02246
02247 if((ctext_len <= bytepos) || (bytepos < 0)){
fprintf(stderr,"Error: incorrect bytepos (Get_charpos_from_bytepos)\n");
02249 exit(-1); }
02250
02251 if((ctext[bytepos] & 0x80) == 0){
02252 fprintf(stderr,"Error: no codeword start (Get_charpos_from_bytepos)\n");
02253 exit(-1); }
02254
02255
02256 for(i=0; (i < console->jumpers.number) && (console->jumpers.ctext_offsets[i]<= bytepos); i++) ;
02257
02258 current_ctextpos = console->jumpers.ctext_offsets[i-1];
02259 *textpos = console->jumpers.text_offsets[i-1];
02260
02261 for( ; current_ctextpos < bytepos; ) {
02262
HToken_decompressNext(ctext+current_ctextpos,ctext_len-current_ctextpos+1,
&token,&ltoken,&lencw,console);
02265 current_ctextpos += lencw;
02266 *textpos += ltoken;
02267
02268 free(token);
02269 }
02270
02271 if(current_ctextpos != bytepos){
02272 fprintf(stderr,"Error: no codeword start (Get_charpos_from_bytepos)\n");
02273 exit(-1); }
02274
02275
02276 }
02277
02278
02291 void Get_bytepos_from_charpos(char *ctext, int ctext_len, int textpos, int *bytepos, Console *console)
02292 {
02293
02294 int i;
02295 int current_textpos;
02296 char *token;
02297 int ltoken,lencw;
02298
02299
02300 if(textpos < 0){
fprintf(stderr,"Error: negative text pos (Get_bytepos_from_charpos)\n");
02302 exit(-1); }
02303
02304
02305 for(i=0; (i < console->jumpers.number) && (console->jumpers.text_offsets[i]<= textpos); i++) ;
02306
02307 current_textpos = console->jumpers.text_offsets[i-1];
02308 *bytepos = console->jumpers.ctext_offsets[i-1];
02309
02310 for( ; (current_textpos < textpos) && (*bytepos < ctext_len); ) {
02311
HToken_decompressNext(ctext + *bytepos, ctext_len - *bytepos + 1,&token,&ltoken,&lencw,console);
02313 current_textpos += ltoken;
02314 *bytepos += lencw;
02315
02316 free(token);
02317 }
02318
02319 if(textpos != current_textpos){
fprintf(stderr,"Error: no token beginning (Get_bytepos_from_charpos)\n");
02321 exit(-1); }
02322
02323
02324 }
02325
02326
02327
02328
02329
02330
02331
02332
02333
02334
02335
02336
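/*
 * HCanonical_fromtree: derives the canonical code tables from the sorted tree
 * leaves. The alphabet is padded with dummy (zero-frequency) leaves so that
 * the number of leaves minus one is a multiple of 127, as required by a full
 * 128-ary Huffman tree. For each codeword length l (in 7-bit symbols),
 * firstcw[l] is the codeword of the first token of that length and offsetcw[l]
 * is the rank of that token.
 */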
02347 void HCanonical_fromtree(Hash_nodeptr_array tree_array, Canonical *cano, HHash_table *ht)
02348 {
02349 int i,j,tmp;
02350 int numcw[5];
02351 int HT_leaves;
02352 int HT_dummy_leaves, HT_nondummy_leaves;
02353
02354
02355 HT_dummy_leaves = 128 - (ht->card % 127);
02356 HT_nondummy_leaves = ht->card;
02357 HT_leaves = HT_nondummy_leaves + HT_dummy_leaves;
02358
02359
02360 for(i=4;i>0;i--){
02361 numcw[i] = 0;
02362 cano->offsetcw[i] = 0;
02363 cano->firstcw[i] = 0;
02364 }
02365
02366
02367
02368
02369
02370
02371 for(i=0;i<HT_nondummy_leaves;i++)
02372 numcw[tree_array[i]->cw_len]++;
02373
02374
02375 for(cano->max_cwlen = 4 ; numcw[cano->max_cwlen] == 0; (cano->max_cwlen)--)
02376 ;
02377
02378
02379
02380
02381
02382
02383
02384
02385
02386 cano->offsetcw[cano->max_cwlen] = 0;
02387 cano->firstcw[cano->max_cwlen] = HT_dummy_leaves;
02388
02389 for(i=cano->max_cwlen - 1;i>0;i--){
02390 cano->offsetcw[i] = cano->offsetcw[i+1]+numcw[i+1];
02391 cano->firstcw[i] = (cano->firstcw[i+1]+numcw[i+1]) >> 7;
02392 }
02393
02394
02395 cano->num_tokens=ht->card;
02396
02397
02398
02399
02400
02401
02402
02403
02404
02405
02406
02407 cano->firstcw[cano->max_cwlen] = 0;
02408 for(i=4;i>0;i--)
02409 if (numcw[i] == 0) cano->offsetcw[i] = HT_leaves+1;
02410
02411
02412 for(i=0,j=cano->max_cwlen; i<cano->num_tokens; i++){
02413
02414
02415 if((j>1) && (cano->offsetcw[j-1]<=i)) j--;
02416
02417
02418 HCodeword_TaggedFromPlain(cano->firstcw[j]+(i-cano->offsetcw[j]),j*7,&(tree_array[i]->codeword),&tmp);
02419 tree_array[i]->cw_len = j;
02420 }
02421
02422 }
02423
02424
02425
02426
02436 void HCanonical_fromstring(char *s, Canonical *cano)
02437 {
02438
02439 int i;
02440
02441 cano->max_cwlen = HInt_fromstring(s);
02442 s += 4;
02443
02444 for(i=1;i<5;i++){
02445 cano->firstcw[i]=HInt_fromstring(s);
02446 s += 4;
02447 cano->offsetcw[i]=HInt_fromstring(s);
02448 s += 4;
02449 }
02450
02451 }
02452
02463 void HCanonical_tostring(char *s, Canonical *cano)
02464 {
02465
02466 int i;
02467
02468 HInt_tostring(s,cano->max_cwlen);
02469 s += 4;
02470
02471 for(i=1;i<5;i++){
02472 HInt_tostring(s,cano->firstcw[i]);
02473 s += 4;
02474 HInt_tostring(s,cano->offsetcw[i]);
02475 s += 4;
02476 }
02477
02478 }
02479
02480
02481
02482
02483
02484
02485
02486
02487
02499 void Hufftree_createLeaves(Hash_nodeptr_array tree, int num_leaves, HHash_table *ht)
02500 {
02501 Hash_node *p;
02502 int i,j;
02503
02504
02505 for(i=0,j=0; i<ht->card;j++){
02506 for(p = ht->table[j]; p ; p = p->next)
02507 tree[i++] = p;
02508 }
02509
02510 for(i=ht->card; i<num_leaves; i++){
02511 if ((tree[i] = (Hash_node *) malloc(sizeof(Hash_node))) == NULL){
02512 fprintf(stderr,"Error: Hufftree_createLeaves\n");
02513 exit(-1);
02514 } else {
02515 tree[i]->str = NULL;
02516 tree[i]->len_str = 0;
02517 tree[i]->count_occ = 0;
02518 }
02519 }
02520
02521
02522 qsort(tree,num_leaves,sizeof(Hash_node *),HSort_for_freq);
02523
02524 }
02525
02526
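/*
 * Hufftree_fromLeaves: 128-ary Huffman construction. Leaves are already sorted
 * by decreasing frequency in work_area; internal nodes are kept in a FIFO
 * queue and, at each step, the 128 lightest nodes (leaves or queued internal
 * nodes) are merged. The index of a node's first child inside work_area is
 * smuggled through the `next' pointer field and read back, via a cast, in
 * Hufftree_computeCwLen.
 */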
02540 int Hufftree_fromLeaves(Hash_nodeptr_array work_area, int *tot_nodes, int leaves)
02541 {
02542 int num_leaves,i,count,chosen_leaves;
02543 int processed, head, tail;
02544 Hash_node *p;
02545 Hash_nodeptr_array Node_queue;
02546
02547
02548
02549 processed = (leaves + (leaves-1)/127 + 1) -1;
02550 head = 0;
02551 tail = 0;
02552 chosen_leaves = 1;
02553 Node_queue=(Hash_nodeptr_array )malloc(sizeof(Hash_node *)*((leaves-1)/127+1));
02554
02555 if(Node_queue == NULL) {
02556 fprintf(stderr,"Out of mem: Node_queue creation in Hufftree_fromLeaves\n");
02557 exit(-1); }
02558
02559 for(num_leaves = leaves; (num_leaves > 0) || (head != tail); ){
02560
02561
02562 for(i = 0,count = 0; i < 128; i++) {
02563
02564 if (head == tail)
02565 chosen_leaves = 1;
02566
02567 if(num_leaves == 0)
02568 chosen_leaves = 0;
02569
02570 if( (num_leaves > 0) && (head != tail)) {
02571
02572
02573 if(work_area[num_leaves-1]->count_occ <= Node_queue[head]->count_occ)
02574 chosen_leaves = 1;
02575 else
02576 chosen_leaves = 0;
02577 }
02578
02579
02580 if(chosen_leaves == 0){
02581 work_area[processed] = Node_queue[head];
02582 count += Node_queue[head]->count_occ;
02583 head++;
02584 processed--;
02585 } else {
02586 work_area[processed]=work_area[num_leaves-1];
02587 count += work_area[num_leaves-1]->count_occ;
02588 processed--;
02589 num_leaves--; }
02590 }
02591
02592
02593 if ((p = (Hash_node *) malloc(sizeof(Hash_node))) == NULL){
02594 fprintf(stderr,"Error: Token parsing (Hufftree_fromLeaves)\n");
02595 exit(-1); }
02596 p->str = NULL;
02597 p->len_str = 0;
02598 p->count_occ = count;
02599 p->codeword = 0;
02600 p->cw_len = 0;
02601 p->next = (Hash_node *) (processed + 1);
02602
02603
02604 if((num_leaves > 0) || (head != tail))
02605 Node_queue[tail++] = p;
02606 else
02607 work_area[processed] = p;
02608 }
02609
02610 *tot_nodes = leaves + (leaves-1)/127+1 - processed;
02611 free(Node_queue);
02612
02613
02614 return(processed);
02615 }
02616
02617
02633 int Hufftree_computeCwLen(Hash_nodeptr_array tree, int root, int tree_size)
02634 {
02635 int offset_child,isleaf;
02636 int i,j;
02637 int max_length;
02638
02639 max_length =0;
02640
02641
02642 for(i=0; i<tree_size; i++) {
02643
02644
02645 offset_child = (int) (tree[root+i]->next);
02646
02647
02648 isleaf = (tree[root+i]->str != NULL) || (tree[root+i]->count_occ == 0);
02649
02650
02651 if (isleaf == 0){
02652 for(j=0; j<128; j++){
02653 if(tree[offset_child+j]->count_occ == 0){
02654 tree[offset_child+j]->cw_len = -1;
02655 } else {
02656 tree[offset_child+j]->cw_len = (tree[root+i]->cw_len) + 1;
02657 }
02658
02659
02660
02661 if(tree[offset_child+j]->cw_len > max_length)
02662 max_length = tree[offset_child+j]->cw_len;
02663 }
02664
02665 tree[root+i]->cw_len = -1;
02666 }
02667 }
02668
02669
02670
02671
02672 qsort(tree+root,tree_size,sizeof(Hash_node *),HSort_for_cwlen);
02673
02674 return max_length;
02675 }
02676
02677
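/*
 * Hufftree_build: builds the Huffman tree and, if the resulting code needs
 * more than 4 bytes per codeword (depth > 4), flattens the frequency
 * distribution (dividing every count by the golden ratio, plus one) and
 * rebuilds, until every codeword fits in at most 4 tagged bytes.
 */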
02688 void Hufftree_build(Hash_nodeptr_array *tree_array_ptr, HHash_table *ht)
02689 {
02690 Hash_node *p;
02691 int i,depth;
02692 int Huff_root;
02693 int HT_size, HT_leaves;
02694 int HT_dummy_leaves, HT_nondummy_leaves;
02695
02696
02697 HT_dummy_leaves = 128 - (ht->card % 127);
02698 HT_nondummy_leaves = ht->card;
02699 HT_leaves = HT_nondummy_leaves + HT_dummy_leaves;
02700
02701 depth = 5;
02702
02703
02704
02705 (*tree_array_ptr) = (Hash_nodeptr_array )malloc(sizeof(Hash_node *)*
02706 (HT_leaves + (HT_leaves-1)/127+5));
02707
02708 if(*tree_array_ptr == NULL) {
02709 fprintf(stderr,"Error: tree_array_ptr (Hufftree_build)\n");
02710 exit(-1); }
02711
02712 while (depth > 4) {
02713
02714
02715
02716 Hufftree_createLeaves(*tree_array_ptr,HT_leaves,ht);
02717
02718
02719
02720
02721 Huff_root = Hufftree_fromLeaves(*tree_array_ptr,&HT_size,HT_leaves);
02722
02723
02724
02725
02726 depth = Hufftree_computeCwLen(*tree_array_ptr,Huff_root,HT_size);
02727
02728 if(depth > 4) {
02729
02730 for(i=0;i<HT_nondummy_leaves;i++){
02731 p = (*tree_array_ptr)[i+Huff_root];
02732 p->count_occ = p->count_occ / 1.618 + 1;
02733 }
02734 }
02735 }
02736
02737
02738 *tree_array_ptr += Huff_root;
02739
02740 }
02741
02742
02751 void Hufftree_print(Canonical *cano, Hash_nodeptr_array tree_array, int Verbose)
02752 {
02753 int i,j;
02754 int plaincw,lcw;
02755
02756 printf("\n\n================== Huffman Tree ====================\n");
02757
02758 printf("Number of distinct tokens %d\n",cano->num_tokens);
02759 printf("Max codeword length = %d (in groups of 7 bits)\n",cano->max_cwlen);
02760 for(i=cano->max_cwlen; i>0;i--)
02761 printf("Offset[%d] = %d, First_cw[%d] = %x\n",i,cano->offsetcw[i],i,cano->firstcw[i]);
02762
02763 j=cano->max_cwlen;
02764
02765 if(Verbose) {
02766
02767 printf("\n ----- tokens -----\n\n");
02768 for(i=0; i<cano->num_tokens; i++){
02769
02770
02771 if((j>1) && (cano->offsetcw[j-1]<=i)) j--;
02772
02773
02774 printf("token = \"");
02775 HPrint_string(tree_array[i]->str,tree_array[i]->len_str);
02776 printf("\"");
02777
02778 HCodeword_PlainFromTagged(tree_array[i]->codeword,tree_array[i]->cw_len,&plaincw,&lcw);
printf(" token_len = %d, cwlen = %d (bits), untagged_cw = %x\n", tree_array[i]->len_str,lcw,plaincw);
02780 }
02781 }
02782
02783 }
02784
02785
02786
02787
02788
02789
02790
02791
02792
02793
02794
02795
02796
02803 void HPrint_string(char *s, int l)
02804 {
02805
02806 int i;
02807
02808 if (l<0){
02809 fprintf(stderr,"Error: negative string length (HPrint_string)\n");
02810 exit(-1); }
02811
02812 for(i=0; i<l; i++){
02813 if(!isprint(s[i])){ printf("[%d]",s[i]); }
02814 else printf("%c",s[i]);
02815 }
02816 }
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840
02841
02842
02843
02848 void HInt_tostring(char *s, int i)
02849 {
02850 int j;
02851
02852 for(j=3; j>=0; j--)
02853 s[3-j] = (char) ((i >> (8 * j)) & 0xff);
02854 }
02855
02859 int HInt_fromstring(char *s)
02860 {
02861 int j,num;
02862
02863 num=0;
02864
02865 for(j=0; j<4; j++)
02866 num = (num << 8) | (s[j] & 0xff);
02867
02868 return(num);
02869 }
02870
02871
02872
02873
02874
02875
02876
02877
02878 int HSort_for_freq(const void *va, const void *vb) {
02879 return(((Hash_nodeptr_array ) vb)[0]->count_occ - ((Hash_nodeptr_array ) va)[0]->count_occ);
02880 }
02881
02882
02883 int HSort_for_cwlen(const void *va, const void *vb) {
02884
02885
02886 if ((((Hash_nodeptr_array ) vb)[0]->cw_len - ((Hash_nodeptr_array ) va)[0]->cw_len) != 0)
02887 return((((Hash_nodeptr_array ) vb)[0]->cw_len - ((Hash_nodeptr_array ) va)[0]->cw_len));
02888
02889
02890 if(((Hash_nodeptr_array ) vb)[0]->str == NULL)
02891 return(-1);
02892 if(((Hash_nodeptr_array ) va)[0]->str == NULL)
02893 return(1);
02894 return(memcmp(((Hash_nodeptr_array ) va)[0]->str,((Hash_nodeptr_array ) vb)[0]->str,
02895 ((Hash_nodeptr_array ) va)[0]->len_str));
02896 }
02897
02898