Main Page   Alphabetical List   Compound List   File List   Compound Members   File Members  

HuffwordLib.c

Go to the documentation of this file.
00001 
00018 /* -------------------------------------------------------- */
00019 /* ---------------  Pull of data structures --------------- */
00020 /* -------------------------------------------------------- */
00021 
00022 
00023 #include "HuffwordLib.h"
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 /* -------------------------------------------------------------- */
00032 /* -------------------- High-level procedures ------------------- */
00033 /* --------------------                       ------------------- */
00034 /* --------------------  Text (de)compression ------------------- */
00035 /* -------------------------------------------------------------- */
00036 
00037 
00050 void Huffw_compress(char *text, int tlen, int jump_value, char **cstring, int *clen, int Verbose)
00051 {
00052 
00053   char *body;
00054   int blen;
00055   Console c;
00056 
00057 
00058   if ((tlen <= 0) || (text == NULL)){
00059     fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (Huffw_compress)\n");
00060     exit(-1); }
00061     
00062 
00063   // Initializes the hastable then used by HParse_text()
00064   HHashtable_init(&(c.hashtable),tlen/1000+5);
00065   
00066   HParse_text(text,tlen,&(c.hashtable));  
00067 
00068   HCompress_getBodyAndConsole(text,tlen,jump_value,&body,&blen,&c);
00069 
00070   HCompress_getString(cstring,clen,tlen,body,blen,&c);
00071 
00072   free(body);
00073 
00074   // The first byte is set to 0 for the Classic model
00075   **cstring = 0;
00076 
00077   if(Verbose)
00078     Huffw_PrintInfo(&c,blen,tlen,*clen,0,Verbose); // Parsing rule = 0
00079 
00080 }
00081 
00082 
00093 void Huffw_decompress(char *cstring, int clen, char **text, int *tlen, int Verbose)
00094 {
00095   Console c;
00096   char *body;
00097   int blen;
00098 
00099   if ((clen <= 0) || (cstring == NULL)){
00100     fprintf(stderr,"Fatal Error: You're trying to decompress an empty file! (Huffw_decompress)\n");
00101     exit(-1); }
00102 
00103   // Checks if the file has been compressed with Classic model
00104   if(*cstring != 0){
00105     fprintf(stderr,"Fatal Error: First byte of compressed file not 0, actually %d\n", *cstring);
00106     exit(-1); }
00107 
00108 
00109   HDecompress_getBodyAndConsole(cstring,clen,tlen,&body,&blen,&c);
00110 
00111   HDecompress_nextBlock_bytes(body,blen,text,tlen, &c);
00112 
00113   if(Verbose)
00114     Huffw_PrintInfo(&c,blen,*tlen,clen,0,Verbose); // Parsing rule = 0
00115 
00116 }
00117 
00118 
00119 
00120 
00134 void Huffw_spaceless_compress(char *text, int tlen, char **cstring, int *clen, int Verbose)
00135 {
00136 
00137   Console c;
00138   char *body, *body_spaceless;
00139   int blen, blen_spaceless;
00140   int jump_value = 0; // it is forced to be 0
00141 
00142   if ((tlen <= 0) || (text == NULL)){
00143     fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (Huffw_spaceless_compress)\n");
00144     exit(-1); }
00145 
00146   // Initializes the hastable then used by HParse_text()
00147   HHashtable_init(&(c.hashtable),tlen/1000+5);
00148   
00149   HParse_text(text,tlen,&(c.hashtable));  
00150 
00151   HCompress_getBodyAndConsole(text,tlen,jump_value,&body,&blen,&c);
00152 
00153   HCompress_contractSpaces(body,blen,&body_spaceless, &blen_spaceless, &c);
00154   
00155   free(body);
00156 
00157   HCompress_getString(cstring,clen,tlen,body_spaceless,blen_spaceless,&c);
00158 
00159   free(body_spaceless);
00160 
00161   // The first reserved byte is set to 1 for the Spaceless model 
00162   **cstring = 1;
00163 
00164   if(Verbose)
00165     Huffw_PrintInfo(&c,blen_spaceless,tlen,*clen,1,Verbose); // Parsing rule = 1
00166 
00167 }
00168 
00169 
00170 
00171 
00183 void Huffw_spaceless_decompress(char *cstring, int clen, char **text, int *tlen, int Verbose)
00184 {
00185   Console c;
00186   char *body, *body_spaceless;
00187   int blen, blen_spaceless;
00188 
00189   if ((clen <= 0) || (cstring == NULL)){
00190     fprintf(stderr,"Fatal Error: You're trying to decompress an empty file! (Huffw_spaceless_decompress)\n");
00191     exit(-1); }
00192 
00193 
00194   // Checks if the file has been compressed with the Spaceless model
00195   if(*cstring != 1){
00196     fprintf(stderr,"Fatal Error: First byte of compressed file not 1, actually %d\n", *cstring);
00197     exit(-1); }
00198 
00199   // the jump_value is set to be 0 for the Spaceless model 
00200   HDecompress_getBodyAndConsole(cstring,clen,tlen,&body_spaceless,&blen_spaceless,&c);
00201 
00202   HDecompress_expandSpaces(body_spaceless, blen_spaceless, &body, &blen,&c);
00203   
00204   HDecompress_nextBlock_bytes(body,blen,text,tlen,&c);
00205 
00206   if(Verbose)
00207     Huffw_PrintInfo(&c,blen_spaceless,*tlen,clen,1,Verbose); // Parsing rule = 1
00208 
00209 }
00210 
00211 
00212 
00225 void Huffw_PrintInfo(Console *c, int ctext_len, int text_len, int cstring_len, int rule, int Verbose)
00226 {
00227   int i,j;
00228   float f = ((float)cstring_len)/((float)text_len);
00229 
00230   if (c->jumpers.number){
00231     j = text_len/c->jumpers.number;
00232   } else { j=0; }
00233   printf("\n------------------- Global infos --------------------\n\n");
00234   printf("Parsing rule = %d\n", rule);
00235   printf("Text length = %d\n", text_len);
00236   printf("Compressed body length = %d\n", ctext_len);
00237   printf("Dictionary size = %d, Dictionary length = %d\n",c->dictionary.num_tokens,c->dictionary.length);
00238   printf("Max codeword length = %d\n",c->canoinfo.max_cwlen);
00239   printf("Number of jumpers = %d, average jump size = %d chars\n\n", c->jumpers.number,j);
00240   for(i=c->canoinfo.max_cwlen; i>0;i--)
00241     printf("Offset[%d] = %d, First_cw[%d] = %x\n",i,c->canoinfo.offsetcw[i],i,c->canoinfo.firstcw[i]);
00242   printf("\n Overall compressed file length = %d, compression ratio (percentage) = %.1f \n\n", 
00243          cstring_len, f * 100);
00244 
00245   if (Verbose > 1)
00246     HDictionary_print(&(c->dictionary), &(c->hashtable), 1);
00247 
00248 }
00249 
00250 
00251 
00252 /* ------------------------------------------------------ */
00253 /* --------------------  Text parsing ------------------- */
00254 /* ------------------------------------------------------ */
00255 
00256 
00257 
00273 void HParse_text(char *text, int text_len, HHash_table *ht)
00274 {
00275   int token_len,i;
00276   char *token;
00277 
00278 
00279   if ((text_len <= 0) || (text == NULL)){
00280     fprintf(stderr,"Fatal Error: You're trying to parse an empty file! (HParse_text)\n");
00281     exit(-1); }
00282 
00283   for(i=0; i<text_len; i+=token_len){
00284 
00285     // get the next token from the input
00286     HToken_getLengthNext(text+i,text_len-i,&token_len);
00287     token = text+i;
00288 
00289     // Insert token into hash table and count if it is new
00290     HHashtable_insert(token,token_len,ht);   
00291   }
00292 
00293 }
00294 
00295 
00296 
00297 
00298 /* -------------------------------------------------------------- */
00299 /* --------------------  Refined procedures   ------------------- */
00300 /* --------------------                       ------------------- */
00301 /* --------------------  Text (de)compression ------------------- */
00302 /* -------------------------------------------------------------- */
00303 
00304 
00305 
00317 void HCompress_getBody(char *text, int text_len, HHash_table *ht, char **ctext, int *ctext_len)
00318 
00319 {
00320   int i;
00321   int token_len, ctext_allocated;
00322   char *token;
00323   int taggedcw;
00324   int taggedcwlen;
00325   char *s;
00326   float resize_factor = 1.4;  // 40% more space
00327 
00328 
00329   if ((text_len <= 0) || (text == NULL)){
00330     fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_getBody)\n");
00331     exit(-1); }
00332 
00333 
00334   *ctext_len=0;
00335   ctext_allocated = text_len/2;
00336   *ctext = (char *) malloc((ctext_allocated+1) * sizeof(char));
00337 
00338   if((*ctext == NULL) || (text_len == 0)){
00339     fprintf(stderr,"Error: memory allocation (HCompress_getBody)\n");
00340     exit(-1);  }
00341 
00342   for(i=0; i<text_len; i+=token_len){
00343     
00344     // get the next token from the input
00345     token = text+i;
00346     HToken_getLengthNext(text+i,text_len-i,&token_len);
00347 
00348     // compute its tagged codeword
00349     HCodeword_TaggedFromToken(token,token_len,ht,&taggedcw,&taggedcwlen);
00350 
00351     // Writes the codeword into the body string
00352     HCodeword_tostring(taggedcw,taggedcwlen,&s);
00353 
00354     // Resize the space allocated to the compressed text, if needed
00355     if( (*ctext_len+taggedcwlen) >= ctext_allocated ) {
00356       *ctext = (char*) realloc(*ctext,(resize_factor*ctext_allocated + taggedcwlen + 1)*sizeof(char));
00357       if ( !(*ctext) ) { fprintf(stderr, "memory exhausted in HCompress_getBody\n"); exit(-1); }
00358       ctext_allocated = ctext_allocated * resize_factor + taggedcwlen + 1;
00359     }
00360 
00361     // Writes the tagged codeword into the compressed text
00362     memcpy((*ctext) + (*ctext_len),s,taggedcwlen);
00363     (*ctext_len) += taggedcwlen;
00364 
00365     free(s); // free the space allocated for the codeword
00366   }
00367 }
00368 
00369 
00370 
00371 
00372 
00387 void HCompress_getBodyAndConsole(char *text, int tlen, int jump_value, char **body, int *blen, 
00388                                 Console *console)
00389 {
00390 
00391   Hash_nodeptr_array tree; 
00392   Canonical *cano = &(console->canoinfo);
00393   Dictionary *dict = &(console->dictionary);
00394   Jumpers *jump = &(console->jumpers);
00395   HHash_table *ht = &(console->hashtable);
00396 
00397 
00398   if ((tlen <= 0) || (text == NULL)){
00399     fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_getBodyAndConsole)\n");
00400     exit(-1); }
00401 
00402 
00403   // computes the huffman tree
00404   Hufftree_build(&tree,ht); 
00405 
00406   // computes the canonical info and updates the hashtable determining the (plain) codewords
00407   HCanonical_fromtree(tree,cano,ht); 
00408 
00409   // computes the dictionary data structure
00410   HDictionary_fromtree(tree,cano->num_tokens,dict); 
00411 
00412   // computes the body of the compressed text 
00413   HCompress_getBody(text,tlen,ht,body,blen);
00414    
00415   // computes the jumpers according to the jump_value specified, if 0 nothing is done
00416   HJumpers_fromds(text,tlen,ht,jump_value,jump);
00417 
00418 }
00419 
00420 
00421 
00422 
00449 void HCompress_getString(char **ctext, int *ctext_len, int text_len,char *body, int blen, 
00450                         Console *console)
00451 {
00452   int global_length;
00453   Canonical *cano = &(console->canoinfo);
00454   Dictionary *dict = &(console->dictionary);
00455   Jumpers *jump = &(console->jumpers);
00456 
00457 
00458   if ((text_len <= 0) || (body == NULL) || (blen <=0)){
00459     fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_getString)\n");
00460     exit(-1); }
00461 
00462 
00463   global_length = 57 + 8 * jump->number +  dict->length + blen;
00464 
00465   *ctext = (char *) malloc(global_length * sizeof(char));
00466   *ctext_len=0;
00467 
00468   // reserved byte for storing compression rule
00469   *ctext_len += 1;
00470 
00471   HInt_tostring(*ctext + *ctext_len, text_len); 
00472   *ctext_len += 4;
00473 
00474   HInt_tostring(*ctext + *ctext_len, dict->num_tokens); 
00475   *ctext_len += 4;
00476 
00477   HCanonical_tostring(*ctext + *ctext_len, cano); 
00478   *ctext_len += 36;
00479 
00480   HJumpers_tostring(*ctext + *ctext_len, jump);
00481   *ctext_len += 8 * jump->number + 4;
00482 
00483   HInt_tostring(*ctext + *ctext_len, dict->length);  
00484   *ctext_len += 4;
00485   memcpy(*ctext + *ctext_len, dict->content, dict->length);
00486   *ctext_len += dict->length;
00487 
00488   HInt_tostring(*ctext + *ctext_len, blen);
00489   *ctext_len += 4;
00490   memcpy(*ctext + *ctext_len, body, blen);
00491   *ctext_len += blen;
00492 
00493   if (global_length != *ctext_len){
00494         fprintf(stderr,"Error: computing the compressed file length (HCompress_getString)\n");
00495         exit(-1);  }  
00496 
00497 
00498 }
00499 
00500 
00511 void HCompress_contractSpaces(char *body, int blen, char **body_spaceless, int *blen_spaceless, Console *console)
00512 {
00513   char *token, *sstring;
00514   int token_len, tcw_len, i;
00515   int spacecw, spacecw_len;
00516   char spacecw_string[4];
00517   int state, next_state;
00518 
00519 
00520   if ((body == NULL) || (blen <=0)){
00521     fprintf(stderr,"Fatal Error: You're trying to compress an empty file! (HCompress_contractSpaces)\n");
00522     exit(-1); }
00523 
00524   // Note that the dictionary contain the space if it has to be dropped
00525   if(HHashtable_search(" ",1,&(console->hashtable)) == NULL){ // No single space to be contracted
00526     *body_spaceless = body;
00527     *blen_spaceless = blen;
00528   } else {
00529 
00530     *body_spaceless = (char *) malloc(blen * sizeof(char));
00531     if (*body_spaceless == NULL) {
00532       fprintf(stderr,"Fatal Error: No memory for spaceless body (HCompress_contractSpaces\n");
00533       exit(-1); }
00534 
00535     // Get the codeword for the single space, put it into a string
00536     // Note that the dictionary contain the space if it has to be dropped
00537     HCodeword_TaggedFromToken(" ", 1, &(console->hashtable), &spacecw, &spacecw_len); 
00538     HInt_tostring(spacecw_string,spacecw);
00539     sstring = spacecw_string + (4-spacecw_len);
00540 
00541     *blen_spaceless = 0;
00542 
00543     // state of the computation: 
00544     // 0 = start
00545     // 1 = ALNUM token has been read, 
00546     // 2 = ALNUM token + space have been read
00547     // 3 = ALNUM  token + space +  ALNUM token have been read ==> drop the space.
00548     state = 0; 
00549     
00550     for(i=0; i<blen; i += tcw_len) {
00551       
00552       HToken_decompressNext(body+i,blen-i,&token,&token_len,&tcw_len,console);
00553       
00554       // Compute the next automaton status
00555       if ((state == 0) && (isalnum(*token)))
00556         next_state = 1;
00557       else if ((state == 1) && (memcmp(token," ",1) == 0) & (token_len == 1))
00558         next_state = 2;
00559       else if ((state == 1) && (isalnum(*token)))
00560         next_state = 1;
00561       else if ((state == 2) && (isalnum(*token)))
00562         next_state = 3;
00563       else next_state = 0;
00564       
00565       free(token);
00566 
00567       // Write the new body file
00568       if ((state == 2) && (next_state != 3)) { // suspended space is written out
00569         memcpy(*body_spaceless + *blen_spaceless, sstring, spacecw_len);
00570         *blen_spaceless += spacecw_len;
00571       }
00572       if (next_state != 2){ // no-singlespace token is written out
00573         memcpy(*body_spaceless + *blen_spaceless, body+i, tcw_len); 
00574         *blen_spaceless += tcw_len;
00575       }
00576       
00577       // Update the automaton statos
00578       if (next_state == 3) 
00579         next_state = 1;
00580       state = next_state;
00581       
00582     }
00583 
00584     // suspended space, at the end of the file, is written out
00585     if (state == 2) { 
00586       memcpy(*body_spaceless + *blen_spaceless, sstring, spacecw_len);
00587       *blen_spaceless += spacecw_len;
00588     }
00589   }
00590 
00591 }
00592 
00593 
00594 
00609 void HDecompress_getBodyAndConsole(char *ctext, int ctext_len, int *text_len, char **body, int *blen, 
00610                                   Console *console)
00611 {
00612   int i;
00613   int len;
00614   Canonical *cano = &(console->canoinfo);
00615   Dictionary *dict = &(console->dictionary);
00616   Jumpers *jump = &(console->jumpers);
00617   HHash_table *hashtable = &(console->hashtable);
00618 
00619 
00620 
00621   if ((ctext == NULL) || (ctext_len <=0)){
00622     fprintf(stderr,"Fatal Error: You're trying to decompress an empty file! (HDecompress_getBodyAndConsole)\n");
00623     exit(-1); }
00624 
00625 
00626   i=0;
00627 
00628   // Reserved byte for storing compression rule
00629   i += 1;
00630 
00631   *text_len = HInt_fromstring(ctext + i); 
00632   i += 4;
00633 
00634   cano->num_tokens = HInt_fromstring(ctext + i); 
00635   i += 4;
00636 
00637   HCanonical_fromstring(ctext + i, cano); 
00638   i += 36;
00639 
00640   HJumpers_fromstring(ctext + i, jump);
00641   i += 8 * jump->number + 4;
00642 
00643   len = HInt_fromstring(ctext + i);  
00644   i += 4;
00645   HDictionary_fromstring(ctext+i,len,cano->num_tokens,dict);
00646   i += dict->length;
00647 
00648   HHashtable_fromdict(dict,cano,hashtable);
00649 
00650   *blen = HInt_fromstring(ctext + i);
00651   i += 4;
00652   *body =  ctext + i;
00653   i += *blen;
00654 
00655   if (i != ctext_len){
00656         fprintf(stderr,"Error: computing the compressed file length (HDecompress_getBodyAndConsole)\n");
00657         exit(-1);  }  
00658 
00659 }
00660 
00661 
00662 
00663 
00674 void HDecompress_expandSpaces(char *body_spaceless, int blen_spaceless, char **body, int *blen, Console *console)
00675 {
00676 
00677   char *token, *sstring;
00678   int token_len, tcw_len, i;
00679   int spacecw, spacecw_len;
00680   char spacecw_string[4];
00681   int state, next_state;
00682 
00683   // To manage the dummy case
00684   if ( (body_spaceless == NULL) || (blen_spaceless <= 0) ) {
00685     *body = NULL;
00686     *blen = 0;
00687     return ;
00688   }
00689 
00690   // To manage a decompression point not aligned to a codeword
00691   *body = (char *) malloc(blen_spaceless * 2 * sizeof(char));
00692   *blen = 0;
00693   if (*body == NULL) {
00694     fprintf(stderr,"Fatal Error: No memory for body (HDecompress_expandSpaces\n");
00695     exit(-1); }
00696 
00697   // Get the codeword for the single space, put it into a string
00698   // Note that the dictionary contain the space if it has been dropped
00699   if(HHashtable_search(" ",1,&(console->hashtable)) == NULL){
00700     *body = body_spaceless;
00701     *blen = blen_spaceless;
00702   } else {
00703   
00704     HCodeword_TaggedFromToken(" ", 1, &(console->hashtable), &spacecw, &spacecw_len); 
00705     HInt_tostring(spacecw_string,spacecw);
00706     sstring = spacecw_string + (4-spacecw_len);
00707 
00708     // state of the computation: 
00709     // 0 = start
00710     // 1 = ALNUM token has been read, 
00711     // 3 = ALNUM token + ALNUM token have been read
00712     
00713     state = 0; 
00714     
00715     for(i=0; i<blen_spaceless; i += tcw_len) {
00716       
00717       HToken_decompressNext(body_spaceless+i,blen_spaceless-i,&token,&token_len,&tcw_len,console);
00718       
00719       // Compute the next automaton status
00720       if ((state == 0) && (isalnum(*token)))
00721         next_state = 1;
00722       else if ((state == 1) && (isalnum(*token)))
00723         next_state = 3;
00724       else next_state = 0;
00725       
00726       free(token);
00727 
00728       // Write the new body file
00729       if (next_state == 3) { // dropped space is written out
00730         memcpy(*body + *blen, sstring, spacecw_len);
00731         *blen += spacecw_len;
00732       }
00733       memcpy(*body + *blen, body_spaceless+i, tcw_len); 
00734       *blen += tcw_len;
00735       
00736       // Update the automaton statos
00737       if (next_state == 3)
00738         next_state = 1;
00739       state = next_state;
00740     }
00741   }
00742 }
00743 
00744 
00745 
00746 
00761 void HDecompress_nextBlock_bytes(char *ctext, int num_bytes, char **text, int *text_len, Console *console)
00762 {
00763   int i;
00764   int token_len, allocated_text;
00765   char *token;
00766   int taggedcwlen;
00767   float resize_factor = 1.4; // 40% more
00768 
00769   // To manage the dummy case
00770   if ( (ctext == NULL) || (num_bytes <= 0) ) {
00771     *text = NULL;
00772     *text_len = 0;
00773     return ;
00774   }
00775 
00776   // To manage the case of decompression point not aligned to a codeword
00777   if (((*ctext) & (0x80)) == 0){
00778     fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_bytes)\n");
00779     exit(-1);  }
00780 
00781   *text_len=0;
00782   allocated_text = 2 * num_bytes;
00783   *text = (char *) malloc(allocated_text * sizeof(char));
00784 
00785   if((*text == NULL) || (num_bytes == 0)){
00786     fprintf(stderr,"Error: memory allocation (HDecompress_nextBlock_bytes)\n");
00787     exit(-1);  }
00788 
00789   for(i=0; i<num_bytes; i+=taggedcwlen){
00790     
00791     // get next token from the compressed text
00792     HToken_decompressNext(ctext+i,num_bytes-i,&token,&token_len,&taggedcwlen,console);
00793 
00794     // Resize the space allocated to the text, if needed
00795     if( (*text_len+token_len) >= allocated_text) {
00796       *text = (char*)realloc(*text,(resize_factor*allocated_text+1+token_len)*sizeof(char));
00797       if ( !(*text) ) { fprintf(stderr, "memory exhausted in HDecompress_nextBlock_bytes\n"); exit(-1); }
00798       allocated_text = allocated_text * resize_factor + 1 + token_len;
00799     }
00800 
00801     // Writes the token into the text
00802     memcpy((*text) + (*text_len),token,token_len);
00803     (*text_len) += token_len;
00804 
00805     free(token);
00806   }
00807 
00808 }
00809 
00810 
00811 
00812 
00813 
00827 void HDecompress_nextBlock_bytes_spaceless(char *ctext, int num_bytes, char **text, 
00828                                           int *text_len, Console *console)
00829 {
00830   int exp_len;
00831   char *exp_text;
00832   
00833 
00834   // To manage the dummy case
00835   if ( (ctext == NULL) || (num_bytes <= 0) ) {
00836     *text = NULL;
00837     *text_len = 0;
00838     return ;
00839   }
00840 
00841   // To manage the case of decompression point not aligned to a codeword
00842   if (((*ctext) & (0x80)) == 0){
00843     fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_bytes_spaceless)\n");
00844     exit(-1);  }
00845 
00846   // Reinsert the spaces
00847   // Even if num_bytes is not alligned to a codeword, the procedure re-aligns it
00848    HDecompress_expandSpaces(ctext, num_bytes, &exp_text, &exp_len,console);
00849 
00850    // Decompresses enough text
00851    HDecompress_nextBlock_bytes(exp_text, exp_len, text, text_len, console);
00852 
00853 }
00854 
00855 
00856 
00857 
00858 
00859 
00875 void HDecompress_previousBlock_bytes(char *ctext, int num_bytes, char **text, int *text_len, 
00876                                      Console *console)
00877 {
00878   int i;
00879   int token_len, allocated_text;
00880   char *token;
00881   int taggedcwlen;
00882   float resize_factor = 1.4; // 40% more
00883 
00884 
00885   // To manage the dummy case
00886   if ( (ctext == NULL) || (num_bytes <= 0) ) {
00887     *text = NULL;
00888     *text_len = 0;
00889     return ;
00890   }
00891 
00892   // To manage the case of decompression point not aligned to a codeword
00893   if (((*ctext) & (0x80)) == 0){
00894     fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_bytes)\n");
00895     exit(-1);  }
00896 
00897   *text_len=0;
00898   allocated_text = 2 * num_bytes;
00899   *text = (char *) malloc(allocated_text * sizeof(char));
00900 
00901   if((*text == NULL) || (num_bytes == 0)){
00902     fprintf(stderr,"Error: memory allocation (HDecompress_previousBlock_bytes)\n");
00903     exit(-1);  }
00904 
00905   // compute the starting pos of the window
00906   for(i=0; i<num_bytes; i+=taggedcwlen){
00907     
00908     // get previous token from the compressed text
00909     HToken_decompressPrevious(ctext-i,num_bytes-i,&token,&token_len,&taggedcwlen,console);
00910 
00911     free(token);
00912 
00913   }
00914 
00915   i -= taggedcwlen;
00916 
00917   for(; i>=0; i-=taggedcwlen){
00918     
00919     // get next token from the compressed text
00920     HToken_decompressNext(ctext-i,num_bytes-i,&token,&token_len,&taggedcwlen,console);
00921 
00922     // Resize the space allocated to the text, if needed
00923     if( (*text_len+token_len) >= allocated_text) {
00924       *text = (char*)realloc(*text,(resize_factor*allocated_text+1+token_len)*sizeof(char));
00925       if ( !(*text) ) { fprintf(stderr, "memory exhausted in HDecompress_previousBlock_bytes\n"); exit(-1); }
00926       allocated_text = allocated_text * resize_factor + token_len + 1;
00927     }
00928 
00929     // Writes the token into the text
00930     memcpy((*text) + (*text_len),token,token_len);
00931     (*text_len) += token_len;
00932     
00933     free(token);
00934   }
00935 
00936 }
00937 
00938 
00952 void HDecompress_previousBlock_bytes_spaceless(char *ctext, int num_bytes, char **text, 
00953                                               int *text_len, Console *console)
00954 {
00955   int exp_len;
00956   char *exp_text;
00957   
00958 
00959   // To manage the dummy case
00960   if ( (ctext == NULL) || (num_bytes <= 0) ) {
00961     *text = NULL;
00962     *text_len = 0;
00963     return ;
00964   }
00965 
00966   if (((*ctext) & (0x80)) == 0){
00967     fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_bytes_spaceless)\n");
00968     exit(-1);  }
00969 
00970 
00971   // Aligns to a tagged byte
00972   while( ( (*(ctext-num_bytes)) & (0x80) ) == 0){
00973     num_bytes++;
00974   }
00975    
00976 
00977   // Reinsert the spaces
00978    HDecompress_expandSpaces(ctext-num_bytes, num_bytes, &exp_text, &exp_len,console);
00979 
00980    // Decompresses enough text
00981    HDecompress_nextBlock_bytes(exp_text, exp_len, text, text_len, console);
00982 
00983 }
00984 
00985 
00986 
00987 
01004 void HDecompress_nextBlock_tokens(char *ctext, int bytes_left, int num_obj, 
01005                                   char **text, int *text_len, int *decoded,
01006                                   Console *console)
01007 {
01008   int token_len;
01009   char *token;
01010   int taggedcwlen;
01011 
01012 
01013   // To manage the dummy case
01014   if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01015     *text = NULL;
01016     *text_len = 0;
01017     *decoded = 0;
01018     return ;
01019   }
01020 
01021   // To manage a decompression point not aligned to a codeword
01022   if (((*ctext) & (0x80)) == 0){
01023     fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_tokens)\n");
01024     exit(-1);  }
01025 
01026   for(*decoded=0; (*decoded < bytes_left) && (num_obj > 0); *decoded += taggedcwlen){
01027     
01028     // get next token from the compressed text
01029     HToken_decompressNext(ctext + *decoded,bytes_left-*decoded,
01030                           &token,&token_len,&taggedcwlen,console);
01031 
01032     if (isalnum(token[0])){
01033       num_obj--;
01034     }
01035 
01036     free(token);
01037   }
01038 
01039   // Decompresses just the succeeding "i" bytes that contain the "num_object" words
01040   HDecompress_nextBlock_bytes(ctext,*decoded,text,text_len,console);
01041  
01042 }
01043 
01044 
01045 
01046 
01061 void HDecompress_nextBlock_tokens_spaceless(char *ctext, int bytes_left, int num_obj, 
01062                                             char **text, int *text_len, int *decoded,
01063                                             Console *console)
01064 {
01065   int text_tmp_len,exp_len;
01066   char *text_tmp, *exp_text;
01067   
01068 
01069   // To manage the dummy case
01070   if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01071     *text = NULL;
01072     *text_len = 0;
01073     *decoded = 0;
01074     return ;
01075   }
01076 
01077   // To manage a decompression point not aligned to a codeword
01078   if (((*ctext) & (0x80)) == 0){
01079     fprintf(stderr,"Error: not a tagged byte (HDecompress_nextBlock_tokens_spaceless)\n");
01080     exit(-1);  }
01081 
01082   // Determines the window to expand (spaces are not alphanumeric tokens....)
01083   HDecompress_nextBlock_tokens(ctext,bytes_left,num_obj, 
01084                               &text_tmp,&text_tmp_len,decoded,console);
01085 
01086   // Reinsert the spaces
01087   HDecompress_expandSpaces(ctext,*decoded, &exp_text, &exp_len,console);
01088 
01089 
01090   // Decompresses just the succeeding exp_len bytes that contain the "num_object" words
01091   HDecompress_nextBlock_bytes(exp_text,exp_len,text,text_len,console);
01092  
01093 }
01094 
01095 
01096 
01097 
01098 
01115 void HDecompress_previousBlock_tokens(char *ctext, int bytes_left, int num_obj, 
01116                                      char **text, int *text_len, int *decoded,
01117                                      Console *console)
01118 {
01119   int token_len;
01120   char *token;
01121   int taggedcwlen;
01122 
01123 
01124   // To manage the dummy case
01125   if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01126     *text = NULL;
01127     *text_len = 0;
01128     *decoded = 0;
01129     return ;
01130   }
01131 
01132   // To manage a decompression point not aligned to a codeword
01133   if (((*ctext) & (0x80)) == 0){
01134     fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_tokens)\n");
01135     exit(-1);  }
01136 
01137   for(*decoded=0; (*decoded < bytes_left)  && (num_obj > 0); *decoded += taggedcwlen){
01138     
01139     // get next token from the compressed text
01140     HToken_decompressPrevious(ctext - *decoded, bytes_left - *decoded,
01141                              &token,&token_len,&taggedcwlen,console);
01142     if (isalnum(token[0])){
01143       num_obj--;
01144     }
01145 
01146     free(token);
01147   }
01148 
01149   if (((*(ctext - *decoded)) & (0x80)) ==0){
01150     fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_Tokens)\n");
01151     exit(-1);  }
01152 
01153   // Decompresses just the previous "i" bytes that contain the "num_object" words
01154   HDecompress_nextBlock_bytes(ctext - *decoded, *decoded,text,text_len,console);
01155  
01156 }
01157 
01158 
01159 
01160 
01176 void HDecompress_previousBlock_tokens_spaceless(char *ctext, int bytes_left, int num_obj, 
01177                                                char **text, int *text_len, int *decoded,
01178                                                Console *console)
01179 {
01180   int text_tmp_len, exp_len;
01181   char *text_tmp, *exp_text;
01182   
01183 
01184   // To manage the dummy case
01185   if ( (ctext == NULL) || (bytes_left <= 0) || (num_obj <= 0) ) {
01186     *text = NULL;
01187     *text_len = 0;
01188     *decoded = 0;
01189     return ;
01190   }
01191 
01192   // To manage a decompression point not aligned to a codeword
01193   if (((*ctext) & (0x80)) == 0){
01194     fprintf(stderr,"Error: not a tagged byte (HDecompress_previousBlock_tokens_spaceless)\n");
01195     exit(-1);  }
01196 
01197 
01198   // Determines the window to expand (spaces are not alphanumeric tokens....)
01199   HDecompress_previousBlock_tokens(ctext,bytes_left,num_obj, 
01200                                   &text_tmp,&text_tmp_len,decoded,console);
01201 
01202   // Reinsert the spaces
01203   HDecompress_expandSpaces(ctext-*decoded, *decoded, &exp_text, &exp_len, console);
01204 
01205 
01206   // Decompresses just the succeeding exp_len bytes that contain the "num_object" words
01207   HDecompress_nextBlock_bytes(exp_text,exp_len,text,text_len,console);
01208  
01209 }
01210 
01211 
01212 
01213 
01214 
01215 
01216 
01217 
01218 /* ------------------------------------------------------ */
01219 /* -------------------  Token operations ---------------- */
01220 /* ------------------------------------------------------ */
01221 
01222 
01223 
01224 
01235 void HToken_RankFromPlainCw(int plaincw, int plaincwlen, Canonical *cano, 
01236                                 int *token_rank) 
01237 {  
01238   int len = plaincwlen / 7;
01239 
01240   if((plaincwlen % 7) != 0){
01241     fprintf(stderr,"Error: HToken_RankFromPlainCw - length no 7-multiple\n");
01242     exit(-1);  }
01243 
01244   *token_rank = cano->offsetcw[len] + plaincw - cano->firstcw[len];
01245 
01246 }
01247 
01248 
01260 void HToken_RankFromTaggedCw(int taggedcw,int taggedcwlen,Canonical *cano,int *token_rank) 
01261 {  
01262   int plaincw,plaincwlen;
01263 
01264   if((taggedcwlen > 4) || (taggedcwlen < 1)){
01265     fprintf(stderr,"Error: HToken_RankFromTaggedCw - length no in [1,4]\n");
01266     exit(-1);  }
01267 
01268   HCodeword_PlainFromTagged(taggedcw,taggedcwlen,&plaincw,&plaincwlen);
01269 
01270   if((plaincwlen % 7) != 0){
01271     fprintf(stderr,"Error: length plain no 7-multiple (HToken_RankFromTaggedCw)\n");
01272     exit(-1);  }
01273 
01274   HToken_RankFromPlainCw(plaincw,plaincwlen,cano,token_rank);
01275 
01276 }
01277 
01278 
01279 
01290 void HToken_fromTaggedCw(int taggedcw, int taggedcwlen, 
01291                         Canonical *cano, Dictionary *dict,
01292                         char **token, int *token_len) 
01293 {  
01294   int token_rank;
01295   int token_startpos;
01296 
01297   HToken_RankFromTaggedCw(taggedcw,taggedcwlen,cano,&token_rank);
01298 
01299   token_startpos = dict->start_pos[token_rank];
01300   if ((token_startpos < 0) || (token_startpos >= dict->length)){
01301     fprintf(stderr,"Error: token_startpos computation (HToken_fromTaggedCw)\n");
01302     exit(-1);  }  
01303     
01304   *token_len=1; // jump the first char which does exist
01305 
01306   while(dict->content[token_startpos + (*token_len)] != '\n') 
01307     (*token_len)++;
01308 
01309   *token = (char *) malloc(*token_len);
01310   memcpy(*token,dict->content + token_startpos, *token_len);
01311 
01312   if(*token == NULL) {
01313     fprintf(stderr,"Error: memory allocation for token (HToken_FromTaggedCw)\n");
01314     exit(-1);  }  
01315 
01316 }
01317 
01318 
01333 int HToken_decompressNext(char *s, int num_byte_left, char **token, int *lentoken, int *lencw, 
01334                          Console *console)
01335 {
01336 
01337   int taggedcw;
01338 
01339   if(num_byte_left <= 0) return 0;
01340 
01341   // get the next tagged codeword from the compressed text
01342   HCodeword_TaggedGetNext(s,num_byte_left,&taggedcw,lencw);
01343 
01344   // compute its token
01345   HToken_fromTaggedCw(taggedcw,*lencw,&(console->canoinfo),&(console->dictionary),token,lentoken);
01346 
01347   return 1;
01348 
01349 }
01350 
01351 
01366 int HToken_decompressPrevious(char *s, int num_byte_left, char **token, int *lentoken, int *lencw,
01367                              Console *console)
01368 {
01369 
01370   int taggedcw;
01371 
01372   if(num_byte_left <= 0) return 0;
01373 
01374   // get the previous tagged codeword from the compressed text
01375   HCodeword_TaggedGetPrevious(s,num_byte_left,&taggedcw,lencw);
01376   
01377   // compute its token
01378   HToken_fromTaggedCw(taggedcw,*lencw,&(console->canoinfo),&(console->dictionary),token,lentoken);
01379 
01380   return 1;
01381 
01382 }
01383 
01384 
01385 
/**
 * Computes the length of the token starting at s.
 *
 * A token is either a maximal run of alphanumeric characters, a maximal run
 * of separators that does not contain '\n', or a single '\n'.
 *
 * @param s              Start of the token.
 * @param num_char_left  Number of characters available from s; must be > 0.
 * @param len            Out: token length, in [1, num_char_left].
 *
 * Exits the program if num_char_left <= 0.
 */
void HToken_getLengthNext(char *s, int num_char_left, int *len)
{
  // <ctype.h> functions require a value representable as unsigned char:
  // plain char may be negative for bytes >= 0x80, which is undefined there.
  const unsigned char *p = (const unsigned char *) s;

  *len = 0;

  if (num_char_left <= 0) {
    fprintf(stderr,"Error: HToken_getLengthNext\n");
    exit(-1); } 

  if (!isalnum(*p)) {   // token is a sequence of separators (not '\n') or a single '\n'

    while ( (*len < num_char_left) && (!isalnum(*p)) && ((*p) != '\n')) {
      (*len)++; p++; }

    // Parsed just one '\n'
    if (*len == 0)  *len = 1;

  } else { // token is a sequence of ALNUM chars

    while ( (*len < num_char_left) && isalnum(*p) ){
      (*len)++;  p++; }
  }

  if(*len <= 0){
    fprintf(stderr,"Error: HToken_getLengthNext\n");
    exit(-1);  }
}
01425 
01426 
01427 
01428 /* ------------------------------------------------------ */
01429 /* -----------------  Codeword operations --------------- */
01430 /* ------------------------------------------------------ */
01431 
01432 
/**
 * Converts a tagged codeword into its plain form by keeping the low 7 bits
 * of every byte and packing them together.
 *
 * @param taggedcw     Tagged codeword (one byte per 8 bits of the int).
 * @param taggedcwlen  Length in bytes; must be in [1,4].
 * @param plaincw      Out: plain codeword.
 * @param plaincwlen   Out: plain length in bits (7 * taggedcwlen).
 *
 * Exits the program if the length is out of range.
 */
void HCodeword_PlainFromTagged(int taggedcw, int taggedcwlen, int *plaincw, int *plaincwlen) 
{
  int packed = 0;
  int bits = 0;   // expressed in bits
  int byte_no;    // expressed in bytes

  *plaincw = 0;
  *plaincwlen = 0;

  if (!((taggedcwlen >= 1) && (taggedcwlen <= 4))) {
    fprintf(stderr,"Error: HCodeword_PlainFromTagged - length no in [1,4]\n");
    exit(-1);
  }

  // Consume the bytes from the least significant one upward
  for (byte_no = 0; byte_no < taggedcwlen; byte_no++) {
    packed += (taggedcw & 0x7F) << bits;
    taggedcw >>= 8;
    bits += 7;
  }

  *plaincw = packed;
  *plaincwlen = bits;
}
01460 
01461 
01472 void HCodeword_PlainFromTokenrank(int token_rank,Canonical *cano,int *plaincw,int *plaincwlen) 
01473 {  
01474   int i;
01475 
01476   // codewords stored by decreasing length in the dictionary
01477   for(i=1; (i < 5) && (cano->offsetcw[i] > token_rank); i++)
01478     ;
01479 
01480   *plaincw = cano->firstcw[i] + (token_rank - cano->offsetcw[i]);
01481   *plaincwlen= 7 * i;
01482 
01483 }
01484 
01485 
01486 
01487 
/**
 * Converts a plain codeword into its tagged form: every group of 7 bits is
 * stored in its own byte and the most significant byte gets its top bit set.
 *
 * @param plaincw      Plain codeword.
 * @param plaincwlen   Length in bits; must be a multiple of 7 in [7,28].
 * @param taggedcw     Out: tagged codeword.
 * @param taggedcwlen  Out: tagged length in bytes (plaincwlen / 7).
 *
 * Exits the program on an invalid length.
 */
void HCodeword_TaggedFromPlain(int plaincw, int plaincwlen, int *taggedcw, int *taggedcwlen) 
{  
  // Work on unsigned values: the tag of a 4-byte codeword lands on bit 31,
  // which cannot be produced by shifting a signed int.
  unsigned int plain = (unsigned int) plaincw;
  unsigned int tagged = 0;
  int nbytes = 0;

  *taggedcw=0;
  *taggedcwlen=0;

  // A non-positive length would make the tag shift negative, and more than
  // four 7-bit groups do not fit in an int.
  if(((plaincwlen % 7) != 0) || (plaincwlen <= 0) || (plaincwlen > 28)){
    fprintf(stderr,"Error: HCodeword_TaggedFromPlain, len %d\n",plaincwlen);
    exit(-1);  }

  while (plaincwlen > 0) {
    tagged |= ((plain & 0x7Fu) << (8 * nbytes));
    nbytes++;
    plain >>= 7;
    plaincwlen -=7;
  }
  
  // Tag the most significant byte
  tagged |= (0x80u << (8 * (nbytes - 1)));

  *taggedcw = (int) tagged;
  *taggedcwlen = nbytes;
}
01515 
01525 void HCodeword_TaggedFromTokenrank(int token_rank,Canonical *cano, int *taggedcw, int *taggedcwlen) 
01526 {  
01527   int plaincw,plaincwlen;
01528 
01529   HCodeword_PlainFromTokenrank(token_rank,cano,&plaincw,&plaincwlen);
01530 
01531   if((plaincwlen % 7) != 0){
01532     fprintf(stderr,"Error: HCodeword_TaggedFromTokenrank -length plain no 7-multiple\n");
01533     exit(-1);  }
01534 
01535   HCodeword_TaggedFromPlain(plaincw,plaincwlen,taggedcw,taggedcwlen);
01536 
01537   if((*taggedcwlen > 4) || (*taggedcwlen < 1)){
01538     fprintf(stderr,"Error: HCodeword_TaggedFromTokenrank - length tagged no in [1,4]\n");
01539     exit(-1);  }
01540 
01541 }
01542 
01543 
01544 
01553 void HCodeword_TaggedFromToken(char *token,int token_len,HHash_table *ht,int *taggedcw, int *taggedcwlen) 
01554 {  
01555   Hash_node *p;
01556 
01557   if((p=HHashtable_search(token,token_len,ht)) == NULL){
01558     fprintf(stderr,"Error: token not found (HCodeword_TaggedFromToken)\n");
01559     exit(-1);  }  
01560 
01561   *taggedcw = p->codeword;
01562   *taggedcwlen = p->cw_len;
01563 
01564 }
01565 
01566 
/**
 * Reads the tagged codeword that starts at s.
 *
 * The codeword is the tagged byte at s followed by every untagged byte up
 * to the next tagged byte or the end of the available bytes. The first byte
 * ends up in the most significant position of the result.
 *
 * @param s              Start of the codeword; must be a tagged byte.
 * @param num_byte_left  Number of bytes available from s; must be > 0.
 * @param taggedcw       Out: tagged codeword.
 * @param lencw          Out: number of bytes consumed.
 *
 * Exits the program if no byte is left or the first byte is not tagged.
 * The length is not limited to 4 here; callers validate it.
 */
void HCodeword_TaggedGetNext(char *s, int num_byte_left, int *taggedcw, int *lencw)
{
  // Read bytes as unsigned: with a signed char the tagged byte (>= 0x80) is
  // negative and would be sign-extended into the codeword.
  const unsigned char *bytes = (const unsigned char *) s;
  unsigned int cw;
  int len;

  // Validate the byte count before touching the buffer
  if (num_byte_left <= 0) {
    fprintf(stderr,"Error: no byte left (HCodeword_TaggedGetNext)\n");
    exit(-1); } 

  if ((bytes[0] & 0x80) == 0) { // First byte is not tagged
    fprintf(stderr,"Error: no tagged byte (HCodeword_TaggedGetNext)\n");
    exit(-1); } 

  cw = bytes[0];
  len = 1;

  while ( (len < num_byte_left) && ((bytes[len] & 0x80) == 0) ) {
    cw = (cw << 8) | bytes[len];
    len++;
  }

  *taggedcw = (int) cw;
  *lencw = len;
}
01601 
01602 
/**
 * Returns the length in bytes of the tagged codeword that starts at s,
 * without decoding it.
 *
 * @param s              Start of the codeword; must be a tagged byte.
 * @param num_byte_left  Number of bytes available from s; must be > 0.
 *
 * @return 1 plus the number of untagged bytes that follow s, never more
 *         than num_byte_left.
 *
 * Exits the program if no byte is left or the first byte is not tagged.
 */
int HCodeword_TaggedGetNextLength(char *s, int num_byte_left)
{
  int lencw;

  // Validate the byte count before touching the buffer
  if (num_byte_left <= 0) { // No bytes left
    fprintf(stderr,"Error: no byte left (HCodeword_TaggedGetNextLength)\n");
    exit(-1); } 

  if (((*s) & 0x80) == 0) { // First byte is not tagged
    fprintf(stderr,"Error: no tagged byte (HCodeword_TaggedGetNextLength)\n");
    exit(-1); } 

  lencw=1;
  s++;  // skip the tagged byte

  while ( (lencw < num_byte_left) && ( ((*s) & 0x80) == 0 )) {
    lencw++;
    s++;
  }

  return lencw;
}
01632 
01633 
/**
 * Reads the tagged codeword that ends just before s, scanning backward.
 *
 * Untagged bytes before s are collected until a tagged byte is met (or
 * num_byte_left bytes have been consumed); the byte reached is then taken
 * as the tagged first byte of the codeword.
 *
 * @param s              Position right after the codeword; *s itself must
 *                       be a tagged byte.
 * @param num_byte_left  Bound on the backward scan; must be > 0.
 * @param taggedcw       Out: tagged codeword (first byte most significant).
 * @param lencw          Out: codeword length in bytes.
 *
 * Exits the program if no byte is left or *s is not tagged.
 * NOTE(review): when the scan stops because of num_byte_left, the byte read
 * as "the tagged byte" is not verified to be tagged; confirm with callers.
 */
void HCodeword_TaggedGetPrevious(char *s, int num_byte_left, int *taggedcw, int *lencw)
{
  // Read bytes as unsigned: with a signed char the tagged byte (>= 0x80) is
  // negative, so shifting it is undefined and sign-extends the codeword.
  const unsigned char *p = (const unsigned char *) s;
  unsigned int cw = 0;
  int len = 0;

  // Validate the byte count before touching the buffer
  if (num_byte_left <= 0) {
    fprintf(stderr,"Error: no byte left (HCodeword_TaggedGetPrevious)\n");
    exit(-1); } 

  if (((*p) & 0x80) == 0) { // First byte is not tagged
    fprintf(stderr,"Error: no tagged byte (HCodeword_TaggedGetPrevious)\n");
    exit(-1); } 

  p--; // go to the previous byte

  while ( (len < num_byte_left) && ( ((*p) & 0x80) == 0 )) {
    if (len < 4)   // bytes beyond the 4th cannot be stored (callers reject len > 4)
      cw |= ((unsigned int) *p) << (8 * len);
    len++;
    p--;
  }

  // the tagged byte
  if (len < 4)
    cw |= ((unsigned int) *p) << (8 * len);
  len++;

  *taggedcw = (int) cw;
  *lencw = len;
}
01672 
01673 
01674 
01675 
/**
 * Writes a tagged codeword into a newly allocated string, most significant
 * byte first, followed by a '\0'.
 *
 * @param taggedcw     Tagged codeword.
 * @param taggedcwlen  Length in bytes; must be in [1,4].
 * @param s            Out: allocated buffer of taggedcwlen + 1 bytes; the
 *                     caller must free() it.
 *
 * Exits the program on an invalid length or allocation failure.
 */
void HCodeword_tostring(int taggedcw, int taggedcwlen, char **s) 
{  
  int i;

  // Validate the length before using it as an allocation size
  if((taggedcwlen < 1) || (taggedcwlen > 4)){
    fprintf(stderr,"Error: HCodeword_tostring\n");
    exit(-1);  }

  *s = (char *) malloc((taggedcwlen+1) * sizeof(char));

  if(*s == NULL){
    fprintf(stderr,"Error: HCodeword_tostring\n");
    exit(-1);  }

  // Copy the codeword rightward: exactly taggedcwlen bytes, so the shift
  // amount never becomes negative.
  for (i = 0; i < taggedcwlen; i++)
    (*s)[i] = (char) (((unsigned int) taggedcw >> (8 * (taggedcwlen - 1 - i))) & 0xFF);

  // End the string properly
  (*s)[taggedcwlen]='\0';

}
01705 
01706 
01707 
01708 /* ---------------------------------------------------- */
01709 /* -------------------  Hash Table    ----------------- */
01710 /* ---------------------------------------------------- */
01711 
01712 
01713 
01722 void HHashtable_init(HHash_table *ht, int n)
01723 {
01724   
01725   int i;
01726 
01727   ht->size  = (int) 1.2 * n + 13;     // Load factor ~ 80%
01728 
01729   ht->table = (Hash_nodeptr_array) malloc(ht->size * sizeof(Hash_node *));
01730 
01731   ht->card = 0;
01732   if (ht->table == NULL) {
01733     fprintf(stderr,"Fatal Error: Hash table allocation\n");
01734     exit(-1); }
01735 
01736   for (i=0; i < ht->size; i++)
01737     ht->table[i] = NULL;
01738 }
01739 
01740 
01747 void HHashtable_print(HHash_table *ht)
01748 {  
01749   int i,plaincw,lcw;
01750   Hash_node *p;
01751 
01752   printf("\n\n================== Hash Table ====================\n");
01753   printf("Table size = %d, number of stored objects = %d\n\n",ht->size,ht->card);
01754 
01755   for(i=0; i<ht->size;i++){
01756     for(p = ht->table[i]; p ; p = p->next){
01757       printf("token = \"");
01758       HPrint_string(p->str,p->len_str);
01759       printf("\"");
01760       HCodeword_PlainFromTagged(p->codeword,p->cw_len,&plaincw,&lcw);
01761       printf(" tokenlen = %d count = %d tagged cw = %x cw_len %d\n", 
01762              p->len_str, p->count_occ,plaincw,lcw);
01763     }
01764   }
01765 }
01766 
01767 
01768 
01776 int HHashtable_func(char *s, int len, HHash_table *ht)
01777 {   
01778   register int hfn;
01779   int hfi;
01780   int table_size = ht->size;
01781 
01782   hfn = 11;
01783   for (hfi=0; hfi<len ; hfi++)
01784     hfn = hfn ^ ((hfn<<5) + (hfn>>2) + (unsigned char) *s++);
01785   hfn = abs(hfn % table_size);
01786   return(hfn);
01787 }
01788 
01789 
01795 Hash_node *HHashtable_search(char *s, int slen, HHash_table *ht)
01796 {   
01797   Hash_node *hsp;  
01798 
01799   for (hsp = ht->table[HHashtable_func(s,slen,ht)]; hsp; hsp = hsp->next)
01800     if ((slen == hsp->len_str)  && (memcmp(s,hsp->str,slen) == 0)) 
01801         return(hsp);
01802   return((Hash_node *)0);
01803 }
01804 
01805 
01806 
01815 int HHashtable_insert(char *s, int slen, HHash_table *ht)   
01816 {   
01817   int hiv;
01818   Hash_node *hip;
01819   Hash_node *table_head;
01820   Hash_node *hip_pred;
01821   char *token;
01822 
01823   hiv = HHashtable_func(s,slen,ht);  // compute the hash value
01824   hip = HHashtable_search(s,slen,ht);   // check string occurrence
01825 
01826   if (hip) {
01827     hip->count_occ += 1;
01828     
01829     // move-to-front the searched token
01830     if (ht->table[hiv] != hip){
01831 
01832       // Find hip predecessor which surely does exist
01833       // because hip is not the head of the list
01834       for (hip_pred = ht->table[hiv]; hip_pred->next != hip; 
01835            hip_pred = hip_pred->next) ;
01836 
01837       hip_pred->next = hip->next;   // jump hip in the old list
01838       table_head = ht->table[hiv];   // save the old head of the list-entry
01839       ht->table[hiv] = hip;        // hip is the new head of the list
01840       hip->next = table_head;         
01841     }
01842 
01843     return(0);
01844 
01845   } else {    //------ The token is new -------
01846     hip = (Hash_node *) malloc(sizeof(Hash_node));
01847 
01848     token = malloc(slen);
01849     memcpy(token,s,slen);
01850 
01851     if (hip == NULL){
01852       fprintf(stderr,"Error: Insert hash table\n");
01853           exit(-1); } 
01854 
01855     hip->len_str = slen;
01856     hip->count_occ = 1;
01857     hip->str = token;
01858     hip->next = ht->table[hiv];
01859     ht->table[hiv] = hip;
01860     ht->card += 1;
01861     return(1);
01862   }
01863 }
01864 
01865 
01871 void HHashtable_clear(HHash_table *ht)
01872 {
01873   int i;
01874 
01875   for ( i = 0; i < ht->size; i++ ) {
01876     Hash_node *hn = ht->table[i];
01877     while ( hn ) {
01878       Hash_node *toFree = hn;
01879       hn = hn->next;
01880       free(toFree->str);
01881       free(toFree);
01882     }
01883   }
01884   ht->card = 0;
01885   ht->size = 0;
01886   free(ht->table);
01887 }
01888 
01889 
01900 void HHashtable_fromdict(Dictionary *dict, Canonical *cano, HHash_table *ht)
01901 {
01902 
01903   int token_len,i,j,l;
01904   char *token;
01905   Hash_node *p;
01906 
01907 
01908   // Creates the hash table for the tokens and their infos
01909   HHashtable_init(ht,dict->num_tokens);
01910 
01911   // Initializes the data structures
01912   for(i=0,j=0; i<dict->length; j++){
01913 
01914     i++;
01915     while (dict->content[i] != '\n')
01916       i++;
01917     // this is the token
01918     token_len = i -  dict->start_pos[j];
01919     token = dict->content + dict->start_pos[j];  // no memory allocation
01920 
01921     // Update the hash table
01922     HHashtable_insert(token,token_len,ht);
01923     if((p=HHashtable_search(token,token_len,ht)) == NULL){
01924       fprintf(stderr,"Error: token not found (HHashtable_fromdict)\n");
01925       exit(-1);  }
01926 
01927     HCodeword_TaggedFromTokenrank(j,cano,&(p->codeword),&l);
01928     p->cw_len = l; //since we store #bytes
01929 
01930     i++;
01931   }
01932 
01933   if(j != dict->num_tokens) {
01934     fprintf(stderr,"Error: num tokens in dictionary (HHashtable_fromdict)\n");
01935     exit(-1);  }
01936 
01937 }
01938 
01939 
01940 /* ------------------------------------------------------ */
01941 /* ---------------------  Dictionary  ------------------- */
01942 /* ------------------------------------------------------ */
01943 
01954 void HDictionary_fromstring(char *s, int slen, int stokens, Dictionary *dict)
01955 {
01956 
01957   int i,j;
01958 
01959 
01960   dict->length = slen;
01961   dict->num_tokens=stokens;
01962 
01963   dict->content = s;
01964   if(dict->content == NULL) {
01965     fprintf(stderr,"Error: memory allocation (HDictionary_fromstring)\n");
01966     exit(-1);  }  
01967 
01968   // Creates the array for the starting positions
01969   dict->start_pos = (int *) malloc(dict->num_tokens * sizeof(int));
01970   if(dict->start_pos == NULL) {
01971     fprintf(stderr,"Error: memory allocation (HDictionary_fromstring)\n");
01972     exit(-1);  }
01973 
01974 
01975   // Initializes the data structures
01976   for(i=0,j=0; i<dict->length; j++){
01977     dict->start_pos[j] = i;
01978     i++;
01979     while (dict->content[i] != '\n')
01980       i++;
01981     i++;
01982   }
01983 
01984   if(j != dict->num_tokens) {
01985     fprintf(stderr,"Error: num tokens in dictionary (HDictionary_fromstring)\n");
01986     exit(-1);  }
01987 
01988 }
01989 
01998 void HDictionary_fromtree(Hash_nodeptr_array tree, int num_tokens, Dictionary *dict)
01999 {
02000 
02001   int i,j,k;
02002 
02003 
02004   dict->length = 0;
02005   dict->num_tokens=num_tokens;
02006   
02007   for(i=0; i<dict->num_tokens; i++)
02008     dict->length +=  tree[i]->len_str + 1;
02009 
02010   dict->content = (char *) malloc( (dict->length + 1) * sizeof(char));
02011 
02012   if(dict->content == NULL) {
02013     fprintf(stderr,"Error: memory allocation (HDictionary_fromtree)\n");
02014     exit(-1);  }  
02015 
02016   for(i=0,j=0; i<dict->num_tokens; i++){
02017     for(k=0; k < tree[i]->len_str; k++) 
02018       dict->content[j++] = tree[i]->str[k]; 
02019     dict->content[j++] = '\n';
02020   }
02021   dict->content[j]='\0';
02022 
02023   if(j != dict->length) {
02024     fprintf(stderr,"Error: dictionary length (HDictionary_fromtree)\n");
02025     exit(-1);  }
02026 
02027   // Creates the array for the starting positions
02028   dict->start_pos = (int *) malloc(dict->num_tokens * sizeof(int));
02029 
02030   if(dict->start_pos == NULL) {
02031     fprintf(stderr,"Error: memory allocation (HDictionary_fromtree)\n");
02032     exit(-1);  }
02033 
02034   // Initializes the data structures
02035   for(i=0,j=0; i<dict->length; j++){
02036     dict->start_pos[j] = i;
02037     i++;
02038     while (dict->content[i] != '\n')
02039       i++;
02040     i++;
02041   }
02042 
02043   if(j != dict->num_tokens) {
02044     fprintf(stderr,"Error: num tokens in dictionary (HDictionary_fromtree)\n");
02045     exit(-1);  }
02046 
02047 }
02048 
02049 
02058 void HDictionary_print(Dictionary *dict, HHash_table *ht, int Verbose)
02059 {
02060 
02061   int j,plaincw,lcw;
02062   Hash_node *p;
02063 
02064   printf("\n\n================== Dictionary ====================\n");
02065   printf("Number of tokens = %d, string length = %d\n\n",dict->num_tokens,dict->length);
02066 
02067   for(j=0;j<dict->num_tokens - 1;j++){
02068     printf("\n");
02069     HPrint_string(dict->content+dict->start_pos[j],dict->start_pos[j+1]-dict->start_pos[j]-1);
02070 
02071     if(Verbose) {
02072       if((p=HHashtable_search(dict->content+dict->start_pos[j],dict->start_pos[j+1]-dict->start_pos[j]-1,ht)) 
02073          == NULL){
02074         fprintf(stderr,"Error: token not found (HDictionary_print)\n");
02075         exit(-1);  }
02076       HCodeword_PlainFromTagged(p->codeword,p->cw_len,&plaincw,&lcw);  
02077       printf(":plain_cw = %x cwlen %d (bits)",plaincw,lcw);
02078     }
02079   }
02080 
02081   printf("\n"); // Printing the last dict item
02082   HPrint_string(dict->content+dict->start_pos[j],dict->length - dict->start_pos[j]-1);
02083 
02084   if(Verbose) {
02085     if((p=HHashtable_search(dict->content+dict->start_pos[j],dict->length - dict->start_pos[j]-1,ht)) == NULL){
02086       fprintf(stderr,"Error: token not found (HDictionary_print)\n");
02087       exit(-1);  }  
02088     HCodeword_PlainFromTagged(p->codeword,p->cw_len,&plaincw,&lcw);  
02089     printf(":plain_cw = %x cwlen %d (bits)\n\n",plaincw,lcw);
02090   }
02091 } 
02092 
02093 
02094 
02095 
02096 /* -------------------------------------------------------- */
02097 /* ----------------  Jumpers data structure --------------- */
02098 /* -------------------------------------------------------- */
02099 
02100 
02112 void HJumpers_fromds(char *text, int tlen, HHash_table *ht, 
02113                     int jump_value, Jumpers *jumpers)
02114 {
02115   int text_pos,ctext_pos,i;
02116   int token_len;
02117   int taggedcw;
02118   int taggedcwlen;
02119   int current_jump;
02120 
02121   jumpers->number = 0;
02122 
02123   if (jump_value != 0) {
02124     jumpers->number = tlen / jump_value + 1;
02125 
02126   jumpers->text_offsets = (int *) malloc(jumpers->number * sizeof(int));
02127   jumpers->ctext_offsets =  (int *) malloc(jumpers->number * sizeof(int));
02128 
02129   if((jumpers->text_offsets == NULL) || (jumpers->ctext_offsets == NULL)){
02130     fprintf(stderr,"Error: memory allocation (HJumpers_fromds)\n");
02131     exit(-1);  }
02132 
02133   current_jump = 0;
02134 
02135   for(text_pos=0,ctext_pos=0,i=0; (text_pos<tlen) && (i < jumpers->number); ){
02136     
02137     if (text_pos >= current_jump){ // candidate to store a jumper
02138       if ((i==0) || (jumpers->text_offsets[i-1] != text_pos)) { // it is a novel position
02139       jumpers->text_offsets[i] = text_pos; 
02140       jumpers->ctext_offsets[i] = ctext_pos;
02141       i++;
02142       }
02143       current_jump += jump_value;
02144     }
02145 
02146     // Update the current positions
02147     HToken_getLengthNext(text+text_pos,tlen-text_pos,&token_len);
02148     HCodeword_TaggedFromToken(text+text_pos,token_len,ht,&taggedcw,&taggedcwlen);
02149 
02150     text_pos += token_len;
02151     ctext_pos += taggedcwlen;
02152   }
02153 
02154   jumpers->number = i; // real number of initialized entries
02155 
02156   }
02157 
02158 }
02159 
02160 
02161 
02170 void HJumpers_tostring(char *s, Jumpers *jumpers)
02171 {
02172 
02173   int i,j;
02174 
02175   if(s == NULL) {
02176     fprintf(stderr,"Error: memory allocation (HJumpers_tostring)\n");
02177     exit(-1);  }
02178 
02179   HInt_tostring(s,jumpers->number);
02180   
02181   for(i=0,j=0;i<jumpers->number;i++){
02182     HInt_tostring(s + 4 + j,jumpers->text_offsets[i]);
02183     j += 4;
02184     HInt_tostring(s + 4 + j,jumpers->ctext_offsets[i]);
02185     j += 4;
02186   }
02187 
02188 }
02189 
02190 
02199 void HJumpers_fromstring(char *s, Jumpers *jumpers)
02200 {
02201 
02202   int i;
02203 
02204   jumpers->number = HInt_fromstring(s);
02205   s += 4;
02206 
02207   if (jumpers->number) {
02208     jumpers->text_offsets = (int *) malloc(jumpers->number * sizeof(int));
02209     jumpers->ctext_offsets =  (int *) malloc(jumpers->number * sizeof(int));
02210     
02211     if((jumpers->text_offsets == NULL) || (jumpers->ctext_offsets == NULL)){
02212       fprintf(stderr,"Error: memory allocation (HJumpers_fromstring)\n");
02213       exit(-1);  }
02214     
02215     for(i=0;i<jumpers->number;i++){
02216       jumpers->text_offsets[i]=HInt_fromstring(s);
02217       s += 4;
02218       jumpers->ctext_offsets[i]=HInt_fromstring(s);
02219       s += 4;
02220     }
02221   } else {
02222     jumpers->text_offsets = NULL;
02223     jumpers->ctext_offsets =  NULL;
02224   }    
02225 }
02226 
02227 
02239 void Get_charpos_from_bytepos(char *ctext, int ctext_len, int bytepos, int *textpos, Console *console)
02240 {
02241 
02242   int i;
02243   int current_ctextpos;
02244   char *token;
02245   int ltoken,lencw;
02246 
02247   if((ctext_len <= bytepos) || (bytepos < 0)){
02248     fprintf(stderr,"Error: uncorrect bytepos (Get_charpos_from_bytepos)\n");
02249     exit(-1);  }
02250 
02251   if((ctext[bytepos] & 0x80) == 0){
02252     fprintf(stderr,"Error: no codeword start (Get_charpos_from_bytepos)\n");
02253     exit(-1);  }
02254   
02255 
02256   for(i=0; (i < console->jumpers.number) && (console->jumpers.ctext_offsets[i]<= bytepos); i++) ;
02257   
02258   current_ctextpos = console->jumpers.ctext_offsets[i-1]; // it is a codeword start, by hypothesis
02259   *textpos = console->jumpers.text_offsets[i-1];
02260   
02261   for( ; current_ctextpos < bytepos; ) {
02262 
02263     HToken_decompressNext(ctext+current_ctextpos,ctext_len-current_ctextpos+1,
02264                          &token,&ltoken,&lencw,console);
02265     current_ctextpos += lencw;
02266     *textpos += ltoken;
02267 
02268     free(token);
02269   }
02270 
02271   if(current_ctextpos != bytepos){
02272     fprintf(stderr,"Error: no codeword start (Get_charpos_from_bytepos)\n");
02273     exit(-1);  }
02274 
02275 
02276 }
02277 
02278 
02291 void Get_bytepos_from_charpos(char *ctext, int ctext_len, int textpos, int *bytepos, Console *console)
02292 {
02293 
02294   int i;
02295   int current_textpos;
02296   char *token;
02297   int ltoken,lencw;
02298 
02299 
02300   if(textpos < 0){
02301     fprintf(stderr,"Error: negative text pos (From_textpos_to_ctextpos)\n");
02302     exit(-1);  }
02303 
02304   
02305   for(i=0; (i < console->jumpers.number) && (console->jumpers.text_offsets[i]<= textpos); i++) ;
02306   
02307   current_textpos = console->jumpers.text_offsets[i-1]; // it is a token start, by hypothesis
02308   *bytepos = console->jumpers.ctext_offsets[i-1];
02309   
02310   for( ; (current_textpos < textpos) && (*bytepos < ctext_len); ) {
02311 
02312     HToken_decompressNext(ctext + *bytepos, ctext_len - *bytepos + 1,&token,&ltoken,&lencw,console);
02313     current_textpos += ltoken; 
02314     *bytepos += lencw;
02315 
02316     free(token);
02317   }
02318 
02319   if(textpos != current_textpos){
02320     fprintf(stderr,"Error: no token beginning (From_textpos_to_ctextpos)\n");
02321     exit(-1);  }
02322 
02323 
02324 }
02325 
02326 
02327 
02328 
02329 
02330 
02331 
02332 /* ---------------------------------------------------------- */
02333 /* ----------------  Canonical data structure --------------- */
02334 /* ---------------------------------------------------------- */
02335 
02336 
02347 void HCanonical_fromtree(Hash_nodeptr_array tree_array, Canonical *cano, HHash_table *ht)
02348 {
02349   int i,j,tmp;
02350   int numcw[5];
02351   int HT_leaves;
02352   int HT_dummy_leaves, HT_nondummy_leaves;
02353   
02354 
02355   HT_dummy_leaves = 128 - (ht->card % 127);
02356   HT_nondummy_leaves = ht->card;
02357   HT_leaves = HT_nondummy_leaves + HT_dummy_leaves;
02358 
02359   /* Compute offset, numcw and first codeword per level */
02360   for(i=4;i>0;i--){
02361     numcw[i] = 0;
02362     cano->offsetcw[i] = 0;
02363     cano->firstcw[i] = 0;
02364   }
02365   
02366   // Recall that the leaves due to the sorting are allocated in
02367   // the part of the array Huff_root + [0...HT_leaves-1].
02368   // We count the number of leaves occurring at each level of the
02369   // tree.  Levels are counted from 1.
02370 
02371   for(i=0;i<HT_nondummy_leaves;i++)
02372     numcw[tree_array[i]->cw_len]++;
02373 
02374   // maximum length of the canonical code (as #7bit groups)
02375   for(cano->max_cwlen = 4 ; numcw[cano->max_cwlen] == 0; (cano->max_cwlen)--)
02376     ;
02377 
02378 
02379   // Compute the offsets and the first codewords.  Since we
02380   // counted only NON_dummy leaves, we assume that the dummys are
02381   // located leftward in the tree. Hence we set
02382   // firstcw[max_cwlen] = #dummy so that when summing the value
02383   // of count_depth[max_cwlen] we have exactly the #leaves at
02384   // the maximum level.
02385 
02386   cano->offsetcw[cano->max_cwlen] = 0;
02387   cano->firstcw[cano->max_cwlen] = HT_dummy_leaves;  
02388 
02389   for(i=cano->max_cwlen - 1;i>0;i--){
02390     cano->offsetcw[i] = cano->offsetcw[i+1]+numcw[i+1];
02391     cano->firstcw[i] = (cano->firstcw[i+1]+numcw[i+1]) >> 7; // fan-out = 128
02392   }
02393 
02394   // initializes the number of coded items
02395   cano->num_tokens=ht->card;
02396 
02397 
02398   // Two simple tricks.  
02399   //
02400   // We reset the first codeword of max length, thus pushing
02401   // the dummy leaves rightward in their level. There is no
02402   // problem in the decompression because we consider always
02403   // differences between firstcw and ranks.  And we assign
02404   // offsetcw[] = HT_leaves+1 to set that some cwlen is not
02405   // occurring
02406 
02407   cano->firstcw[cano->max_cwlen] = 0;  
02408   for(i=4;i>0;i--)
02409     if (numcw[i] == 0) cano->offsetcw[i] = HT_leaves+1;
02410 
02411   // Initializes the field codeword into the tree leaves, which are ht->card
02412   for(i=0,j=cano->max_cwlen; i<cano->num_tokens; i++){
02413 
02414     // Moves the codeword length
02415     if((j>1) && (cano->offsetcw[j-1]<=i))   j--;
02416 
02417     // Set properly the codeword and its length in bytes
02418     HCodeword_TaggedFromPlain(cano->firstcw[j]+(i-cano->offsetcw[j]),j*7,&(tree_array[i]->codeword),&tmp);
02419     tree_array[i]->cw_len = j;
02420   }
02421 
02422 }
02423 
02424 
02425 
02426 
02436 void HCanonical_fromstring(char *s, Canonical *cano)
02437 {
02438 
02439   int i;
02440 
02441   cano->max_cwlen = HInt_fromstring(s);
02442   s += 4;
02443 
02444   for(i=1;i<5;i++){
02445     cano->firstcw[i]=HInt_fromstring(s);
02446     s += 4;
02447     cano->offsetcw[i]=HInt_fromstring(s);
02448     s += 4;
02449   }
02450 
02451 }
02452 
02463 void HCanonical_tostring(char *s, Canonical *cano)
02464 {
02465 
02466   int i;
02467 
02468   HInt_tostring(s,cano->max_cwlen);
02469   s += 4;
02470 
02471   for(i=1;i<5;i++){
02472     HInt_tostring(s,cano->firstcw[i]);
02473     s += 4;
02474     HInt_tostring(s,cano->offsetcw[i]);
02475     s += 4;
02476   }
02477 
02478 }
02479 
02480 
02481 
02482 
02483 /* ------------------------------------------------------------ */
02484 /* ---------------  Routines for Huffman Tree   --------------- */
02485 /* ------------------------------------------------------------ */
02486 
02487 
02499 void Hufftree_createLeaves(Hash_nodeptr_array tree, int num_leaves, HHash_table *ht)
02500 {
02501   Hash_node *p;
02502   int i,j;
02503 
02504   // Initialize each Huffman leaf with an hash_table entry
02505   for(i=0,j=0; i<ht->card;j++){
02506     for(p = ht->table[j]; p ; p = p->next)
02507       tree[i++] = p;
02508   }
02509   // Append the dummy leaves: str = NULL and #occ = 0
02510   for(i=ht->card; i<num_leaves; i++){
02511     if ((tree[i] = (Hash_node *) malloc(sizeof(Hash_node))) == NULL){
02512       fprintf(stderr,"Error: Hufftree_createLeaves\n");
02513       exit(-1); 
02514     } else {
02515       tree[i]->str = NULL;
02516       tree[i]->len_str = 0;
02517       tree[i]->count_occ = 0;
02518     }
02519   }
02520      
02521   // Sort the leaves for decreasing frequency
02522   qsort(tree,num_leaves,sizeof(Hash_node *),HSort_for_freq);
02523 
02524 }
02525 
02526 
02540 int Hufftree_fromLeaves(Hash_nodeptr_array work_area, int *tot_nodes, int leaves)
02541 {
02542   int num_leaves,i,count,chosen_leaves;
02543   int processed, head, tail;
02544   Hash_node *p;
02545   Hash_nodeptr_array Node_queue; // Queue of internal nodes
02546 
02547   // It always points to the last empty position in the work-area
02548   // where nodes/leaves will be moved after being processed.
02549   processed = (leaves + (leaves-1)/127 + 1) -1;
02550   head = 0;
02551   tail = 0;
02552   chosen_leaves = 1;  //chooses from leaves' queue by default
02553   Node_queue=(Hash_nodeptr_array )malloc(sizeof(Hash_node *)*((leaves-1)/127+1));  
02554 
02555   if(Node_queue == NULL) {
02556     fprintf(stderr,"Out of mem: Node_queue creation in Hufftree_fromLeaves\n");
02557     exit(-1); }
02558   
02559   for(num_leaves = leaves; (num_leaves > 0) || (head != tail); ){ 
02560     
02561     // Select the smallest 128 items as children of the new node
02562     for(i = 0,count = 0; i < 128; i++) {   
02563       
02564       if (head == tail)  // Node_queue is empty
02565         chosen_leaves = 1; // Extract only leaves
02566 
02567       if(num_leaves == 0) // All leaves have been processed
02568         chosen_leaves = 0; // Extract from Node_queue
02569 
02570       if( (num_leaves > 0) && (head != tail)) {
02571         
02572         // minimize the depth
02573         if(work_area[num_leaves-1]->count_occ <= Node_queue[head]->count_occ)
02574           chosen_leaves = 1;
02575         else 
02576           chosen_leaves = 0;
02577       }
02578         
02579         
02580       if(chosen_leaves == 0){ // Extract from the Node_queue
02581         work_area[processed] = Node_queue[head];
02582         count += Node_queue[head]->count_occ;  
02583         head++;
02584         processed--;    
02585       } else {  // Extract from the leaves
02586         work_area[processed]=work_area[num_leaves-1];
02587         count += work_area[num_leaves-1]->count_occ;
02588         processed--;
02589         num_leaves--; }
02590     }
02591       
02592     // Allocate the new parent node
02593     if ((p = (Hash_node *) malloc(sizeof(Hash_node))) == NULL){
02594       fprintf(stderr,"Error: Token parsing (Hufftree_fromLeaves)\n");
02595       exit(-1); }
02596     p->str = NULL;   // it is an internal node
02597     p->len_str = 0;
02598     p->count_occ = count;
02599     p->codeword = 0;
02600     p->cw_len = 0;
02601     p->next = (Hash_node *) (processed + 1); //pointer to the leftmost child
02602     
02603     // Insertion of the new node in its correct position, if not root   
02604     if((num_leaves > 0) || (head != tail))
02605       Node_queue[tail++] = p; 
02606     else 
02607       work_area[processed] = p; // it is the root
02608   }
02609   
02610   *tot_nodes = leaves + (leaves-1)/127+1 - processed;
02611   free(Node_queue); 
02612   
02613   // returns the offset of the root node in the array
02614   return(processed);
02615 }
02616 
02617 
02633 int Hufftree_computeCwLen(Hash_nodeptr_array tree, int root, int tree_size)
02634 {   
02635   int offset_child,isleaf;
02636   int i,j;
02637   int max_length;
02638 
02639   max_length =0;
02640 
02641   // Visit the tree top-down <---> visit the array rightward
02642   for(i=0; i<tree_size; i++) {
02643     
02644     // Scan all the 128 children of a node if they do exist
02645     offset_child = (int) (tree[root+i]->next);  // get back the leftmost child
02646     
02647     // Node i is a leaf if it stores a string or is dummy
02648     isleaf = (tree[root+i]->str != NULL) || (tree[root+i]->count_occ == 0);  
02649     
02650     // Compute the depth by incrementing of one the parent's depth
02651     if (isleaf == 0){
02652       for(j=0; j<128; j++){ 
02653         if(tree[offset_child+j]->count_occ == 0){ // dummy leaf
02654           tree[offset_child+j]->cw_len = -1; // useful for next sorting step
02655         } else {
02656           tree[offset_child+j]->cw_len = (tree[root+i]->cw_len) + 1;
02657         }
02658         
02659         
02660         // Keep track of the maximum codeword length
02661         if(tree[offset_child+j]->cw_len > max_length)
02662           max_length = tree[offset_child+j]->cw_len;
02663       }
02664       
02665       tree[root+i]->cw_len = -1;  // internal node: useful for next sorting
02666     }
02667   }
02668   
02669   // Sort nodes for decreasing cw_len + alphabetically for equal
02670   // length. Internal nodes and dummy leaves go to the end because of
02671   // the -1 above.
02672   qsort(tree+root,tree_size,sizeof(Hash_node *),HSort_for_cwlen);
02673   
02674   return max_length;
02675 }
02676 
02677 
02688 void Hufftree_build(Hash_nodeptr_array *tree_array_ptr, HHash_table *ht)
02689 {
02690   Hash_node *p;
02691   int i,depth;
02692   int Huff_root;
02693   int HT_size, HT_leaves;
02694   int HT_dummy_leaves, HT_nondummy_leaves;
02695   
02696 
02697   HT_dummy_leaves = 128 - (ht->card % 127);
02698   HT_nondummy_leaves = ht->card;
02699   HT_leaves = HT_nondummy_leaves + HT_dummy_leaves;
02700 
02701   depth = 5; // dummy starting value
02702 
02703 
02704   // Allocate memory for the tree structure and dummy leaves
02705   (*tree_array_ptr) = (Hash_nodeptr_array )malloc(sizeof(Hash_node *)*
02706                                           (HT_leaves + (HT_leaves-1)/127+5));
02707 
02708   if(*tree_array_ptr == NULL) {
02709     fprintf(stderr,"Error: tree_array_ptr (Hufftree_build)\n");
02710     exit(-1);  }
02711   
02712   while (depth > 4) {
02713     
02714     // Create the leaves for the canonical tree in [0..HT_leaves] and
02715     // sort according to decreasing frequency (0 for dummy leaves)
02716     Hufftree_createLeaves(*tree_array_ptr,HT_leaves,ht);
02717 
02718     // Build the Huffman canonical tree; Huff_root is the pos in the
02719     // *tree_array_ptr where the tree nodes start. HT_size is the overall
02720     // numbet of tree nodes.
02721     Huff_root = Hufftree_fromLeaves(*tree_array_ptr,&HT_size,HT_leaves); 
02722 
02723     // Compute codeword lengths: sorted decreasing + alphabetically,
02724     // the leaves are allocated in the positions: Huff_root +
02725     // [0...HT_nondummy_leaves] in *tree_array_ptr
02726     depth = Hufftree_computeCwLen(*tree_array_ptr,Huff_root,HT_size);
02727 
02728     if(depth > 4) {
02729 
02730       for(i=0;i<HT_nondummy_leaves;i++){
02731         p = (*tree_array_ptr)[i+Huff_root]; 
02732         p->count_occ = p->count_occ / 1.618 + 1;  // golden ratio scaling
02733       }
02734     }
02735   }
02736 
02737   // Moves the pointer to the position where leaves are allocated
02738   *tree_array_ptr += Huff_root;
02739 
02740 }
02741 
02742 
02751 void Hufftree_print(Canonical *cano, Hash_nodeptr_array tree_array, int Verbose)
02752 {
02753   int i,j;
02754   int plaincw,lcw;
02755 
02756   printf("\n\n================== Huffman Tree ====================\n");
02757 
02758   printf("Number of distinct tokens %d\n",cano->num_tokens);
02759   printf("Max codeword length = %d (in groups of 7 bits)\n",cano->max_cwlen);
02760   for(i=cano->max_cwlen; i>0;i--)
02761     printf("Offset[%d] = %d, First_cw[%d] = %x\n",i,cano->offsetcw[i],i,cano->firstcw[i]);
02762   
02763   j=cano->max_cwlen;
02764 
02765   if(Verbose) {
02766 
02767     printf("\n ----- tokens -----\n\n");
02768     for(i=0; i<cano->num_tokens; i++){
02769 
02770       // Moves the codeword length
02771       if((j>1) && (cano->offsetcw[j-1]<=i))   j--;
02772       
02773       // Print useful infos
02774       printf("token = \"");
02775       HPrint_string(tree_array[i]->str,tree_array[i]->len_str);
02776       printf("\"");
02777       
02778       HCodeword_PlainFromTagged(tree_array[i]->codeword,tree_array[i]->cw_len,&plaincw,&lcw);
02779       printf(" token_len = %d, cwlen = %d (bits), untagged_cw = %x\n", tree_array[i]->len_str,plaincw,lcw);
02780     }
02781   }
02782     
02783 }
02784 
02785 
02786 
02787 
02788 
02789 
02790 
02791 
02792 /* ---------------------------------------------------------- */
02793 /* --------------------  Print functions -------------------- */
02794 /* ---------------------------------------------------------- */
02795 
02796 
/**
 * Prints the first l bytes of s on stdout.
 *
 * Printable characters are written as they are; every other byte is
 * written as its numeric value between square brackets, e.g. "[10]" for a
 * newline. The string does not need to be NUL-terminated.
 *
 * A negative length is a fatal error: a message is written to stderr and
 * the process exits with status -1.
 *
 * @param s  bytes to print
 * @param l  number of bytes to print (must be >= 0)
 */
void HPrint_string(char *s, int l)
{
  int i;

  if (l < 0) {
    fprintf(stderr,"Error: negative string length (HPrint_string)\n");
    exit(-1);
  }

  for (i = 0; i < l; i++) {
    // <ctype.h> functions require a value representable as unsigned char;
    // passing a negative plain char is undefined behaviour.
    if (isprint((unsigned char) s[i]))
      printf("%c",s[i]);
    else
      printf("[%d]",s[i]);   // non-printable byte
  }
}
02817 
02818 
02819 
02820 
02821 
02822 
02823 
02824 
02825 
02826 
02827 
02828 
02829 
02830 
02831 
02832 
02833 
02834 
02835 
02836 
02837 
02838 
02839 /* ---------------------------------------------------- */
02840 /* -----------------  Basic routines    --------------- */
02841 /* ---------------------------------------------------- */
02842 
02843 
/**
 * Stores the low 32 bits of i into s[0..3], most significant byte first.
 *
 * The value is converted to unsigned before shifting, so the result does
 * not depend on how the compiler right-shifts negative integers.
 *
 * @param s  destination buffer, at least 4 bytes long
 * @param i  value to serialize
 */
void HInt_tostring(char *s, int i)
{
  unsigned int bits = (unsigned int) i;
  int k;

  for (k = 0; k < 4; k++)
    s[k] = (char) ((bits >> (8 * (3 - k))) & 0xffu);
}
02855 
/**
 * Rebuilds an integer from the 4 bytes s[0..3], most significant byte
 * first (the inverse of HInt_tostring).
 *
 * The value is accumulated in an unsigned variable: left-shifting a signed
 * int into or past its sign bit is undefined behaviour, which the previous
 * signed accumulator hit for any value with the high bits set.
 *
 * @param s  source buffer, at least 4 bytes long
 * @return the decoded integer
 */
int HInt_fromstring(char *s)
{
  unsigned int acc = 0;
  int k;

  for (k = 0; k < 4; k++)
    acc = (acc << 8) | (unsigned int) ((unsigned char) s[k]);

  return((int) acc);
}
02870 
02871 
02872 /* ---------------------------------------------------- */
02873 /* ---------------  Sorting routines    --------------- */
02874 /* ---------------------------------------------------- */
02875 
02876 
02877 /* ---------------- HSort_for_decreasing_freq ---------------- */
02878 int HSort_for_freq(const void *va, const void *vb) {
02879   return(((Hash_nodeptr_array ) vb)[0]->count_occ - ((Hash_nodeptr_array ) va)[0]->count_occ);
02880 }
02881 
02882 /* ---------------- HSort_for_decreasing_cwlen ---------------- */
02883 int HSort_for_cwlen(const void *va, const void *vb) {
02884 
02885   // Sort by codeword length
02886   if ((((Hash_nodeptr_array ) vb)[0]->cw_len - ((Hash_nodeptr_array ) va)[0]->cw_len) != 0)
02887     return((((Hash_nodeptr_array ) vb)[0]->cw_len - ((Hash_nodeptr_array ) va)[0]->cw_len));
02888 
02889   // Sort alphabetically the alnum strings, they have the same length
02890   if(((Hash_nodeptr_array ) vb)[0]->str == NULL)
02891     return(-1);
02892   if(((Hash_nodeptr_array ) va)[0]->str == NULL)
02893     return(1);
02894   return(memcmp(((Hash_nodeptr_array ) va)[0]->str,((Hash_nodeptr_array ) vb)[0]->str,
02895                 ((Hash_nodeptr_array ) va)[0]->len_str));
02896 }
02897 
02898 

Generated on Mon Mar 31 14:44:30 2003 by doxygen 1.2.14, written by Dimitri van Heesch, © 1997-2002