Main Page   Alphabetical List   Compound List   File List   Compound Members   File Members  

HuffwordLib.c File Reference

Library of C procedures for canonical huffword (de)compression and its manipulation. The library exports a set of C functions that give access to, and allow processing of, all of its constituent parts: dictionary, codewords, tokens, etc. More...

#include "HuffwordLib.h"

Go to the source code of this file.

Functions

void Huffw_compress (char *text, int tlen, int jump_value, char **cstring, int *clen, int Verbose)
void Huffw_decompress (char *cstring, int clen, char **text, int *tlen, int Verbose)
void Huffw_spaceless_compress (char *text, int tlen, char **cstring, int *clen, int Verbose)
void Huffw_spaceless_decompress (char *cstring, int clen, char **text, int *tlen, int Verbose)
void Huffw_PrintInfo (Console *c, int ctext_len, int text_len, int cstring_len, int rule, int Verbose)
void HParse_text (char *text, int text_len, HHash_table *ht)
void HCompress_getBody (char *text, int text_len, HHash_table *ht, char **ctext, int *ctext_len)
void HCompress_getBodyAndConsole (char *text, int tlen, int jump_value, char **body, int *blen, Console *console)
void HCompress_getString (char **ctext, int *ctext_len, int text_len, char *body, int blen, Console *console)
void HCompress_contractSpaces (char *body, int blen, char **body_spaceless, int *blen_spaceless, Console *console)
void HDecompress_getBodyAndConsole (char *ctext, int ctext_len, int *text_len, char **body, int *blen, Console *console)
void HDecompress_expandSpaces (char *body_spaceless, int blen_spaceless, char **body, int *blen, Console *console)
void HDecompress_nextBlock_bytes (char *ctext, int num_bytes, char **text, int *text_len, Console *console)
void HDecompress_nextBlock_bytes_spaceless (char *ctext, int num_bytes, char **text, int *text_len, Console *console)
void HDecompress_previousBlock_bytes (char *ctext, int num_bytes, char **text, int *text_len, Console *console)
void HDecompress_previousBlock_bytes_spaceless (char *ctext, int num_bytes, char **text, int *text_len, Console *console)
void HDecompress_nextBlock_tokens (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console)
void HDecompress_nextBlock_tokens_spaceless (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console)
void HDecompress_previousBlock_tokens (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console)
void HDecompress_previousBlock_tokens_spaceless (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console)
void HToken_RankFromPlainCw (int plaincw, int plaincwlen, Canonical *cano, int *token_rank)
void HToken_RankFromTaggedCw (int taggedcw, int taggedcwlen, Canonical *cano, int *token_rank)
void HToken_fromTaggedCw (int taggedcw, int taggedcwlen, Canonical *cano, Dictionary *dict, char **token, int *token_len)
int HToken_decompressNext (char *s, int num_byte_left, char **token, int *lentoken, int *lencw, Console *console)
int HToken_decompressPrevious (char *s, int num_byte_left, char **token, int *lentoken, int *lencw, Console *console)
void HToken_getLengthNext (char *s, int num_char_left, int *len)
void HCodeword_PlainFromTagged (int taggedcw, int taggedcwlen, int *plaincw, int *plaincwlen)
void HCodeword_PlainFromTokenrank (int token_rank, Canonical *cano, int *plaincw, int *plaincwlen)
void HCodeword_TaggedFromPlain (int plaincw, int plaincwlen, int *taggedcw, int *taggedcwlen)
void HCodeword_TaggedFromTokenrank (int token_rank, Canonical *cano, int *taggedcw, int *taggedcwlen)
void HCodeword_TaggedFromToken (char *token, int token_len, HHash_table *ht, int *taggedcw, int *taggedcwlen)
void HCodeword_TaggedGetNext (char *s, int num_byte_left, int *taggedcw, int *lencw)
int HCodeword_TaggedGetNextLength (char *s, int num_byte_left)
void HCodeword_TaggedGetPrevious (char *s, int num_byte_left, int *taggedcw, int *lencw)
void HCodeword_tostring (int taggedcw, int taggedcwlen, char **s)
void HHashtable_init (HHash_table *ht, int n)
void HHashtable_print (HHash_table *ht)
int HHashtable_func (char *s, int len, HHash_table *ht)
Hash_node * HHashtable_search (char *s, int slen, HHash_table *ht)
int HHashtable_insert (char *s, int slen, HHash_table *ht)
void HHashtable_clear (HHash_table *ht)
void HHashtable_fromdict (Dictionary *dict, Canonical *cano, HHash_table *ht)
void HDictionary_fromstring (char *s, int slen, int stokens, Dictionary *dict)
void HDictionary_fromtree (Hash_nodeptr_array tree, int num_tokens, Dictionary *dict)
void HDictionary_print (Dictionary *dict, HHash_table *ht, int Verbose)
void HJumpers_fromds (char *text, int tlen, HHash_table *ht, int jump_value, Jumpers *jumpers)
void HJumpers_tostring (char *s, Jumpers *jumpers)
void HJumpers_fromstring (char *s, Jumpers *jumpers)
void Get_charpos_from_bytepos (char *ctext, int ctext_len, int bytepos, int *textpos, Console *console)
void Get_bytepos_from_charpos (char *ctext, int ctext_len, int textpos, int *bytepos, Console *console)
void HCanonical_fromtree (Hash_nodeptr_array tree_array, Canonical *cano, HHash_table *ht)
void HCanonical_fromstring (char *s, Canonical *cano)
void HCanonical_tostring (char *s, Canonical *cano)
void Hufftree_createLeaves (Hash_nodeptr_array tree, int num_leaves, HHash_table *ht)
int Hufftree_fromLeaves (Hash_nodeptr_array work_area, int *tot_nodes, int leaves)
int Hufftree_computeCwLen (Hash_nodeptr_array tree, int root, int tree_size)
void Hufftree_build (Hash_nodeptr_array *tree_array_ptr, HHash_table *ht)
void Hufftree_print (Canonical *cano, Hash_nodeptr_array tree_array, int Verbose)
void HPrint_string (char *s, int l)
void HInt_tostring (char *s, int i)
int HInt_fromstring (char *s)
int HSort_for_freq (const void *va, const void *vb)
int HSort_for_cwlen (const void *va, const void *vb)


Detailed Description

Library of C procedures for canonical huffword (de)compression and its manipulation. The library exports a set of C functions that give access to, and allow processing of, all of its constituent parts: dictionary, codewords, tokens, etc.

Author:
Paolo Ferragina, Dipartimento di Informatica, Pisa (Italy).
Date:
Version 1.0, March 2003.
This file is licensed under LGPL terms (see file LICENSE)

Definition in file HuffwordLib.c.


Function Documentation

void Get_bytepos_from_charpos char *    ctext,
int    ctext_len,
int    textpos,
int *    bytepos,
Console   console
 

Computes the byte position (counting from 0) in the compressed text corresponding to a given char position (counting from 0) into the uncompressed text. We assume that the text position refers to the beginning of a token.

Parameters:
ctext  pointer to the compressed text.
ctext_len  length in bytes of the compressed text.
textpos  a text position corresponding to the beginning of a token.
bytepos  pointer to an integer that will contain the byte position to be computed.
console  pointer to the data structure containing all the compression infos.

Definition at line 2291 of file HuffwordLib.c.

void Get_charpos_from_bytepos char *    ctext,
int    ctext_len,
int    bytepos,
int *    textpos,
Console   console
 

Computes the text position (counting from 0) corresponding to a given byte position (counting from 0) in the compressed text. We assume that the byte position corresponds to the beginning of a codeword.

Parameters:
ctext  string containing the compressed text.
ctext_len  length in bytes of the compressed text.
bytepos  byte position corresponding to the beginning of a codeword.
textpos  pointer to the integer that will contain the text position corresponding to bytepos.
console  pointer to the data structure containing all the compression infos.

Definition at line 2239 of file HuffwordLib.c.

void HCanonical_fromstring char *    s,
Canonical   cano
 

Computes the canonical infos from a string, assuming the sequence max_cwlen,firstcw[1],offsetcw[1], .., firstcw[4],offsetcw[4]. Since they are just 9 integers (36 bytes) we do not use any succinct encoding nor summarization.

Parameters:
s  string containing the serialized canonical information.
cano  pointer to the data structure where this information will be loaded.

Definition at line 2436 of file HuffwordLib.c.

void HCanonical_fromtree Hash_nodeptr_array    tree_array,
Canonical   cano,
HHash_table   ht
 

Computes the canonical huffman infos and updates the hash table by setting for each token its byte-aligned and tagged codeword and its codeword length in bytes.

Parameters:
tree_array  pointer to the array of huffman tree leaves, kept as pointers to Hash nodes.
cano  pointer to the data structure that will contain the canonical information.
ht  pointer to the hash table containing all the tokens (this is updated with the tagged codewords and their lengths).

Definition at line 2347 of file HuffwordLib.c.

void HCanonical_tostring char *    s,
Canonical   cano
 

Stores the canonical infos into a (preallocated) string s, assuming the sequence max_cwlen,firstcw[1],offsetcw[1], ...., firstcw[4],offsetcw[4]. Since they are just 9 integers we do not use any succinct encoding nor summarization. The procedure writes only the first 36 bytes of s.

Parameters:
s  preallocated string of length at least 36 chars, here the canonical infos will be stored.
cano  pointer to the data structure containing the infos to be written in s.

Definition at line 2463 of file HuffwordLib.c.

void HCodeword_PlainFromTagged int    taggedcw,
int    taggedcwlen,
int *    plaincw,
int *    plaincwlen
 

Computes the plain codeword corresponding to an input tagged and byte-aligned codeword.

Parameters:
taggedcw  byte-aligned and tagged codeword to be translated.
taggedcwlen  number of bytes constituting the above taggedcw.
plaincw  pointer to an integer that will contain the computed plain codeword.
plaincwlen  pointer to the integer that will contain the length in bits of the plain codeword.

Definition at line 1443 of file HuffwordLib.c.

void HCodeword_PlainFromTokenrank int    token_rank,
Canonical   cano,
int *    plaincw,
int *    plaincwlen
 

Computes the plain codeword of a token from its rank in the dictionary. If some codeword length is absent then its firstcw[]=+infty.

Parameters:
token_rank  position in the dictionary of the input token (starts from 0).
cano  pointer to a data structure containing the canonical infos.
plaincw  pointer to an integer that will contain the computed plain codeword.
plaincwlen  pointer to the integer that will contain the length in bits of the plain codeword.

Definition at line 1472 of file HuffwordLib.c.

void HCodeword_TaggedFromPlain int    plaincw,
int    plaincwlen,
int *    taggedcw,
int *    taggedcwlen
 

Computes the tagged and byte-aligned codeword from its plain counterpart.

Parameters:
plaincw  input plain codeword to be translated.
plaincwlen  length in bits of the plain codeword.
taggedcw  pointer to the integer that will contain the output tagged codeword.
taggedcwlen  pointer to the integer that will contain the length in bytes of the tagged codeword.

Definition at line 1497 of file HuffwordLib.c.

void HCodeword_TaggedFromToken char *    token,
int    token_len,
HHash_table   ht,
int *    taggedcw,
int *    taggedcwlen
 

Computes the tagged and byte-aligned codeword for the passed token.

Parameters:
token  pointer to the string containing the input token.
token_len  length of the input token.
ht  pointer to the hash table containing the parsed tokens.
taggedcw  pointer to the integer that will contain the output tagged codeword.
taggedcwlen  pointer to the length in bytes of the computed tagged codeword.

Definition at line 1553 of file HuffwordLib.c.

void HCodeword_TaggedFromTokenrank int    token_rank,
Canonical   cano,
int *    taggedcw,
int *    taggedcwlen
 

Computes the byte-aligned and tagged codeword of a token given its rank in the dictionary (starts from 0). Recall that if some codeword length is absent then its firstcw[]=+infty.

Parameters:
token_rank  position in the dictionary of a token.
cano  pointer to a data structure containing the canonical infos.
taggedcw  pointer to the integer that will contain the computed tagged codeword.
taggedcwlen  pointer to the integer that will contain the length in bytes of the tagged codeword.

Definition at line 1525 of file HuffwordLib.c.

void HCodeword_TaggedGetNext char *    s,
int    num_byte_left,
int *    taggedcw,
int *    lencw
 

Reads a sequence of bytes corresponding to the next tagged codeword in s. Its length, in bytes, is returned in lencw; the codeword is assigned to *taggedcw and allocated into its least significant bytes.

Parameters:
s  string containing the tagged codeword to be read.
num_byte_left  number of bytes remaining in s.
taggedcw  pointer to the integer that will contain the tagged codeword (right-aligned, i.e. stored in its least significant bytes).
lencw  pointer to the integer containing the codeword length (in bytes).

Definition at line 1578 of file HuffwordLib.c.

int HCodeword_TaggedGetNextLength char *    s,
int    num_byte_left
 

Returns the length of the tagged codeword starting at s.

Parameters:
s  string containing the tagged codeword to be read.
num_byte_left  number of bytes remaining in s.

Definition at line 1609 of file HuffwordLib.c.

void HCodeword_TaggedGetPrevious char *    s,
int    num_byte_left,
int *    taggedcw,
int *    lencw
 

Reads the sequence of bytes preceding the one pointed to by s and corresponding to the previous tagged codeword. The codeword length is returned in lencw (it is smaller than 4); the codeword is assigned to *taggedcw and allocated into its least significant bytes.

Parameters:
s  pointer to a char position, we need to read backward from it.
num_byte_left  number of bytes remaining to the left of s.
taggedcw  pointer to the integer that will contain the tagged codeword (right-aligned, i.e. stored in its least significant bytes).
lencw  pointer to the integer containing the codeword length (in bytes).

Definition at line 1646 of file HuffwordLib.c.

void HCodeword_tostring int    taggedcw,
int    taggedcwlen,
char **    s
 

Transforms a (tagged) codeword into a string of bytes (ended by \0).

Parameters:
taggedcw  the input tagged codeword.
taggedcwlen  its length in bytes.
s  memory address of the pointer to the output string.

Definition at line 1682 of file HuffwordLib.c.

void HCompress_contractSpaces char *    body,
int    blen,
char **    body_spaceless,
int *    blen_spaceless,
Console   console
 

Removes the single spaces which occur between two consecutive alphanumeric tokens. This is the so-called Spaceless model.

Parameters:
body  pointer to the compressed body string.
blen  length of the compressed body.
body_spaceless  address of the pointer to the body compressed according to Spaceless model.
blen_spaceless  length of the spaceless body string.
console  pointer to the data structure containing all the compression info.

Definition at line 511 of file HuffwordLib.c.

void HCompress_getBody char *    text,
int    text_len,
HHash_table   ht,
char **    ctext,
int *    ctext_len
 

Constructs the byte-aligned and tagged Huffword string, exploiting the tagged and byte-aligned codewords available in the hash-table data structure.

Parameters:
text  pointer to the string to be compressed.
text_len  length of the string to be compressed.
ht  pointer to the hash table containing the coded tokens and their codewords.
ctext  address of the string pointer that will refer to the compressed string.
ctext_len  address of the integer that will contain the length of the compressed string.

Definition at line 317 of file HuffwordLib.c.

void HCompress_getBodyAndConsole char *    text,
int    tlen,
int    jump_value,
char **    body,
int *    blen,
Console   console
 

Builds all the data structures needed for the Huffword compression. Specifically, it builds the Huffman tree, then computes the Canonical data structure and the Dictionary, and derives the compressed body string as well as the Jumpers, if jump_value is greater than zero.

Parameters:
text  pointer to the text string to be compressed.
tlen  text length.
jump_value  average number of chars to skip between consecutive jumpers (if 0, no jumpers).
body  address of the pointer that will contain the compressed body string (mmapped from ctext).
blen  pointer to the integer that will contain the length of the compressed body.
console  pointer to the data structure containing all the compression info.

Definition at line 387 of file HuffwordLib.c.

void HCompress_getString char **    ctext,
int *    ctext_len,
int    text_len,
char *    body,
int    blen,
Console   console
 

Composes the entire compressed string, allocating it from scratch. The result is pointed to by *ctext and has length *ctext_len. Recall that offsetcw[i] is ideally set to +infty if no codeword of length 'i' is present in the dictionary. The composed string is formed as follows:

  • Compression type (1 byte), 0 indicates 'plain' and 1 'spaceless';
  • Text length (4 bytes);
  • Number of tokens in the dictionary (4 bytes);
  • Max codeword length (4 bytes);
  • Firstcw[1], Offsetcw[1], ..., Firstcw[4], Offsetcw[4] (32 bytes);
  • Number of jumpers (4 bytes);
  • Array of text and ctext jumpers (8 bytes each pair), alternating [possibly absent];
  • The dictionary length (4 bytes);
  • The dictionary content, as a string of tokens separated by 'newline';
  • Compressed text length (4 bytes);
  • Compressed text.
Parameters:
ctext  address of the pointer that will refer to the compressed file string, to be computed.
ctext_len  pointer to the integer that will contain the length of the compressed file.
text_len  length of the text to be compressed.
body  pointer to the compressed text (already computed).
blen  length of the compressed text.
console  pointer to the data structure containing all the compression infos.

Definition at line 449 of file HuffwordLib.c.

void HDecompress_expandSpaces char *    body_spaceless,
int    blen_spaceless,
char **    body,
int *    blen,
Console   console
 

Re-inserts the single spaces that were dropped during the Spaceless compression. The programmer must take care to apply this procedure only when the text was compressed with the Spaceless model.

Parameters:
body_spaceless  pointer to the Spaceless compressed body.
blen_spaceless  length of the spaceless body string.
body  address of the pointer that will contain the compressed body string (with spaces).
blen  pointer to the integer that will contain the length of the compressed body.
console  pointer to the data structure containing all the compression info.

Definition at line 674 of file HuffwordLib.c.

void HDecompress_getBodyAndConsole char *    ctext,
int    ctext_len,
int *    text_len,
char **    body,
int *    blen,
Console   console
 

Initializes all data structures and the Console, loading their content from the compressed string (including the header, dictionary, ...). Recall that offsetcw[i] > tokens if no codeword of length i exists. The hash table contains, for each token, its corresponding tagged codeword and its length in bytes.

Parameters:
ctext  pointer to the string that contains the compressed file string.
ctext_len  length of the compressed file string.
text_len  pointer to the integer that will contain the length of the uncompressed text.
body  address of the pointer that will contain the compressed body string (mmapped from ctext).
blen  pointer to the integer that will contain the length of the compressed body.
console  pointer to the data structure containing all the compression info.

Definition at line 609 of file HuffwordLib.c.

void HDecompress_nextBlock_bytes char *    ctext,
int    num_bytes,
char **    text,
int *    text_len,
Console   console
 

Decompresses a piece of compressed text for the specified number of bytes. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes actually exists. This procedure is not suited to the spaceless model (i.e. first reserved byte is 1): in that case the decompressed string lacks the spaces between alphanumeric tokens, and the programmer must use the corresponding spaceless procedure instead.

Parameters:
ctext  pointer to the block to be decompressed.
num_bytes  number of bytes to decompress.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
console  pointer to the data structure that contains all the compression infos.

Definition at line 761 of file HuffwordLib.c.

void HDecompress_nextBlock_bytes_spaceless char *    ctext,
int    num_bytes,
char **    text,
int *    text_len,
Console   console
 

Decompresses a piece of compressed text for the specified number of bytes in the Spaceless model. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes actually exists and that the text was compressed with the spaceless model (i.e. first reserved byte is 1).

Parameters:
ctext  pointer to the block to be decompressed.
num_bytes  number of bytes to decompress.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
console  pointer to the data structure that contains all the compression infos.

Definition at line 827 of file HuffwordLib.c.

void HDecompress_nextBlock_tokens char *    ctext,
int    bytes_left,
int    num_obj,
char **    text,
int *    text_len,
int *    decoded,
Console   console
 

Decompresses a piece of compressed text for a specified number of alphanumeric tokens. The Console data structure must be initialized. This procedure is not suited to the spaceless model (i.e. first reserved byte is 1): in that case the decompressed string lacks the spaces between alphanumeric tokens, and the programmer must use the corresponding spaceless procedure instead.

Parameters:
ctext  pointer to the block to be decompressed.
bytes_left  number of bytes yet available to be decompressed.
num_obj  number of alphanumeric tokens to be decompressed.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
decoded  address of the integer indicating the number of decompressed bytes.
console  pointer to the data structure that contains all the compression infos.

Definition at line 1004 of file HuffwordLib.c.

void HDecompress_nextBlock_tokens_spaceless char *    ctext,
int    bytes_left,
int    num_obj,
char **    text,
int *    text_len,
int *    decoded,
Console   console
 

Decompresses a piece of compressed text for a specified number of alphanumeric tokens in the spaceless model. The Console data structure must be initialized. The programmer must ensure that the compression type is 'spaceless' (i.e. first reserved byte is 1).

Parameters:
ctext  pointer to the block to be decompressed.
bytes_left  number of bytes yet available to be decompressed.
num_obj  number of alphanumeric tokens to be decompressed.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
decoded  address of the integer indicating the number of decompressed bytes.
console  pointer to the data structure that contains all the compression infos.

Definition at line 1061 of file HuffwordLib.c.

void HDecompress_previousBlock_bytes char *    ctext,
int    num_bytes,
char **    text,
int *    text_len,
Console   console
 

Decompresses a piece of compressed text lying to the left of "ctext" for a specified number of bytes. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes actually exists. This procedure is not suited to the spaceless model (i.e. first reserved byte is 1): in that case the decompressed string lacks the spaces between alphanumeric tokens, and the programmer must use the corresponding spaceless procedure instead.

Parameters:
ctext  pointer to the block to be decompressed.
num_bytes  number of bytes to decompress.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
console  pointer to the data structure that contains all the compression infos.

Definition at line 875 of file HuffwordLib.c.

void HDecompress_previousBlock_bytes_spaceless char *    ctext,
int    num_bytes,
char **    text,
int *    text_len,
Console   console
 

Decompresses a piece of compressed text to the left of the "ctext" pointer for "num_bytes" bytes in the spaceless model. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes actually exists and that the string has been compressed via the 'spaceless model' (i.e. first reserved byte is 1).

Parameters:
ctext  pointer to the block to be decompressed.
num_bytes  number of bytes to decompress.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
console  pointer to the data structure that contains all the compression infos.

Definition at line 952 of file HuffwordLib.c.

void HDecompress_previousBlock_tokens char *    ctext,
int    bytes_left,
int    num_obj,
char **    text,
int *    text_len,
int *    decoded,
Console   console
 

Decompresses the piece of compressed text that precedes "ctext", for a specified number of alphanumeric tokens. The Console data structure must be initialized. This procedure is not suited to the spaceless model (i.e. first reserved byte is 1): in that case the decompressed string lacks the spaces between alphanumeric tokens, and the programmer must use the corresponding spaceless procedure instead.

Parameters:
ctext  pointer to the block to be decompressed.
bytes_left  number of bytes yet available to be decompressed to the left of "ctext".
num_obj  number of alphanumeric tokens to be decompressed.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
decoded  address of the integer indicating the number of decompressed bytes.
console  pointer to the data structure that contains all the compression infos.

Definition at line 1115 of file HuffwordLib.c.

void HDecompress_previousBlock_tokens_spaceless char *    ctext,
int    bytes_left,
int    num_obj,
char **    text,
int *    text_len,
int *    decoded,
Console   console
 

Decompresses the piece of compressed text that precedes "ctext", for a specified number of alphanumeric tokens in the Spaceless model. The Console data structure must be initialized. The programmer must ensure that the compression type is 'spaceless' (i.e. first reserved byte is 1).

Parameters:
ctext  pointer to the block to be decompressed.
bytes_left  number of bytes yet available to be decompressed to the left of "ctext".
num_obj  number of alphanumeric tokens to be decompressed.
text  address of the pointer to the decompressed block (allocated by the procedure).
text_len  address of the integer that will contain the length of the decompressed block.
decoded  address of the integer indicating the number of decompressed bytes.
console  pointer to the data structure that contains all the compression infos.

Definition at line 1176 of file HuffwordLib.c.

void HDictionary_fromstring char *    s,
int    slen,
int    stokens,
Dictionary   dict
 

Builds the dictionary data structure from a given string containing the tokens separated by 'newline' (the last token is also terminated by 'newline'). The string must be kept allocated, since the CONTENT field refers to it.

Parameters:
s  string containing the tokens separated by 'newline' (even the last one).
slen  length of the string s.
stokens  number of tokens in the string s.
dict  pointer to the Dictionary data structure to be built.

Definition at line 1954 of file HuffwordLib.c.

void HDictionary_fromtree Hash_nodeptr_array    tree,
int    num_tokens,
Dictionary   dict
 

Computes the dictionary data structure from the leaves of a Huffman tree, which are ordered by codeword length and alphabetically.

Parameters:
tree  pointer to an array of Hash_node pointers corresponding to the Huffman tree leaves.
num_tokens  number of non-dummy leaves.
dict  pointer to the Dictionary data structure to be built.

Definition at line 1998 of file HuffwordLib.c.

void HDictionary_print Dictionary   dict,
HHash_table   ht,
int    Verbose
 

Prints the dictionary content and, if Verbose = 1, also the codeword of every token, according to the format token:cw cwlen.

Parameters:
dict  pointer to the Dictionary data structure to print.
ht  pointer to the hash table containing the parsed tokens and their codewords.
Verbose  set to 1 if all those infos must be printed out. 0 just general Dictionary info.

Definition at line 2058 of file HuffwordLib.c.

void HHashtable_clear HHash_table   ht
 

Frees all elements of a hashtable. After this call, ht is an empty, uninitialized HHash_table.

Parameters:
ht  pointer to a HHash_table

Definition at line 1871 of file HuffwordLib.c.

void HHashtable_fromdict Dictionary   dict,
Canonical   cano,
HHash_table   ht
 

Computes an index data structure for the dictionary tokens, based on a hash table and the canonical info; each entry of the hash table contains also the corresponding tagged and byte-aligned codeword and its length (as bytes).

Parameters:
dict  pointer to a Dictionary data structure (initialized).
cano  pointer to the data structure containing the canonical infos (initialized).
ht  pointer to an (empty) hash table, it will be initialized.

Definition at line 1900 of file HuffwordLib.c.

int HHashtable_func char *    s,
int    len,
HHash_table   ht
 

Computes the hash value for the given string. The function was proposed by Ramakrishna and Zobel in a paper that appeared in: Int. Conf. on Database Systems for Advanced Applications, 1997.

Parameters:
s  input string whose hash value has to be computed.
len  length of s.
ht  pointer to a hash table.

Definition at line 1776 of file HuffwordLib.c.

void HHashtable_init HHash_table   ht,
int    n
 

Initializes the hash table according to the estimated number of tokens; the load factor is set to 10 and the lists are managed via the MTF-rule.

Parameters:
ht  pointer to an (empty) hash table.
n  estimated number of items to be inserted.

Definition at line 1722 of file HuffwordLib.c.

int HHashtable_insert char *    s,
int    slen,
HHash_table   ht
 

Inserts the token in the hash table and returns 1 if new, 0 otherwise; it also updates the counter of occurrences for that token.

Parameters:
s  string to be inserted.
slen  length of s (to manage also the NULL char).
ht  pointer to a hash table, it will be updated.

Definition at line 1815 of file HuffwordLib.c.

void HHashtable_print HHash_table   ht
 

Prints the content of the hash table ht: token, length, occurrence count, tagged cw and its length.

Parameters:
ht  pointer to the hash table to be printed out.

Definition at line 1747 of file HuffwordLib.c.

Hash_node* HHashtable_search char *    s,
int    slen,
HHash_table   ht
 

Searches for the given string in the passed hash table (returns NULL if not found).

Parameters:
s  string to be searched.
slen  length of s (to manage also the NULL char).
ht  pointer to the hash table to be searched.

Definition at line 1795 of file HuffwordLib.c.

int HInt_fromstring char *    s
 

Reads the first 4 bytes of string s and interprets them as an integer.

Parameters:
s  input string.

Definition at line 2859 of file HuffwordLib.c.

void HInt_tostring char *    s,
int    i
 

Stores the integer "i" into the first 4 bytes of "s".

Parameters:
s  output string.
i  integer to be serialized

Definition at line 2848 of file HuffwordLib.c.

void HJumpers_fromds char *    text,
int    tlen,
HHash_table   ht,
int    jump_value,
Jumpers   jumpers
 

Computes the jumper data structure according to a number of skipped chars. Since a skip may end up inside a word, we need to store two integers for each jumper: the offset in the uncompressed text and the offset in the compressed text.

Parameters:
text  pointer to the text string.
tlen  text length.
ht  pointer to the hash table containing the text tokens.
jump_value  number of chars to be skipped by every jumper pointer.
jumpers  pointer to the Jumper data structure to be built.

Definition at line 2112 of file HuffwordLib.c.

void HJumpers_fromstring char *    s,
Jumpers   jumpers
 

Loads from the string s the number of jumpers, followed by the offsets in the text and the offsets in the compressed text, stored alternately.

Parameters:
s  string containing the jumpers, it starts with their number.
jumpers  pointer to the Jumper data structure to be built.

Definition at line 2199 of file HuffwordLib.c.

void HJumpers_tostring char *    s,
Jumpers   jumpers
 

Serializes the Jumper data structure by storing in the string s the offsets in the text and the offsets in the compressed text, alternately; it prepends to the string s the number of jumpers it has written.

Parameters:
s  output string.
jumpers  pointer to the Jumper data structure to be serialized.

Definition at line 2170 of file HuffwordLib.c.

void HParse_text char *    text,
int    text_len,
HHash_table   ht
 

Parses the text using the procedure HToken_getLengthNext() and fills in the hash table with items containing, for each token, its number of occurrences in the text (count_occ), the token string (str) and its length (len_str). This information will then be used by the procedure that constructs the Huffman tree. The programmer has to take care of the initialization of the hash table, via HHashtable_init(). This way, it is possible to parse a sequence of texts and build one unique Huffman tree for them.

Parameters:
text  pointer to the string to be parsed.
text_len  length of the string to be parsed.
ht  pointer to the hash table that will contain the parsed tokens and their countings.

Definition at line 273 of file HuffwordLib.c.

void HPrint_string char *    s,
int    l
 

Prints the first l chars of the string s by using [x] to denote an unprintable ASCII value x.

Parameters:
s  string to be printed.
l  length of the prefix to print.

Definition at line 2803 of file HuffwordLib.c.

int HSort_for_cwlen const void *    va,
const void *    vb
 

Definition at line 2883 of file HuffwordLib.c.

int HSort_for_freq const void *    va,
const void *    vb
 

Definition at line 2878 of file HuffwordLib.c.

int HToken_decompressNext char *    s,
int    num_byte_left,
char **    token,
int *    lentoken,
int *    lencw,
Console   console
 

Decompresses the next token starting from the byte position indicated by s. It also computes the length of this token and the length (in bytes) of the decompressed codeword. The procedure returns 0 if the operation was applied on <=0 bytes, otherwise it returns 1.

Parameters:
s  pointer to the byte position where the decompression must start.
num_byte_left  number of bytes to the right of s.
token  address of the string pointer that will contain the decompressed token.
lentoken  address of the integer that will contain the length of the token.
lencw  address of the integer that will contain the byte length of the decompressed codeword.
console  pointer to the data structure containing all the compression infos.

Definition at line 1333 of file HuffwordLib.c.

int HToken_decompressPrevious char *    s,
int    num_byte_left,
char **    token,
int *    lentoken,
int *    lencw,
Console   console
 

Decompresses the token whose codeword precedes the byte position indicated by s. It also computes the length of this token and the length (in bytes) of the decompressed codeword. The procedure returns 0 if the operation was applied on <=0 bytes, otherwise it returns 1.

Parameters:
s  pointer to the byte position where the decompression must start.
num_byte_left  number of bytes to the left of s.
token  address of the string pointer that will contain the decompressed token.
lentoken  address of the integer that will contain the length of the token.
lencw  address of the integer that will contain the byte length of the decompressed codeword.
console  pointer to the data structure containing all the compression infos.

Definition at line 1366 of file HuffwordLib.c.

void HToken_fromTaggedCw int    taggedcw,
int    taggedcwlen,
Canonical   cano,
Dictionary   dict,
char **    token,
int *    token_len
 

Determines the token corresponding to a given tagged codeword.

Parameters:
taggedcw  tagged codeword of a token.
taggedcwlen  length in bytes of that codeword.
cano  pointer to a data structure containing the canonical infos.
dict  pointer to a Dictionary data structure.
token  pointer to the retrieved token.
token_len  pointer to its computed length.

Definition at line 1290 of file HuffwordLib.c.

void HToken_getLengthNext char *    s,
int    num_char_left,
int *    len
 

Returns the length of a token, defined as either a sequence of letters and numbers, or a sequence of separators other than 'newline', or a single 'newline'. This distinction is introduced because the Dictionary string is formed by putting one token per line; hence, if 'newline' occurs inside a token, then problems may arise.

Parameters:
s  pointer to the beginning of the next token in the parsed text.
num_char_left  number of chars to be scanned in s.
len  pointer to the integer that will contain the length of the parsed token.

Definition at line 1397 of file HuffwordLib.c.

void HToken_RankFromPlainCw int    plaincw,
int    plaincwlen,
Canonical   cano,
int *    token_rank
 

Computes the position in the dictionary of the token having the passed plain codeword. Recall that the dictionary is ordered by decreasing codeword length and alphabetically among equally-long codewords.

Parameters:
plaincw  plain codeword of a token.
plaincwlen  length in bits of that codeword.
cano  pointer to a data structure containing the canonical infos.
token_rank  pointer to the integer that will contain the token position in the dictionary.

Definition at line 1235 of file HuffwordLib.c.

void HToken_RankFromTaggedCw int    taggedcw,
int    taggedcwlen,
Canonical   cano,
int *    token_rank
 

Computes the position in the dictionary of the token having the passed tagged codeword. Recall that the dictionary is ordered by decreasing codeword length and alphabetically among equally-long codewords.

Parameters:
taggedcw  tagged codeword of a token.
taggedcwlen  length in bytes of that codeword.
cano  pointer to a data structure containing the canonical infos.
token_rank  pointer to the integer that will contain the token position in the dictionary.

Definition at line 1260 of file HuffwordLib.c.

void Hufftree_build Hash_nodeptr_array   tree_array_ptr,
HHash_table   ht
 

Constructs the Huffman tree with a fan-out 128, over the set of tokens contained into the hash table HT. It returns the pointer to the array of tree leaves ordered by decreasing codeword length and alphabetically among equally-long codewords.

Parameters:
tree_array_ptr  pointer to the Huffman tree to be built.
ht  pointer to the hash table containing the parsed tokens (with their frequency).

Definition at line 2688 of file HuffwordLib.c.

int Hufftree_computeCwLen Hash_nodeptr_array    tree,
int    root,
int    tree_size
 

Computes the codeword lengths by visiting the Huffman tree top-down. The depth of a node is propagated to its children, incremented by one; these children are stored contiguously and the position of the leftmost child is kept in the field "next" of the Hash_node pointed to by a tree node. At the end, the leaves are stored beginning at position "root", sorted by decreasing codeword length and, among equally long codewords, in alphabetical order. The procedure returns the maximum codeword length just computed.

Parameters:
tree  array of pointers to Hash_nodes containing the parsed tokens and representing the tree leaves.
root  position in the array "tree" where the root node is allocated.
tree_size  overall number of tree nodes.

Definition at line 2633 of file HuffwordLib.c.

void Hufftree_createLeaves Hash_nodeptr_array    tree,
int    num_leaves,
HHash_table   ht
 

Uses a hash table containing all the parsed tokens, an array of pointers to the leaves of the Huffman tree (to be constructed), and an integer indicating how many leaves should be formed, including the dummy ones, whose count_occ is set to 0 and str = NULL.

Parameters:
tree  pointer to an array of Hash_node pointers (it will be initialized).
num_leaves  total number of tree leaves, including the dummy ones.
ht  pointer to the hash table containing the parsed tokens.

Definition at line 2499 of file HuffwordLib.c.

int Hufftree_fromLeaves Hash_nodeptr_array    work_area,
int *    tot_nodes,
int    leaves
 

Builds a Huffman tree with fan-out 128 starting from the array of leaves (pointers to Hash_nodes) stored in work_area. The algorithm uses Knuth's trick, which exploits two queues kept sorted by increasing frequency of nodes. The leaves are initially allocated at the beginning of work_area; processed nodes are moved to the end of work_area. The procedure eventually returns the position in work_area where the root of the built Huffman tree is allocated.

Parameters:
work_area  an array of pointers to Hash_nodes, the ones containing the tokens.
tot_nodes  overall number of nodes in the Huffman tree.
leaves  overall number of leaves in the Huffman tree.

Definition at line 2540 of file HuffwordLib.c.

void Hufftree_print Canonical   cano,
Hash_nodeptr_array    tree_array,
int    Verbose
 

Print infos about the Canonical Huffword code. Recall that we have offsetcw[] > num_tokens if the corresponding cwlen is not occurring.

Parameters:
cano  pointer to the Canonical data structure to print.
tree_array  array of pointers to Hash-nodes corresponding to the Huffman tree leaves.
Verbose  set to 1 to print also the tokens and their codewords.

Definition at line 2751 of file HuffwordLib.c.

void Huffw_compress char *    text,
int    tlen,
int    jump_value,
char **    cstring,
int *    clen,
int    Verbose
 

Compresses the text string by using Compose_compressed_string() and setting the first (reserved) byte of the produced string to 0, denoting the case of 'plain compression'.

Parameters:
text  pointer to the text string to be compressed.
tlen  length of the text string.
jump_value  average char distance between consecutive jumpers, if 0 they are not computed.
cstring  address of the pointer that will refer to the computed compressed string.
clen  pointer to the integer that will contain the length of the compressed string.
Verbose  if 1 the procedure prints infos on the compression process.

Definition at line 50 of file HuffwordLib.c.

void Huffw_decompress char *    cstring,
int    clen,
char **    text,
int *    tlen,
int    Verbose
 

Decompresses the compressed string; if the first (reserved) byte is not 0, indicating not plain compression, an error is returned.

Parameters:
cstring  pointer to the compressed string.
clen  length of the compressed string.
text  address of the pointer to the uncompressed text string, to be computed.
tlen  pointer to the integer that will contain the length of the uncompressed text.
Verbose  if 1 the procedure prints infos on the compressed string.

Definition at line 93 of file HuffwordLib.c.

void Huffw_PrintInfo Console   c,
int    ctext_len,
int    text_len,
int    cstring_len,
int    rule,
int    Verbose
 

Print compression infos. It is offsetcw[] > num_tokens if the corresponding cwlen is not occurring. If Verbose > 1 also the dictionary tokens and their codewords are printed.

Parameters:
c  pointer to the Console data structure containing all the compression infos.
ctext_len  length of the compressed text.
text_len  length of the plain text.
cstring_len  length of the overall compressed file (including header, dictionary, and compressed body).
rule  indicates the parsing rule, 0 = Plain, 1 = Spaceless.
Verbose  set to 1 for printing out also the dictionary tokens and their codewords.

Definition at line 225 of file HuffwordLib.c.

void Huffw_spaceless_compress char *    text,
int    tlen,
char **    cstring,
int *    clen,
int    Verbose
 

Compresses the text string according to the Spaceless model. It uses Compose_compressed_string() to put together all the infos and the Spaceless body. It sets the first (reserved) byte to 1, denoting the case of 'spaceless compression'.

Parameters:
text  pointer to the text string to be compressed.
tlen  length of the text string.
cstring  address of the pointer that will refer to the computed compressed string.
clen  pointer to the integer that will contain the length of the compressed string.
Verbose  if 1 the procedure prints infos on the compression process.

Definition at line 134 of file HuffwordLib.c.

void Huffw_spaceless_decompress char *    cstring,
int    clen,
char **    text,
int *    tlen,
int    Verbose
 

Decompresses the compressed string produced according to the Spaceless model. If the first (reserved) byte is not 1, indicating not spaceless compression, an error is returned.

Parameters:
cstring  pointer to the compressed string.
clen  length of the compressed string.
text  address of the pointer to the uncompressed text string, to be computed.
tlen  pointer to the integer that will contain the length of the uncompressed text.
Verbose  if 1 the procedure prints infos on the compressed string.

Definition at line 183 of file HuffwordLib.c.


Generated on Mon Mar 31 14:44:31 2003 by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002