#include "HuffwordLib.h"
Go to the source code of this file.
Functions | |
void | Huffw_compress (char *text, int tlen, int jump_value, char **cstring, int *clen, int Verbose) |
void | Huffw_decompress (char *cstring, int clen, char **text, int *tlen, int Verbose) |
void | Huffw_spaceless_compress (char *text, int tlen, char **cstring, int *clen, int Verbose) |
void | Huffw_spaceless_decompress (char *cstring, int clen, char **text, int *tlen, int Verbose) |
void | Huffw_PrintInfo (Console *c, int ctext_len, int text_len, int cstring_len, int rule, int Verbose) |
void | HParse_text (char *text, int text_len, HHash_table *ht) |
void | HCompress_getBody (char *text, int text_len, HHash_table *ht, char **ctext, int *ctext_len) |
void | HCompress_getBodyAndConsole (char *text, int tlen, int jump_value, char **body, int *blen, Console *console) |
void | HCompress_getString (char **ctext, int *ctext_len, int text_len, char *body, int blen, Console *console) |
void | HCompress_contractSpaces (char *body, int blen, char **body_spaceless, int *blen_spaceless, Console *console) |
void | HDecompress_getBodyAndConsole (char *ctext, int ctext_len, int *text_len, char **body, int *blen, Console *console) |
void | HDecompress_expandSpaces (char *body_spaceless, int blen_spaceless, char **body, int *blen, Console *console) |
void | HDecompress_nextBlock_bytes (char *ctext, int num_bytes, char **text, int *text_len, Console *console) |
void | HDecompress_nextBlock_bytes_spaceless (char *ctext, int num_bytes, char **text, int *text_len, Console *console) |
void | HDecompress_previousBlock_bytes (char *ctext, int num_bytes, char **text, int *text_len, Console *console) |
void | HDecompress_previousBlock_bytes_spaceless (char *ctext, int num_bytes, char **text, int *text_len, Console *console) |
void | HDecompress_nextBlock_tokens (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console) |
void | HDecompress_nextBlock_tokens_spaceless (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console) |
void | HDecompress_previousBlock_tokens (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console) |
void | HDecompress_previousBlock_tokens_spaceless (char *ctext, int bytes_left, int num_obj, char **text, int *text_len, int *decoded, Console *console) |
void | HToken_RankFromPlainCw (int plaincw, int plaincwlen, Canonical *cano, int *token_rank) |
void | HToken_RankFromTaggedCw (int taggedcw, int taggedcwlen, Canonical *cano, int *token_rank) |
void | HToken_fromTaggedCw (int taggedcw, int taggedcwlen, Canonical *cano, Dictionary *dict, char **token, int *token_len) |
int | HToken_decompressNext (char *s, int num_byte_left, char **token, int *lentoken, int *lencw, Console *console) |
int | HToken_decompressPrevious (char *s, int num_byte_left, char **token, int *lentoken, int *lencw, Console *console) |
void | HToken_getLengthNext (char *s, int num_char_left, int *len) |
void | HCodeword_PlainFromTagged (int taggedcw, int taggedcwlen, int *plaincw, int *plaincwlen) |
void | HCodeword_PlainFromTokenrank (int token_rank, Canonical *cano, int *plaincw, int *plaincwlen) |
void | HCodeword_TaggedFromPlain (int plaincw, int plaincwlen, int *taggedcw, int *taggedcwlen) |
void | HCodeword_TaggedFromTokenrank (int token_rank, Canonical *cano, int *taggedcw, int *taggedcwlen) |
void | HCodeword_TaggedFromToken (char *token, int token_len, HHash_table *ht, int *taggedcw, int *taggedcwlen) |
void | HCodeword_TaggedGetNext (char *s, int num_byte_left, int *taggedcw, int *lencw) |
int | HCodeword_TaggedGetNextLength (char *s, int num_byte_left) |
void | HCodeword_TaggedGetPrevious (char *s, int num_byte_left, int *taggedcw, int *lencw) |
void | HCodeword_tostring (int taggedcw, int taggedcwlen, char **s) |
void | HHashtable_init (HHash_table *ht, int n) |
void | HHashtable_print (HHash_table *ht) |
int | HHashtable_func (char *s, int len, HHash_table *ht) |
Hash_node * | HHashtable_search (char *s, int slen, HHash_table *ht) |
int | HHashtable_insert (char *s, int slen, HHash_table *ht) |
void | HHashtable_clear (HHash_table *ht) |
void | HHashtable_fromdict (Dictionary *dict, Canonical *cano, HHash_table *ht) |
void | HDictionary_fromstring (char *s, int slen, int stokens, Dictionary *dict) |
void | HDictionary_fromtree (Hash_nodeptr_array tree, int num_tokens, Dictionary *dict) |
void | HDictionary_print (Dictionary *dict, HHash_table *ht, int Verbose) |
void | HJumpers_fromds (char *text, int tlen, HHash_table *ht, int jump_value, Jumpers *jumpers) |
void | HJumpers_tostring (char *s, Jumpers *jumpers) |
void | HJumpers_fromstring (char *s, Jumpers *jumpers) |
void | Get_charpos_from_bytepos (char *ctext, int ctext_len, int bytepos, int *textpos, Console *console) |
void | Get_bytepos_from_charpos (char *ctext, int ctext_len, int textpos, int *bytepos, Console *console) |
void | HCanonical_fromtree (Hash_nodeptr_array tree_array, Canonical *cano, HHash_table *ht) |
void | HCanonical_fromstring (char *s, Canonical *cano) |
void | HCanonical_tostring (char *s, Canonical *cano) |
void | Hufftree_createLeaves (Hash_nodeptr_array tree, int num_leaves, HHash_table *ht) |
int | Hufftree_fromLeaves (Hash_nodeptr_array work_area, int *tot_nodes, int leaves) |
int | Hufftree_computeCwLen (Hash_nodeptr_array tree, int root, int tree_size) |
void | Hufftree_build (Hash_nodeptr_array *tree_array_ptr, HHash_table *ht) |
void | Hufftree_print (Canonical *cano, Hash_nodeptr_array tree_array, int Verbose) |
void | HPrint_string (char *s, int l) |
void | HInt_tostring (char *s, int i) |
int | HInt_fromstring (char *s) |
int | HSort_for_freq (const void *va, const void *vb) |
int | HSort_for_cwlen (const void *va, const void *vb) |
Definition in file HuffwordLib.c.
|
Computes the byte position (counting from 0) in the compressed text corresponding to a given char position (counting from 0) into the uncompressed text. We assume that the text position refers to the beginning of a token.
Definition at line 2291 of file HuffwordLib.c. |
|
Computes the text position (counting from 0) corresponding to a given byte position (counting from 0) in the compressed text. We assume that the byte position corresponds to the beginning of a codeword.
Definition at line 2239 of file HuffwordLib.c. |
|
Computes the canonical infos from a string, assuming the sequence max_cwlen,firstcw[1],offsetcw[1], .., firstcw[4],offsetcw[4]. Since they are just 9 integers (36 bytes) we do not use any succinct encoding nor summarization.
Definition at line 2436 of file HuffwordLib.c. |
|
Computes the canonical huffman infos and updates the hash table by setting for each token its byte-aligned and tagged codeword and its codeword length in bytes.
Definition at line 2347 of file HuffwordLib.c. |
|
Stores the canonical infos into a (preallocated) string s, assuming the sequence max_cwlen,firstcw[1],offsetcw[1], ..., firstcw[4],offsetcw[4]. Since they are just 9 integers we do not use any succinct encoding nor summarization. The procedure writes just the first 36 bytes of s.
Definition at line 2463 of file HuffwordLib.c. |
|
Computes the plain codeword corresponding to an input tagged and byte-aligned codeword.
Definition at line 1443 of file HuffwordLib.c. |
|
Computes the plain codeword of a token from its rank in the dictionary. If some codeword length is absent then its firstcw[]=+infty.
Definition at line 1472 of file HuffwordLib.c. |
|
Computes the tagged and byte-aligned codeword from its plain counterpart.
Definition at line 1497 of file HuffwordLib.c. |
|
Computes the tagged and byte-aligned codeword for the passed token.
Definition at line 1553 of file HuffwordLib.c. |
|
Computes the byte-aligned and tagged codeword of a token given its rank in the dictionary (starts from 0). Recall that if some codeword length is absent then its firstcw[]=+infty.
Definition at line 1525 of file HuffwordLib.c. |
|
Reads a sequence of bytes corresponding to the next tagged codeword in s. Its length, in bytes, is returned in lencw; the codeword is assigned to *taggedcw and allocated into its least significant bytes.
Definition at line 1578 of file HuffwordLib.c. |
|
Returns the length of the tagged codeword starting at s.
Definition at line 1609 of file HuffwordLib.c. |
|
Reads the sequence of bytes preceding the one pointed to by s and corresponding to the previous tagged codeword. The codeword length is returned in lencw (it is smaller than 4); the codeword is assigned to *taggedcw and allocated into its least significant bytes.
Definition at line 1646 of file HuffwordLib.c. |
|
Transforms a (tagged) codeword into a string of bytes (ended by \0).
Definition at line 1682 of file HuffwordLib.c. |
|
Removes the single spaces which occur between two consecutive alphanumeric tokens. This is the so-called Spaceless model.
Definition at line 511 of file HuffwordLib.c. |
|
Constructs the byte-aligned and tagged Huffword string, exploiting the tagged and byte-aligned codewords available in the hash-table data structure.
Definition at line 317 of file HuffwordLib.c. |
|
Builds all the data structures needed for the Huffword compression. Specifically, it builds the Huffman tree, then computes the Canonical data structure and the Dictionary, and derives the compressed body string as well as the Jumpers, if jump_value is greater than zero.
Definition at line 387 of file HuffwordLib.c. |
|
Composes the entire compressed string allocating it from scratch. It will be pointed to by *ctext and of length *ctext_len. Recall that offsetcw[i] is set ideally to +infty if no codeword of length 'i' is present in the dictionary. The composed string is formed as follows:
Definition at line 449 of file HuffwordLib.c. |
|
Re-inserts the single spaces that were dropped during the Spaceless compression. The programmer must take care to apply this procedure only to strings compressed with the Spaceless model.
Definition at line 674 of file HuffwordLib.c. |
|
Initializes all data structures and the Console, loading their content from the compressed string (including the header, dictionary, ...). Recall that offsetcw[i] > tokens if no codeword of length i exists. The hash table contains, for each token, its corresponding tagged codeword and its length in bytes.
Definition at line 609 of file HuffwordLib.c. |
|
Decompresses a piece of compressed text for the specified number of bytes. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes do exist. It does not work well on the spaceless model (ie. first reserved byte is 1). In this case the decompressed string is without spaces between alphanumeric tokens. Here, the programmer must use the proper procedure.
Definition at line 761 of file HuffwordLib.c. |
|
Decompresses a piece of compressed text for the specified number of bytes in the Spaceless model. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes do exist and that we are in the case of the spaceless compression (ie. first reserved byte is 1).
Definition at line 827 of file HuffwordLib.c. |
|
Decompresses a piece of compressed text for a specified number of alphanumeric tokens. The Console data structure must be initialized. It does not work well on the spaceless model (ie. first reserved byte is 1). In this case the decompressed string is without spaces between alphanumeric tokens. Here, the programmer must use the proper procedure.
Definition at line 1004 of file HuffwordLib.c. |
|
Decompresses a piece of compressed text for a specified number of alphanumeric tokens in the spaceless model. The Console data structure must be initialized. The programmer must ensure that the compression type is 'spaceless' (ie. first reserved byte is 1).
Definition at line 1061 of file HuffwordLib.c. |
|
Decompresses a piece of compressed text lying to the left of "ctext" for a specified number of bytes. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes do exist. It does not work well on the spaceless model (ie. first reserved byte is 1). In this case the decompressed string is without spaces between alphanumeric tokens. Here, the programmer must use the proper procedure.
Definition at line 875 of file HuffwordLib.c. |
|
Decompresses a piece of compressed text to the left of "ctext" pointer for "num_bytes" bytes in the spaceless model. The Console data structure must be initialized. The programmer must ensure that the requested number of bytes do exist and that the string has been compressed via 'spaceless model' (ie. first reserved byte is 1).
Definition at line 952 of file HuffwordLib.c. |
|
Decompresses the piece of compressed text that precedes "ctext" and for a specified number of alphanumeric tokens. The Console data structure must be initialized. It does not work well on the spaceless model (ie. first reserved byte is 1). In this case the decompressed string is without spaces between alphanumeric tokens. Here, the programmer must use the proper procedure.
Definition at line 1115 of file HuffwordLib.c. |
|
Decompresses the piece of compressed text that precedes "ctext" and for a specified number of alphanumeric tokens in the Spaceless model. The Console data structure must be initialized. The programmer must ensure that the compression type is 'spaceless' (ie. first reserved byte is 1).
Definition at line 1176 of file HuffwordLib.c. |
|
Computes the dictionary data structure from a given string containing the tokens separated by 'newline' (the last token is also ended by 'newline'). The string must be kept since the CONTENT field refers to it.
Definition at line 1954 of file HuffwordLib.c. |
|
Computes the dictionary data structure from the leaves of a Huffman tree, which are ordered by codeword length and alphabetically.
Definition at line 1998 of file HuffwordLib.c. |
|
Prints the dictionary content and, if Verbose = 1, also the codewords for every token according to the format token:cw cwlen.
Definition at line 2058 of file HuffwordLib.c. |
|
Frees all elements of a hashtable. After this call, ht is an empty, uninitialized HHash_table.
Definition at line 1871 of file HuffwordLib.c. |
|
Computes an index data structure for the dictionary tokens, based on a hash table and the canonical info; each entry of the hash table contains also the corresponding tagged and byte-aligned codeword and its length (as bytes).
Definition at line 1900 of file HuffwordLib.c. |
|
Computes the hash value for the given string. The function was proposed by Ramakrishna and Zobel in a paper that appeared in: Int. Conf. on DB Systems for Advanced Applications, 1997.
Definition at line 1776 of file HuffwordLib.c. |
|
Initializes the hash table according to the estimated number of tokens; the load factor is set to 10 and the lists are managed via the MTF-rule.
Definition at line 1722 of file HuffwordLib.c. |
|
Inserts the token in the hash table and returns 1 if new, 0 otherwise; it also updates the counter of occurrences for that token.
Definition at line 1815 of file HuffwordLib.c. |
|
Prints the content of the hash table ht: token, length, count of occurrences, tagged cw and its length.
Definition at line 1747 of file HuffwordLib.c. |
|
Searches for the given string in the passed hash table (returns NULL if it is not found).
Definition at line 1795 of file HuffwordLib.c. |
|
Reads the first 4 bytes of string s and interprets them as an integer.
Definition at line 2859 of file HuffwordLib.c. |
|
Stores the integer "i" into the first 4 bytes of "s".
Definition at line 2848 of file HuffwordLib.c. |
|
Computes the jumper data structure according to a number of skipped chars. Since the skipping may end up into a word, then we need to store two integers, the offset in the uncompressed text and the offset into the compressed text.
Definition at line 2112 of file HuffwordLib.c. |
|
Loads the number of jumpers, and then the offsets in the text and the offsets in the compressed text from the string s, alternatively.
Definition at line 2199 of file HuffwordLib.c. |
|
Serializes the Jumper data structure by storing in the string s the offsets in the text and the offsets in the compressed text, alternatively; it appends to the front of the string s the number of jumpers it has written.
Definition at line 2170 of file HuffwordLib.c. |
|
Parses the text using the procedure HToken_getLengthNext() and fills in the hash table with items containing, for each token, its number of occurrences in the text (count_occ), the token string (str) and its length (len_str). This information is then used to construct the Huffman tree. The programmer has to take care of the initialization of the hash table, via HHashtable_init(). This way, it is possible to parse a sequence of texts and build one unique Huffman tree for them.
Definition at line 273 of file HuffwordLib.c. |
|
Prints the first l chars of the string s by using [x] to denote an unprintable ASCII value x.
Definition at line 2803 of file HuffwordLib.c. |
|
Definition at line 2883 of file HuffwordLib.c. |
|
Definition at line 2878 of file HuffwordLib.c. |
|
Decompresses the next token starting from the byte position indicated by s. It also computes the length of this token and the length (in bytes) of the decompressed codeword. The procedure returns 0 if the operation was applied on <=0 bytes, otherwise it returns 1.
Definition at line 1333 of file HuffwordLib.c. |
|
Decompresses the token whose codeword precedes the byte position indicated by s. It also computes the length of this token and the length (in bytes) of the decompressed codeword. The procedure returns 0 if the operation was applied on <=0 bytes, otherwise it returns 1.
Definition at line 1366 of file HuffwordLib.c. |
|
Determines the token corresponding to a given tagged codeword.
Definition at line 1290 of file HuffwordLib.c. |
|
Returns the length of a token defined as either a sequence of letters and numbers, or as a sequence of separators other than 'newline', or as a single 'newline'. This distinction is introduced because the Dictionary string is formed by putting one token per line, hence if 'newline' occurs inside a token then problems may arise.
Definition at line 1397 of file HuffwordLib.c. |
|
Computes the position in the dictionary of the token having the passed plain codeword. Recall that the dictionary is ordered by decreasing codeword length and alphabetically among equally-long codewords.
Definition at line 1235 of file HuffwordLib.c. |
|
Computes the position in the dictionary of the token having the passed tagged codeword. Recall that the dictionary is ordered by decreasing codeword length and alphabetically among equally-long codewords.
Definition at line 1260 of file HuffwordLib.c. |
|
Constructs the Huffman tree with a fan-out 128, over the set of tokens contained into the hash table HT. It returns the pointer to the array of tree leaves ordered by decreasing codeword length and alphabetically among equally-long codewords.
Definition at line 2688 of file HuffwordLib.c. |
|
Computes the codeword lengths by visiting top-down the Huffman tree. The depth of a node is propagated to its children, incremented by one; these children are stored contiguously and the position of the leftmost child is kept in the field "next" of the Hash_node pointed to by a tree node. At the end the leaves are stored beginning at position "root", are sorted for decreasing codeword length and among equally-long cw they are sorted by alphabetic order. The procedure returns the maximum codeword length just computed.
Definition at line 2633 of file HuffwordLib.c. |
|
Uses a hash table containing all the parsed tokens, an array of pointers to the leaves of the Huffman tree (to be constructed), and an integer indicating how many leaves should be formed including the dummy ones, whose count_occ is set to 0 and str = NULL.
Definition at line 2499 of file HuffwordLib.c. |
|
Builds a Huffman tree with fan-out 128 starting from the array of leaves (pointers to Hash_nodes) stored in work_area. The algorithm uses the trick of Knuth, which exploits two queues kept sorted by increasing frequency of nodes. The leaves are initially allocated at the beginning of work_area; processed nodes are moved to the end of work_area. The procedure eventually returns the position in work_area where the root of the built Huffman tree is allocated.
Definition at line 2540 of file HuffwordLib.c. |
|
Prints information about the Canonical Huffword code. Recall that offsetcw[] > num_tokens if the corresponding cwlen does not occur.
Definition at line 2751 of file HuffwordLib.c. |
|
Compresses the text string by using Compose_compressed_string() and setting the first (reserved) byte of the produced string to 0, denoting the case of 'plain compression'.
Definition at line 50 of file HuffwordLib.c. |
|
Decompresses the compressed string; if the first (reserved) byte is not 0, indicating not plain compression, an error is returned.
Definition at line 93 of file HuffwordLib.c. |
|
Prints compression information. Recall that offsetcw[] > num_tokens if the corresponding cwlen does not occur. If Verbose > 1, the dictionary tokens and their codewords are printed as well.
Definition at line 225 of file HuffwordLib.c. |
|
Compresses the text string according to the Spaceless model. It uses Compose_compressed_string() to put together all the infos and the Spaceless body. It sets the first (reserved) byte to 1, denoting the case of 'spaceless compression'.
Definition at line 134 of file HuffwordLib.c. |
|
Decompresses the compressed string produced according to the Spaceless model. If the first (reserved) byte is not 1, indicating that the string was not produced by spaceless compression, an error is returned.
Definition at line 183 of file HuffwordLib.c. |