Main Page   Alphabetical List   Compound List   File List   Compound Members   File Members  

CGrepLib.c

Go to the documentation of this file.
00001 
#include <sys/types.h>  // stat, getpid
#include <sys/stat.h>   // stat
#include <sys/wait.h>   // macros to check exit status
#include <unistd.h>     // stat, getpid
#include <fcntl.h>      // open

#include <errno.h>  // errno, EEXIST
#include <stdio.h>  // fprintf
#include <stdlib.h> // malloc, realloc, free, exit, system
#include <string.h> // memcmp, memcpy, strlen, strchr

#include "CGrepLib.h"
00052 
/* Used internally to remember the most recent hit of one pattern while
   scanning the compressed body for proximity matches. */
typedef struct word_hit {
  size_t byte_position;  /* byte offset of the hit codeword in the body */
  size_t word_position;  /* ordinal of the hit among the words scanned so
                            far (first word is 1); 0 means "no hit yet" */
} word_hit_t;
00058 
/* Whether cw1, of length len1, is byte-for-byte the same as cw2, of length
   len2.  Evaluates to 1 on a match and to 0 otherwise; the bytes are only
   compared when the two lengths agree.  NOTE: len1 is evaluated twice, so
   do not pass an expression with side effects. */
#define CGREP_CWMATCH(cw1, len1, cw2, len2) \
   (((len1)!=(len2))?0: \
    (memcmp((const void *) (cw1),(const void *) (cw2),(size_t) (len1)) == 0))
00062 
00085 int * CGrep_SearchPattern(int *nres, const char *ctext, size_t ctext_sz,
00086                           const char *pattern, char **options) {
00087   int text_len, compressed_body_len;
00088   char *compressed_body;
00089   Console console;
00090   int i = 0, j = 0;
00091   int ntokens;
00092   char *agrepOptions[CGREP_MAX_AGREPOPTIONS];
00093   MyHash_table tokens;
00094   char filter[256];
00095   int *positions;
00096 
00097   HDecompress_getBodyAndConsole((char *) ctext, ctext_sz, &text_len,
00098                                 &compressed_body, &compressed_body_len,
00099                                 &console);
00100 
00101 #ifdef DEBUG
00102   printf("TEST: dc: %d, dl: %d\n", strlen(console.dictionary.content),
00103          console.dictionary.length);
00104 #endif
00105 
00106   i = 0;
00107   /* set up the options array to execvp agrep */
00108   agrepOptions[i++] = CGREP_AGREP_EXECUTABLE;
00109   // agrepOptions[i++] = "-n";
00110 
00111   j = 0;
00112   while ( options[j] && (i < (CGREP_MAX_AGREPOPTIONS - 2)) )
00113     agrepOptions[i++] = (char *) options[j++];
00114 
00115   agrepOptions[i++] = (char *) pattern;
00116   agrepOptions[i] = NULL;
00117 
00118   /* Now, use the dictionary content to perform agrep */
00119   MyHashtable_init(&tokens, CGREP_HT_INITSIZE);
00120   bzero(filter, 256);
00121   ntokens = CGrep_GetMatchingCW(&tokens, filter, agrepOptions, 0, &console);
00122   
00123   if ( !ntokens ) { // nothing found in the dictionary, return
00124     *nres = 0;
00125     return(NULL);
00126   }
00127 
00128   /*
00129      - perform GetCWOccurrences over the compressed body
00130   */
00131   positions = CGrep_GetCWOccurrences(nres, &tokens, filter, compressed_body,
00132                                      compressed_body_len);
00133   MyHashtable_clear(&tokens);
00134   return(positions);
00135 }
00136 
00151 int *
00152 CGrep_SearchSubstring(int *nres, const char *pattern, const char *ctext,
00153                       size_t ctext_len, int errors)
00154 {
00155   char *options[CGREP_MAX_AGREPOPTIONS];
00156   char erropt[3+(int) log10(errors+1)];
00157   int i = 0;
00158   
00159   if ( errors ) {
00160     sprintf(erropt, "-%d", errors);
00161     options[i++] = erropt;
00162   }
00163   options[i] = NULL;
00164 
00165   return(CGrep_SearchPattern(nres, ctext, ctext_len, pattern, options));
00166 }
00167 
00182 int *
00183 CGrep_SearchWord(int *nres, const char *word, const char *ctext,
00184                  size_t ctext_len, int errors)
00185 {
00186   char *options[CGREP_MAX_AGREPOPTIONS];
00187   char erropt[3+(int) log10(errors+1)];
00188   int i = 0;
00189   
00190   options[i++] = "-w"; /* ask for word match */
00191   if ( errors ) {
00192     sprintf(erropt, "-%d", errors);
00193     options[i++] = erropt;
00194   }
00195   options[i] = NULL;
00196 
00197   return(CGrep_SearchPattern(nres, ctext, ctext_len, word, options));
00198 }
00199 
00200 int CGrep_GetMatchingCWFromFile(const char *fname, MyHash_table *ht,
00201                                 char *filter, char **options,
00202                                 int npattern, const Console *c);
00203 
00204 /* Invokes 'agrep' to obtain the list of separators within 'buf'. Agrep is
00205  *  invoked with parameters: agrep -n -v ^[0-9A-Za-z].
00206  *
00207  * @param dFileName name of the file with the dictionary
00208  * @param nseps pointer to an integer storing the length of the (allocated)
00209  *              array of the results. If != 0, it can be used to indicate
00210  *              the initial array size.
00211  * @param buf   pointer to the dictionary string
00212  * @param len   length of the dictionary string
00213  * @return an allocated array of integers indicating the positions of
00214  *         separators within buf
00215  */
00216 int
00217 CGrep_GetSeparators(char *dFileName, MyHash_table *ht, char *filter,
00218                     Console *c)
00219 {
00220   char *agrepOptions[CGREP_MAX_AGREPOPTIONS];
00221 
00222   int i = 0;
00223   agrepOptions[i++] = CGREP_AGREP_EXECUTABLE;
00224   // agrepOptions[i++] = "-n";
00225   agrepOptions[i++] = "-v";
00226   // agrepOptions[i++] = "-i";
00227   agrepOptions[i++] = "^[0-9A-Za-z]";
00228   agrepOptions[i] = NULL;
00229 
00230   /* Now, use the dictionary content to perform agrep */
00231   return CGrep_GetMatchingCWFromFile(dFileName, ht, filter, agrepOptions, -1,
00232                                      c);
00233 }
00234 
00235 void CGrep_WriteDictToFile(char *fname, int len, const Dictionary *d);
00236 
00237 
00261 proximity_hit_t *
00262 CGrep_SearchProximity(int *nres, const char *ctext, size_t ctext_sz,
00263                       int prox_window, char **patterns,
00264                       char ***options)
00265 {
00266   int text_len, compressed_body_len;
00267   char *compressed_body;
00268   Console console;
00269   int correction = 0, i = 0, j = 0;
00270   int ntokens;
00271   int npatterns;
00272   int nseparators;
00273   char *agrepOptions[CGREP_MAX_AGREPOPTIONS];
00274   MyHash_table tokens;
00275   MyHash_table sepTable;
00276   Hash_node *newline_node;
00277   char filter[256];
00278   char sepFilter[256];
00279   char dFileName[100];
00280   proximity_hit_t *ret;
00281 
00282   HDecompress_getBodyAndConsole((char *) ctext, ctext_sz, &text_len,
00283                                 &compressed_body, &compressed_body_len,
00284                                 &console);
00285   
00286   MyHashtable_init(&tokens, CGREP_HT_INITSIZE);
00287   bzero(filter, 256);
00288   bzero(sepFilter, 256);
00289 
00290 #ifdef DEBUG
00291   printf("TEST: dc: %d, dl: %d\n", strlen(console.dictionary.content),
00292          console.dictionary.length);
00293 #endif
00294 
00295   /* Is one of the dictionary tokens "\n"??? */
00296   if ( (newline_node = HHashtable_search("\n", 1, &(console.hashtable))) )
00297     HToken_RankFromTaggedCw(newline_node->codeword, newline_node->cw_len,
00298                             &(console.canoinfo),
00299                             &correction);
00300   else
00301     correction = -1;
00302 
00303   CGrep_WriteDictToFile(dFileName, 100, &(console.dictionary));
00304 
00305   /* for each pattern, invoke agrep and fill the relative hashmap */
00306   for ( npatterns = 0; patterns[npatterns]; npatterns++ ) {
00307     i = 0;
00308     /* set up the options array to execvp agrep */
00309     agrepOptions[i++] = CGREP_AGREP_EXECUTABLE;
00310     // agrepOptions[i++] = "-n";
00311     
00312     j = 0;
00313     while ( options[npatterns] && options[npatterns][j] &&
00314             (i < CGREP_MAX_AGREPOPTIONS) )
00315       agrepOptions[i++] = options[npatterns][j++];
00316 
00317     agrepOptions[i++] = patterns[npatterns];
00318     agrepOptions[i] = NULL;
00319 
00320     /* Now, use the dictionary content to perform agrep */
00321     ntokens = CGrep_GetMatchingCWFromFile(dFileName, &tokens, filter,
00322                                           agrepOptions, npatterns, &console);
00323     if ( !ntokens ) { // if no tokens are found for this pattern, we can return
00324       *nres = 0;
00325       return(NULL);
00326     }
00327   }
00328 
00329   /* find all separators */
00330   MyHashtable_init(&sepTable, 10*CGREP_HT_INITSIZE); /* TEN times larger?! */
00331   nseparators = CGrep_GetSeparators(dFileName, &sepTable, sepFilter, &console);
00332 
00333   /* done with the dictionary file, unlink it */
00334   unlink(dFileName);
00335 
00336   /*
00337     I now have the list of codewords corresponding to the patterns
00338     and to the separators. Now let's go find the occurrences...
00339   */
00340 
00341   if ( *ctext ) // spaceless model
00342     ret = CGrep_GetOccurrencesProximitySpaceless(nres, prox_window, npatterns,
00343                                                  &tokens, filter, &sepTable,
00344                                                  sepFilter,
00345                                                  compressed_body,
00346                                                  compressed_body_len);
00347   else
00348     /* the file keeps spaces as separators; optimize */
00349     ret = CGrep_GetOccurrencesProximity(nres, prox_window,
00350                                         npatterns, &tokens,
00351                                         filter, &sepTable,
00352                                         sepFilter,
00353                                         compressed_body,
00354                                         compressed_body_len,
00355                                         newline_node);
00356 
00357   MyHashtable_clear(&tokens);
00358   MyHashtable_clear(&sepTable);
00359   
00360   return(ret);
00361 }
00362 
00363 /* Dumps the dictionary to a file, whose name is stored in fname.
00364    len is the length of fname's buffer */
00365 void
00366 CGrep_WriteDictToFile(char *fname, int len, const Dictionary *d)
00367 {
00368   char pids[100]; /* actually pids have at most 5 chars... */
00369   int attempt = 0;
00370   int tmpfd;
00371   int written = 0;
00372 
00373   if ( len < 40 ) {
00374     fputs("Too small a buffer top hold the filename.", stderr);
00375     exit(2);
00376   }
00377 
00378   sprintf(pids, "%d", getpid());
00379 
00380   do
00381     sprintf(fname, "/tmp/agrep.tmp.%s.%d", pids, ++attempt);
00382   while ( ((tmpfd = open(fname, O_RDWR|O_CREAT|O_EXCL,
00383                          S_IREAD|S_IWUSR)) == -1) &&
00384           (errno == EEXIST) );
00385     
00386   if ( tmpfd == -1 ) {
00387     perror("open");
00388     exit(2);
00389   }
00390   
00391   do {
00392     written =  write(tmpfd, d->content + written, d->length - written);
00393     if ( written == -1 ) {
00394       perror("write");
00395       exit(2);
00396     }
00397   } while ( written < d->length );
00398   close(tmpfd);
00399 }
00400 
/* Appends the n bytes at src to the NUL-terminated command being built in
   cmd (capacity cap, *used bytes already in use).  Exits with status 2 if
   the result, including its terminator, would not fit. */
static void
CGrep_LookupAppend(char *cmd, size_t cap, size_t *used,
                   const char *src, size_t n)
{
  if ( n >= cap - *used ) {
    fputs("CGrep_Lookup: command line too long.\n", stderr);
    exit(2);
  }
  memcpy(cmd + *used, src, n);
  *used += n;
  cmd[*used] = '\0';
}

/* Runs a command through system(), reading from one file and redirecting
 * its standard output to another.
 *
 * The command line is
 *     options[0] 'options[1]' 'options[2]' ... ifname > ofname
 * Every option after the first is wrapped in single quotes; single quotes
 * inside an option are emitted as '\'' so that the shell hands the option
 * to the program verbatim.  options[0], ifname and ofname are NOT quoted.
 *
 * ifname   name of the input file
 * ofname   name of the file receiving the output
 * options  NULL-terminated argument list; options[0] is the executable
 *
 * Returns 0 when the command exits with status 0 or 1 (per the original
 * comments: matches found / no matches).  Every other outcome -- system()
 * failing, abnormal termination, any other exit status, or a command line
 * longer than the internal buffer -- prints a message and exits with
 * status 2.
 */
int
CGrep_Lookup(const char*ifname, const char*ofname, char **options)
{
  char cmd[10000];
  size_t used = 0;
  int i;
  int status;

  cmd[0] = '\0';
  CGrep_LookupAppend(cmd, sizeof cmd, &used, options[0], strlen(options[0]));
  CGrep_LookupAppend(cmd, sizeof cmd, &used, " ", 1);
  for (i = 1; options[i]; i++ ) {
    const char *p;

    CGrep_LookupAppend(cmd, sizeof cmd, &used, "'", 1);
    for ( p = options[i]; *p; p++ ) {
      if ( *p == '\'' ) /* close the quote, add an escaped ', reopen */
        CGrep_LookupAppend(cmd, sizeof cmd, &used, "'\\''", 4);
      else
        CGrep_LookupAppend(cmd, sizeof cmd, &used, p, 1);
    }
    CGrep_LookupAppend(cmd, sizeof cmd, &used, "' ", 2);
  }

  CGrep_LookupAppend(cmd, sizeof cmd, &used, ifname, strlen(ifname));
  CGrep_LookupAppend(cmd, sizeof cmd, &used, " > ", 3);
  CGrep_LookupAppend(cmd, sizeof cmd, &used, ofname, strlen(ofname));

#ifdef DEBUG
  puts(cmd);
#endif
  status = system(cmd);
  if ( (status == -1) ) { // system failed
    perror("system");
    exit(2);
  }
  if ( !WIFEXITED(status) ) {
    fprintf(stderr, "command \"%s\" failed.\n", cmd);
    exit(2);
  }
  switch ( WEXITSTATUS(status) ) {
  case 0: // everything normal (matches found)
    break;
  case 1: // ok, there were simply no matches
    break;
  default: // exit status not 0 and not 1: something failed
    fprintf(stderr, "system(\"%s\") exited with status %d\n",
            cmd, WEXITSTATUS(status));
    exit(2);
  }

  return(0);
}
00445 
00446 /* Actually implements GetMatchingCW */
00447 int CGrep_GetMatchingCWFromFile(const char *fname, MyHash_table *ht,
00448                                 char *filter, char **options,
00449                                 int npattern, const Console *c)
00450 {  
00451   int i = 0;
00452   char pids[100] = "";
00453   FILE *istream;
00454   char obufs[1024];
00455   int attempt = 0;
00456   char *input_buf, *runner, *buf_end;
00457   size_t input_len;
00458   struct stat sbuf;
00459   int ntok = 0;
00460   int newLineSeen = 0;
00461 
00462   sprintf(pids, "%d", getpid());
00463   sprintf(obufs, "/tmp/agrep.out.%s.%d", pids, attempt);
00464 
00465   CGrep_Lookup(fname, obufs, options);
00466 
00467   stat(obufs, &sbuf);
00468   if ( (istream = fopen(obufs, "r")) == NULL ) {
00469     perror("fopen");
00470     exit(2);
00471   }
00472   input_len = sbuf.st_size;
00473 
00474   if ( input_len ) {
00475     input_buf = (char *) mmap(NULL,input_len,PROT_READ,MAP_SHARED,
00476                               fileno(istream),0);
00477     if ( (input_buf == MAP_FAILED) ) {
00478       perror("mmap");
00479       exit(2);
00480     }
00481     if ( input_buf == NULL ) {
00482       fprintf(stderr, "mmap returned NULL (but the filesize was >0)?!\n");
00483       exit(2);
00484     }
00485     
00486     
00487     buf_end = input_buf + input_len - 1;
00488     runner = input_buf;
00489     ntok = 0;
00490     do {
00491       char *end = runner;
00492       Hash_node *t;
00493       int hashcw;
00494       while ( *end != '\n' ) end++;
00495       if ( (end == runner) && !newLineSeen ) {
00496         while ( *(++end) == '\n' );
00497         end -= 1;
00498         if ( end - runner != 1 ) {
00499           fprintf(stderr,
00500                   "Error: messed up dictionary? (incoherent '\\n', line %d)\n",
00501                   ntok);
00502           exit(2);
00503         }
00504         // newLineSeen = 1;
00505       } else if ( end == runner ) {
00506         fprintf(stderr,
00507                 "Error: messed up dictionary? (incoherent '\\n', line %d)\n",
00508                 ntok);
00509         exit(2);
00510       }
00511       ntok++;
00512       t = HHashtable_search(runner, (end - runner),
00513                             (HHash_table *) &(c->hashtable));
00514       if ( !t ) { // something must have gone REALLY wrong, here...
00515         char wrongToken[end-runner+1];
00516         memcpy(wrongToken, runner, (end-runner));
00517         wrongToken[end-runner] = '\0';
00518         fprintf(stderr,
00519                 "Error: messed up dictionary? (token \"%s\" not found)\n",
00520                 wrongToken);
00521         exit(2);
00522       }
00523 #ifndef WORDS_BIGENDIAN
00524       /* revert the bytes in the codeword (if little-endian) */
00525       for ( i = 0; i < t->cw_len; i++ )
00526         *((char *) &hashcw + i) = *((char *) &t->codeword + t->cw_len - i - 1);
00527 #endif
00528 
00529       if ( filter )
00530         filter[*((unsigned char *) &hashcw)] = 1;
00531       
00532       MyHashtable_insert((char *) &hashcw, t->cw_len, npattern, ht
00533 #ifdef DEBUG
00534                          , runner, (end - runner)
00535 #endif
00536                          );
00537 
00538       runner = end + 1;
00539     } while ( runner < buf_end );
00540     munmap(input_buf, input_len);
00541   } // if ( input_len )
00542   fclose(istream);
00543 #ifndef DEBUG
00544   unlink(obufs);
00545 #else
00546   printf("N. results: %d\n", ntok);
00547 #endif
00548 
00549   return(ntok);
00550 }
00551 
00579 int
00580 CGrep_GetMatchingCW(MyHash_table *ht, char *filter, char **options,
00581                     int npattern, const Console *c)
00582 {
00583   char fname[100];
00584   int ret;
00585 
00586   CGrep_WriteDictToFile(fname, 100, &(c->dictionary));
00587   ret = CGrep_GetMatchingCWFromFile(fname, ht, filter, options, npattern, c);
00588   unlink(fname);
00589   return(ret);
00590 }
00591 
00606 int *
00607 CGrep_GetCWOccurrences(int *nocc, const MyHash_table *ht, const char *filter,
00608                        const char *cbody, size_t cbody_len)
00609 {
00610   int *position;
00611   const char *prevpos, *currpos = cbody;
00612   int currentResSize = 0;
00613   const char *endpos = cbody+cbody_len;
00614   register char v;
00615 
00616   currentResSize = CGREP_MAX(*nocc,CGREP_INIT_VECSIZE);
00617   position = (int *) malloc(currentResSize * sizeof(int));
00618 
00619   *nocc = 0;
00620   while ( currpos < endpos ) {
00621     prevpos = currpos;
00622     v = *(*((unsigned char *) prevpos) + filter);
00623 
00624     do {
00625       currpos++;
00626     } while ( (currpos<endpos) && ((*currpos & 0x80) == 0) );
00627     
00628     if ( v ) {
00629       if ( MyHashtable_search(prevpos, currpos-prevpos, ht) ) {
00630         /* we have found a new hit, record its position */
00631         
00632         if ( *nocc >= currentResSize ) {
00633           /* must reallocate the res array */
00634           currentResSize += (int) sqrt(currentResSize) + 1;
00635           position = (int *) realloc(position, currentResSize * sizeof(int));
00636         }
00637         
00638         position[(*nocc)++] = (int) (prevpos-cbody);
00639       }
00640     }
00641   }
00642   return(position);
00643 }
00644 
00660 const char *
00661 CGrep_GetNextCWOccurrence(int *len, MyHash_table *ht, const char *filter,
00662                           const char *cbody, size_t remaining)
00663 {
00664   const char *endpos = cbody+remaining;
00665   const char *prevpos;
00666   register char v;
00667 
00668   while ( cbody < endpos ) {
00669     prevpos = cbody;
00670     v = *(*((unsigned char *) prevpos) + filter);
00671 
00672     do {
00673       cbody++;
00674     } while ( (cbody<endpos) && ((*cbody & 0x80) == 0) );
00675     
00676     if ( v ) {
00677       if ( MyHashtable_search(prevpos, cbody-prevpos, ht) ) {
00678         /* we have found a new hit, return it */
00679         *len = cbody-prevpos;
00680         return(prevpos);
00681       }
00682     }
00683   }
00684   return(NULL);
00685 }
00686 
/* Macro to move to the beginning of the next codeword. WARNING: uses
   open variables! It expects the following names in the enclosing scope:
     currpos  in: first byte of the current codeword;
              out: first byte of the following one (or a position >= endpos)
     endpos   one past the last byte of the body
     prevpos  out: first byte of the codeword just skipped
     cw_len   out: length in bytes of the codeword just skipped
   A codeword extends up to (not including) the next byte with the high bit
   set.  The body is wrapped in do { } while (0) so that the macro behaves
   as a single statement (e.g. in an if/else without braces). */
#define CGREP_GET_NEXT_CW() do { \
    prevpos = currpos; \
    do { \
      currpos++; \
    } while ( (currpos<endpos) && ((*currpos & 0x80) == 0) ); \
    cw_len = (int) (currpos - prevpos); \
  } while (0)
00696 
00697 
/* Checks whether cw, of length cw_len, is a separator: the filter entry for
   its first byte is tested first, and the hashtable ht is searched only if
   that entry is non-zero.  Evaluates to non-zero for a separator.
   NOTE: cw is evaluated twice. */
#define CGREP_CHECKIFSEPARATOR(cw, cw_len, filter, ht) \
   (((filter)[*((const unsigned char *) (cw))]) && \
    MyHashtable_search((cw), (cw_len), (ht)))
00702 
00733 proximity_hit_t *
00734 CGrep_GetOccurrencesProximitySpaceless(int *nocc, int prox_window,
00735                                        int npatterns,
00736                                        const MyHash_table *ht,
00737                                        const char filter[],
00738                                        const MyHash_table *separators,
00739                                        const char sepFilter[],
00740                                        const char *cbody, size_t cbody_len)
00741 {
00742   proximity_hit_t *matches;
00743   word_hit_t patternHit[npatterns];    
00744   int i, k;
00745   int patternIndexWithMinPosition;
00746   size_t minWPosition, minBytePosition;
00747   size_t currWPosition = 0;
00748   const char *prevpos, *currpos = cbody;
00749   int currentResSize = 0;
00750   const char *endpos = cbody+cbody_len;
00751 #ifdef DEBUG
00752   int nseps = 0;
00753 #endif
00754   int cw_len = 0;
00755   MyHash_node *hn;
00756 
00757   currentResSize = CGREP_MAX(*nocc,CGREP_INIT_VECSIZE);
00758   matches = (proximity_hit_t *) malloc(currentResSize *
00759                                        sizeof(proximity_hit_t));
00760 
00761   for (i = 0; i < npatterns; i++ ) {
00762     patternHit[i].byte_position = 0;
00763     patternHit[i].word_position = 0;
00764   }
00765 
00766   *nocc = 0;
00767   patternIndexWithMinPosition = 0;
00768   minWPosition = 0;
00769   minBytePosition = 0;
00770 
00771   while ( currpos < endpos ) {
00772     CGREP_GET_NEXT_CW();
00773     
00774     /* first, check if it's a separator */
00775     if ( sepFilter[*(unsigned char *) prevpos] ) {
00776       if ( MyHashtable_search(prevpos, cw_len, separators) ) {
00777         /* it's a separator, skip! */
00778 #ifdef DEBUG
00779         nseps++;
00780 #endif
00781         continue;
00782       }
00783     }
00784     
00785     currWPosition++;
00786 
00787     hn = NULL;
00788     while ( (hn = CGrep_CheckIfIsPattern(prevpos, cw_len,
00789                                          &(ht[0]), hn, filter)) != NULL ) {
00790       i = hn->npattern;
00791       /* found a hit for the i-th pattern */
00792 #ifdef DEBUG
00793       printf("Found a hit for pattern %d, position %d\n", i, currWPosition);
00794 #endif
00795       patternHit[i].word_position = currWPosition;
00796       patternHit[i].byte_position = (int) (prevpos-cbody);
00797       if ( patternIndexWithMinPosition == i ) {
00798         /* find the new min */
00799         int j = 0;
00800         
00801         minWPosition = currWPosition;       
00802         for ( j = 0; j < npatterns; j++ ) {
00803           if ( minWPosition >= patternHit[j].word_position ) {
00804             patternIndexWithMinPosition = j;
00805             minWPosition = patternHit[j].word_position;
00806             minBytePosition = patternHit[j].byte_position;
00807           }
00808         }
00809       }
00810       
00811       /* see if we are within the proximity window (but only if
00812          all patterns have a match */
00813       if ( (minWPosition > 0) &&
00814            ((currWPosition - minWPosition) <= (size_t) prox_window) ) {
00815         /* found a new proximity hit! */
00816 #ifdef DEBUG
00817         printf("...and a proximity hit!\n");
00818 #endif
00819         if ( *nocc >= currentResSize ) {
00820           /* must reallocate the res array */
00821           currentResSize += (int) sqrt(currentResSize) + 1;
00822           matches = (proximity_hit_t *) realloc(matches,
00823                                                 currentResSize *
00824                                                 sizeof(proximity_hit_t));
00825         }
00826         
00827         matches[*nocc].byte_position = minBytePosition;
00828         matches[*nocc].start_position = minWPosition;
00829         matches[*nocc].end_position = currWPosition;
00830         for ( k = 0; k < npatterns; k++ ) {
00831           matches[*nocc].positions[k] = patternHit[k].byte_position;
00832           matches[*nocc].ranks[k] = patternHit[k].word_position;
00833         }
00834         (*nocc)++;
00835       }   
00836     }
00837   }
00838   
00839   /* if there were no matches, free the memory and return NULL */
00840   if ( !(*nocc) ) {
00841     free(matches);
00842     matches = NULL;
00843   }
00844 
00845   return(matches);
00846 }
00847 
/* States of the automaton driving CGrep_GetOccurrencesProximity.  The
 * state tells how the codeword currently under examination was reached.
 */
typedef enum state {
  FOUND_WORD,      /* the current codeword is to be handled as a word */
  FOUND_NEWLINE,   /* the previous codeword was a newline */
  FOUND_SEPARATOR  /* the previous codeword was a separator */
} state_t;
00852 
00860 proximity_hit_t *
00861 CGrep_GetOccurrencesProximity(int *nocc, int prox_window,
00862                               int npatterns, const MyHash_table *ht,
00863                               const char filter[],
00864                               const MyHash_table *separators,
00865                               const char *sepFilter,
00866                               const char *cbody, size_t cbody_len,
00867                               const Hash_node *nl)
00868 {
00869   proximity_hit_t *matches;
00870   word_hit_t patternHit[npatterns];    
00871   int i;
00872   int patternIndexWithMinPosition;
00873   size_t minWPosition, minBytePosition;
00874   size_t currWPosition = 0;
00875   const char *prevpos, *currpos = cbody;
00876   int currentResSize = 0;
00877   const char *endpos = cbody+cbody_len;
00878   int cw_len = 0;
00879   int newline_cw = 0;
00880   int newline_cw_len;
00881   state_t state;
00882   MyHash_node *hn;
00883   int k;
00884 
00885   if ( !nl )
00886     newline_cw_len = 0; /* this way all matches against '\n' will return 0 */
00887   else {
00888     newline_cw = nl->codeword;
00889     newline_cw_len = nl->cw_len;
00890   }
00891 
00892   currentResSize = CGREP_MAX(*nocc,CGREP_INIT_VECSIZE);
00893   matches = (proximity_hit_t *) malloc(currentResSize *
00894                                        sizeof(proximity_hit_t));
00895 
00896   for (i = 0; i < npatterns; i++ ) {
00897     patternHit[i].byte_position = 0;
00898     patternHit[i].word_position = 0;
00899   }
00900 
00901   *nocc = 0;
00902   patternIndexWithMinPosition = 0;
00903   minWPosition = 0;
00904   minBytePosition = 0;
00905 
00906   CGREP_GET_NEXT_CW();
00907 
00908   state = FOUND_NEWLINE; /* safe state to begin with */
00909 
00910   do {
00911     switch ( state ) {
00912 
00913     case FOUND_SEPARATOR:
00914       /* first, check if we have a newline */
00915       if ( CGREP_CWMATCH(prevpos, cw_len, &newline_cw, newline_cw_len) ) {
00916         state = FOUND_NEWLINE;
00917         CGREP_GET_NEXT_CW();
00918         break;
00919       }
00920       state=FOUND_WORD;
00921       // break; // no need to break, can go straight to FOUND_WORD
00922       
00923     case FOUND_WORD:
00924       currWPosition++;
00925 
00926       hn = NULL;
00927       
00928       while ( (hn = CGrep_CheckIfIsPattern(prevpos, cw_len,
00929                                            ht, hn, filter)) != NULL ) {
00930         i = hn->npattern;
00931         patternHit[i].word_position = currWPosition;
00932         patternHit[i].byte_position = (int) (prevpos-cbody);
00933         if ( patternIndexWithMinPosition == i ) {
00934           /* find the new min */
00935           int j = 0;
00936           
00937           minWPosition = currWPosition;     
00938           for ( j = 0; j < npatterns; j++ ) {
00939             if ( minWPosition >= patternHit[j].word_position ) {
00940               patternIndexWithMinPosition = j;
00941               minWPosition = patternHit[j].word_position;
00942               minBytePosition = patternHit[j].byte_position;
00943             }
00944           }
00945         }
00946         
00947         /* see if we are within the proximity window (but only if
00948            all patterns have a match */
00949         if ( (minWPosition > 0) &&
00950              ((currWPosition - minWPosition) <= (size_t) prox_window) ) {
00951           /* found a new proximity hit! */
00952           if ( *nocc >= currentResSize ) {
00953             /* must reallocate the res array */
00954             currentResSize += (int) sqrt(currentResSize) + 1;
00955             matches = (proximity_hit_t *) realloc(matches,
00956                                                   currentResSize *
00957                                                   sizeof(proximity_hit_t));
00958           }
00959           
00960           matches[*nocc].byte_position = minBytePosition;
00961           matches[*nocc].start_position = minWPosition;
00962           matches[*nocc].end_position = currWPosition;
00963           for ( k = 0; k < npatterns; k++ ) {
00964             matches[*nocc].positions[k] = patternHit[k].byte_position;
00965             matches[*nocc].ranks[k] = patternHit[k].word_position;
00966           }
00967           (*nocc)++;
00968         }
00969       }
00970       
00971       CGREP_GET_NEXT_CW();
00972       if ( CGREP_CWMATCH(prevpos, cw_len, &newline_cw, newline_cw_len) ) {
00973         state = FOUND_NEWLINE;
00974         CGREP_GET_NEXT_CW();
00975         break;
00976       }
00977       state = FOUND_SEPARATOR;
00978       CGREP_GET_NEXT_CW();
00979       break;
00980       
00981     case FOUND_NEWLINE:
00982       /* first, check if we have another newline */
00983       if ( CGREP_CWMATCH(prevpos, cw_len, &newline_cw, newline_cw_len) ) {
00984         state = FOUND_NEWLINE;
00985         CGREP_GET_NEXT_CW();
00986         break;
00987       }
00988       /* otherwise, check to see if we have a separator */
00989       if ( CGREP_CHECKIFSEPARATOR(prevpos, cw_len, sepFilter, separators) ) {
00990         state=FOUND_SEPARATOR;
00991         CGREP_GET_NEXT_CW();
00992         break;
00993       }
00994       /* otherwise, we have a word for sure: go to the relevant state */
00995       state=FOUND_WORD;
00996       break;
00997     }
00998   } while ( (currpos < endpos) );
00999   
01000   if ( state == FOUND_WORD ) {
01001     /* Check the last word, but warning: it might have been checked already */
01002   }
01003 
01004   /* Question: is the LAST codeword found alright?!?! */
01005 
01006   if ( !(*nocc) ) { // free matches and return NULL
01007     free(matches);
01008     matches = NULL;
01009   }
01010 
01011   return(matches);
01012 }
01013 
01032 MyHash_node *
01033 CGrep_CheckIfIsPattern(const char *cbody, int cw_len, const MyHash_table *ht,
01034                        MyHash_node *hn, const char filter[])
01035 {
01036   if ( !hn ) {
01037     if ( filter[*(unsigned char *) cbody] ) {
01038       // first, use the filters
01039       hn = ht->table[MyHashtable_func(cbody,cw_len,ht)];
01040       while ( hn && 
01041               !CGREP_CWMATCH(cbody, cw_len, hn->str, hn->len_str) )
01042         hn = hn->next;
01043     } else
01044       return(NULL);
01045   } else { // keep scanning the hashtable
01046     do {
01047       hn = hn->next;
01048     } while ( hn && 
01049               !CGREP_CWMATCH(cbody, cw_len, hn->str, hn->len_str) );
01050   }
01051   return(hn);
01052 }
01053 
/* Returns a copy of a byte string in which selected bytes are replaced by
 * the sequence "[\<hex>]", <hex> being the value of the byte in lower-case
 * hexadecimal without leading zeros.
 *
 * A byte is replaced when it compares below 'min', above 'max', or occurs
 * in 'exceptions'.  Because of how strchr() works, a NUL byte in the input
 * always counts as an exception.
 *
 * s          the bytes to escape (need not be NUL-terminated)
 * len        number of bytes in s
 * min, max   range of the bytes that are copied verbatim
 * exceptions NUL-terminated list of bytes to escape even when in range
 *
 * Returns a malloc'ed, NUL-terminated string; the caller frees it.  Running
 * out of memory is fatal (exit 2).
 */
char *CGrep_escapeStringConfigurable(const char *s, size_t len, char min,
                                     char max, const char *exceptions)
{
  const char *end = s+len;
  char *ret;
  char *curr;

  /* every input byte yields at most 5 output bytes ("[\" + two hex digits
     + "]"), so 5 * len + 1 bytes always suffice; reject lengths for which
     that size cannot be represented */
  if ( len > ((size_t) -1 - 1) / 5 ) {
    fputs("CGrep_escapeStringConfigurable: input too long.\n", stderr);
    exit(2);
  }
  ret = (char *) malloc(5 * len + 1);
  if ( !ret ) {
    perror("malloc");
    exit(2);
  }

  curr = ret;
  while ( s < end ) {
    if ( (*s < min) || (*s > max) || strchr(exceptions, *s) )
      /* go through unsigned char: a negative char would otherwise be
         printed as eight hex digits */
      curr += sprintf(curr, "[\\%x]", (unsigned int) (unsigned char) *s);
    else
      *curr++ = *s;
    s++;
  }
  *curr = 0;
  return(ret);
}
01101 
01106 char *CGrep_escapeString(const char *s, size_t len)
01107 {
01108   return(CGrep_escapeStringConfigurable(s, len, CGREP_MIN_PRINTABLE_CHAR,
01109                                         CGREP_MAX_PRINTABLE_CHAR,
01110                                         CGREP_NONPRINTABLE_CHARS));
01111 }
01112 
01113 
01114 /* ---------------------------------------------------- */
01115 /* -------------------  MyHash Table    ----------------- */
01116 /* ---------------------------------------------------- */
01117 
01124 void MyHashtable_init(MyHash_table *ht, int n)
01125 {
01126   int i;
01127   ht->size  = n * 10;     // Load factor 0.1
01128   ht->table = (MyHash_nodeptr_array) malloc(ht->size * sizeof(MyHash_node*));
01129   ht->card = 0;
01130   if (ht->table == NULL) {
01131     fprintf(stderr,"Fatal Error: MyHash table allocation\n");
01132     exit(2);
01133   }
01134 
01135   /* memset(ht->table, 0, ht->size * sizeof(MyHash_node)); */
01136   /* A cleaner way (albeit slower): */
01137   for ( i = 0; i < ht->size; ht->table[i++] = NULL );
01138 }
01139 
01145 void MyHashtable_clear(MyHash_table *ht)
01146 {
01147   int i;
01148 
01149   for ( i = 0; i < ht->size; i++ ) {
01150     MyHash_node *hn = ht->table[i];
01151     while ( hn ) {
01152       MyHash_node *toFree = hn;
01153       hn = hn->next;
01154       free(toFree);
01155     }
01156   }
01157   ht->card = 0;
01158   ht->size = 0;
01159   free(ht->table);
01160 }
01161 
01167 int MyHashtable_func(const char *s, int len, const MyHash_table *ht)
01168 {
01169   register int hfn;
01170   int hfi;
01171   int table_size = ht->size;
01172 
01173   hfn = 11;
01174   for (hfi=0; hfi<len ; hfi++)
01175     hfn = hfn ^ ((hfn<<5) + (hfn>>2) + (unsigned char) *s++);
01176   hfn = abs(hfn % table_size);
01177   return(hfn);
01178 }
01179 
01186 MyHash_node *MyHashtable_search(const char *s, int slen,
01187                                 const MyHash_table *ht)
01188 {   
01189   MyHash_node *ghsp;
01190 
01191   ghsp = ht->table[MyHashtable_func(s,slen,ht)];
01192 
01193   while ( ghsp ) {
01194     if ( CGREP_CWMATCH(s, slen, ghsp->str, ghsp->len_str) )
01195       return(ghsp);
01196     ghsp = ghsp->next;
01197   }
01198   return(NULL);
01199 }
01200 
01211 int MyHashtable_insert(const char *s, int slen, int npattern, MyHash_table *ht
01212 #ifdef DEBUG
01213                        , char *word, int len
01214 #endif
01215                        )   
01216      
01217 {   
01218   int hiv;
01219   int i;
01220   MyHash_node *hip;
01221   MyHash_node *table_head;
01222   MyHash_node *hip_pred;
01223 
01224 
01225   hiv = MyHashtable_func(s,slen,ht);  // compute the hash value
01226   hip = MyHashtable_search(s,slen,ht);   // check string occurrence
01227 
01228   if (hip) {
01229     
01230     // move-to-front the searched token
01231     if (ht->table[hiv] != hip){
01232 
01233       // Find hip predecessor which surely does exist
01234       // because hip is not the head of the list
01235       for (hip_pred = ht->table[hiv]; hip_pred->next != hip; 
01236            hip_pred = hip_pred->next) ;
01237 
01238       hip_pred->next = hip->next;   // jump hip in the old list
01239       table_head = ht->table[hiv];   // save the old head of the list-entry
01240       ht->table[hiv] = hip;        // hip is the new head of the list
01241       hip->next = table_head;         
01242     }
01243 
01244     return(0);
01245 
01246   } else {    //------ The token is new -------
01247     hip = (MyHash_node *) malloc(sizeof(MyHash_node));
01248     if (hip == NULL) {
01249       fprintf(stderr,"Error: Insert hash table\n");
01250       exit(2);
01251     } 
01252 
01253     hip->len_str = slen;
01254     for ( i = 0; i < slen; i++ )
01255       hip->str[i] = s[i];
01256     hip->npattern = npattern;
01257     hip->next = ht->table[hiv];
01258 #ifdef DEBUG
01259     strncpy(hip->word, word, len);
01260     hip->word[len] = 0;
01261 #endif
01262     ht->table[hiv] = hip;
01263     ht->card += 1;
01264     return(1);
01265   }
01266 }
01267 

Generated on Mon Mar 31 14:44:31 2003 by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002