00001
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h>
#include <fcntl.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "CGrepLib.h"
00052
00053
/*
 * Most recent hit recorded for one search pattern by the proximity scanners.
 */
struct _word_hit_t {
  size_t byte_position;   /* offset of the matching codeword in the
                             compressed body (prevpos - cbody)          */
  size_t word_position;   /* running word counter at the time of the hit;
                             0 means "pattern not seen yet"              */
};
typedef struct _word_hit_t word_hit_t;
00058
00059
/*
 * CGREP_CWMATCH(cw1, len1, cw2, len2)
 *
 * Evaluates to 1 when the two codewords have the same length and identical
 * bytes, 0 otherwise.  cw1/cw2 are pointers to the first byte of each
 * codeword, len1/len2 their lengths in bytes.
 *
 * Uses memcmp() rather than the legacy bcmp(), which was removed from
 * POSIX.1-2008 and is not declared in strict ISO C mode.
 * Note: len1 is evaluated twice, so do not pass an expression with side
 * effects.
 */
#define CGREP_CWMATCH(cw1, len1, cw2, len2)                                  \
  (((len1) == (len2)) &&                                                     \
   (memcmp((const void *) (cw1), (const void *) (cw2), (size_t) (len1)) == 0))
00062
00085 int * CGrep_SearchPattern(int *nres, const char *ctext, size_t ctext_sz,
00086 const char *pattern, char **options) {
00087 int text_len, compressed_body_len;
00088 char *compressed_body;
00089 Console console;
00090 int i = 0, j = 0;
00091 int ntokens;
00092 char *agrepOptions[CGREP_MAX_AGREPOPTIONS];
00093 MyHash_table tokens;
00094 char filter[256];
00095 int *positions;
00096
00097 HDecompress_getBodyAndConsole((char *) ctext, ctext_sz, &text_len,
00098 &compressed_body, &compressed_body_len,
00099 &console);
00100
00101 #ifdef DEBUG
00102 printf("TEST: dc: %d, dl: %d\n", strlen(console.dictionary.content),
00103 console.dictionary.length);
00104 #endif
00105
00106 i = 0;
00107
00108 agrepOptions[i++] = CGREP_AGREP_EXECUTABLE;
00109
00110
00111 j = 0;
00112 while ( options[j] && (i < (CGREP_MAX_AGREPOPTIONS - 2)) )
00113 agrepOptions[i++] = (char *) options[j++];
00114
00115 agrepOptions[i++] = (char *) pattern;
00116 agrepOptions[i] = NULL;
00117
00118
00119 MyHashtable_init(&tokens, CGREP_HT_INITSIZE);
00120 bzero(filter, 256);
00121 ntokens = CGrep_GetMatchingCW(&tokens, filter, agrepOptions, 0, &console);
00122
00123 if ( !ntokens ) {
00124 *nres = 0;
00125 return(NULL);
00126 }
00127
00128
00129
00130
00131 positions = CGrep_GetCWOccurrences(nres, &tokens, filter, compressed_body,
00132 compressed_body_len);
00133 MyHashtable_clear(&tokens);
00134 return(positions);
00135 }
00136
00151 int *
00152 CGrep_SearchSubstring(int *nres, const char *pattern, const char *ctext,
00153 size_t ctext_len, int errors)
00154 {
00155 char *options[CGREP_MAX_AGREPOPTIONS];
00156 char erropt[3+(int) log10(errors+1)];
00157 int i = 0;
00158
00159 if ( errors ) {
00160 sprintf(erropt, "-%d", errors);
00161 options[i++] = erropt;
00162 }
00163 options[i] = NULL;
00164
00165 return(CGrep_SearchPattern(nres, ctext, ctext_len, pattern, options));
00166 }
00167
00182 int *
00183 CGrep_SearchWord(int *nres, const char *word, const char *ctext,
00184 size_t ctext_len, int errors)
00185 {
00186 char *options[CGREP_MAX_AGREPOPTIONS];
00187 char erropt[3+(int) log10(errors+1)];
00188 int i = 0;
00189
00190 options[i++] = "-w";
00191 if ( errors ) {
00192 sprintf(erropt, "-%d", errors);
00193 options[i++] = erropt;
00194 }
00195 options[i] = NULL;
00196
00197 return(CGrep_SearchPattern(nres, ctext, ctext_len, word, options));
00198 }
00199
00200 int CGrep_GetMatchingCWFromFile(const char *fname, MyHash_table *ht,
00201 char *filter, char **options,
00202 int npattern, const Console *c);
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216 int
00217 CGrep_GetSeparators(char *dFileName, MyHash_table *ht, char *filter,
00218 Console *c)
00219 {
00220 char *agrepOptions[CGREP_MAX_AGREPOPTIONS];
00221
00222 int i = 0;
00223 agrepOptions[i++] = CGREP_AGREP_EXECUTABLE;
00224
00225 agrepOptions[i++] = "-v";
00226
00227 agrepOptions[i++] = "^[0-9A-Za-z]";
00228 agrepOptions[i] = NULL;
00229
00230
00231 return CGrep_GetMatchingCWFromFile(dFileName, ht, filter, agrepOptions, -1,
00232 c);
00233 }
00234
00235 void CGrep_WriteDictToFile(char *fname, int len, const Dictionary *d);
00236
00237
00261 proximity_hit_t *
00262 CGrep_SearchProximity(int *nres, const char *ctext, size_t ctext_sz,
00263 int prox_window, char **patterns,
00264 char ***options)
00265 {
00266 int text_len, compressed_body_len;
00267 char *compressed_body;
00268 Console console;
00269 int correction = 0, i = 0, j = 0;
00270 int ntokens;
00271 int npatterns;
00272 int nseparators;
00273 char *agrepOptions[CGREP_MAX_AGREPOPTIONS];
00274 MyHash_table tokens;
00275 MyHash_table sepTable;
00276 Hash_node *newline_node;
00277 char filter[256];
00278 char sepFilter[256];
00279 char dFileName[100];
00280 proximity_hit_t *ret;
00281
00282 HDecompress_getBodyAndConsole((char *) ctext, ctext_sz, &text_len,
00283 &compressed_body, &compressed_body_len,
00284 &console);
00285
00286 MyHashtable_init(&tokens, CGREP_HT_INITSIZE);
00287 bzero(filter, 256);
00288 bzero(sepFilter, 256);
00289
00290 #ifdef DEBUG
00291 printf("TEST: dc: %d, dl: %d\n", strlen(console.dictionary.content),
00292 console.dictionary.length);
00293 #endif
00294
00295
00296 if ( (newline_node = HHashtable_search("\n", 1, &(console.hashtable))) )
00297 HToken_RankFromTaggedCw(newline_node->codeword, newline_node->cw_len,
00298 &(console.canoinfo),
00299 &correction);
00300 else
00301 correction = -1;
00302
00303 CGrep_WriteDictToFile(dFileName, 100, &(console.dictionary));
00304
00305
00306 for ( npatterns = 0; patterns[npatterns]; npatterns++ ) {
00307 i = 0;
00308
00309 agrepOptions[i++] = CGREP_AGREP_EXECUTABLE;
00310
00311
00312 j = 0;
00313 while ( options[npatterns] && options[npatterns][j] &&
00314 (i < CGREP_MAX_AGREPOPTIONS) )
00315 agrepOptions[i++] = options[npatterns][j++];
00316
00317 agrepOptions[i++] = patterns[npatterns];
00318 agrepOptions[i] = NULL;
00319
00320
00321 ntokens = CGrep_GetMatchingCWFromFile(dFileName, &tokens, filter,
00322 agrepOptions, npatterns, &console);
00323 if ( !ntokens ) {
00324 *nres = 0;
00325 return(NULL);
00326 }
00327 }
00328
00329
00330 MyHashtable_init(&sepTable, 10*CGREP_HT_INITSIZE);
00331 nseparators = CGrep_GetSeparators(dFileName, &sepTable, sepFilter, &console);
00332
00333
00334 unlink(dFileName);
00335
00336
00337
00338
00339
00340
00341 if ( *ctext )
00342 ret = CGrep_GetOccurrencesProximitySpaceless(nres, prox_window, npatterns,
00343 &tokens, filter, &sepTable,
00344 sepFilter,
00345 compressed_body,
00346 compressed_body_len);
00347 else
00348
00349 ret = CGrep_GetOccurrencesProximity(nres, prox_window,
00350 npatterns, &tokens,
00351 filter, &sepTable,
00352 sepFilter,
00353 compressed_body,
00354 compressed_body_len,
00355 newline_node);
00356
00357 MyHashtable_clear(&tokens);
00358 MyHashtable_clear(&sepTable);
00359
00360 return(ret);
00361 }
00362
00363
00364
00365 void
00366 CGrep_WriteDictToFile(char *fname, int len, const Dictionary *d)
00367 {
00368 char pids[100];
00369 int attempt = 0;
00370 int tmpfd;
00371 int written = 0;
00372
00373 if ( len < 40 ) {
00374 fputs("Too small a buffer top hold the filename.", stderr);
00375 exit(2);
00376 }
00377
00378 sprintf(pids, "%d", getpid());
00379
00380 do
00381 sprintf(fname, "/tmp/agrep.tmp.%s.%d", pids, ++attempt);
00382 while ( ((tmpfd = open(fname, O_RDWR|O_CREAT|O_EXCL,
00383 S_IREAD|S_IWUSR)) == -1) &&
00384 (errno == EEXIST) );
00385
00386 if ( tmpfd == -1 ) {
00387 perror("open");
00388 exit(2);
00389 }
00390
00391 do {
00392 written = write(tmpfd, d->content + written, d->length - written);
00393 if ( written == -1 ) {
00394 perror("write");
00395 exit(2);
00396 }
00397 } while ( written < d->length );
00398 close(tmpfd);
00399 }
00400
/* Append the NUL-terminated string src to the command line cmd (capacity
   cap bytes, current length *len).  Exits with status 2 instead of writing
   past the end of the buffer. */
static void
CGrep_CmdAppend(char *cmd, size_t cap, size_t *len, const char *src)
{
  size_t n = strlen(src);

  if ( n >= cap - *len ) {
    fputs("CGrep_Lookup: command line too long.\n", stderr);
    exit(2);
  }
  memcpy(cmd + *len, src, n + 1);
  *len += n;
}

/* Append arg as one single-quoted shell word.  A single quote inside arg
   is emitted as '\'' (close quote, escaped quote, reopen quote) so that it
   cannot terminate the quoting. */
static void
CGrep_CmdAppendQuoted(char *cmd, size_t cap, size_t *len, const char *arg)
{
  char one[2] = { '\0', '\0' };

  CGrep_CmdAppend(cmd, cap, len, "'");
  for ( ; *arg; arg++ ) {
    if ( *arg == '\'' )
      CGrep_CmdAppend(cmd, cap, len, "'\\''");
    else {
      one[0] = *arg;
      CGrep_CmdAppend(cmd, cap, len, one);
    }
  }
  CGrep_CmdAppend(cmd, cap, len, "'");
}

/*
 * CGrep_Lookup: run "<options[0]> '<options[1]>' ... <ifname> > <ofname>"
 * through system().
 *
 *   ifname   input file given to the command as its last argument.
 *   ofname   file that receives the command's standard output.
 *   options  NULL-terminated argument vector; options[0] is the command
 *            itself and is passed to the shell verbatim, every further
 *            element is passed as one single-quoted word.
 *
 * ifname and ofname are inserted unquoted, as before; the callers in this
 * file only pass names they generated themselves.
 *
 * Returns 0.  Exit statuses 0 and 1 of the command are both accepted
 * (presumably "match" / "no match"); any other outcome, or a command line
 * that does not fit the internal buffer, terminates the process with exit
 * status 2.
 */
int
CGrep_Lookup(const char*ifname, const char*ofname, char **options)
{
  char cmd[10000];
  size_t len = 0;
  int i;
  int status;

  cmd[0] = '\0';
  CGrep_CmdAppend(cmd, sizeof cmd, &len, options[0]);
  CGrep_CmdAppend(cmd, sizeof cmd, &len, " ");
  for ( i = 1; options[i]; i++ ) {
    CGrep_CmdAppendQuoted(cmd, sizeof cmd, &len, options[i]);
    CGrep_CmdAppend(cmd, sizeof cmd, &len, " ");
  }

  CGrep_CmdAppend(cmd, sizeof cmd, &len, ifname);
  CGrep_CmdAppend(cmd, sizeof cmd, &len, " > ");
  CGrep_CmdAppend(cmd, sizeof cmd, &len, ofname);

#ifdef DEBUG
  puts(cmd);
#endif
  status = system(cmd);
  if ( status == -1 ) {
    perror("system");
    exit(2);
  }
  if ( !WIFEXITED(status) ) {
    fprintf(stderr, "command \"%s\" failed.\n", cmd);
    exit(2);
  }
  switch ( WEXITSTATUS(status) ) {
  case 0:
  case 1:
    break;
  default:
    fprintf(stderr, "system(\"%s\") exited with status %d\n",
            cmd, WEXITSTATUS(status));
    exit(2);
  }

  return(0);
}
00445
00446
00447 int CGrep_GetMatchingCWFromFile(const char *fname, MyHash_table *ht,
00448 char *filter, char **options,
00449 int npattern, const Console *c)
00450 {
00451 int i = 0;
00452 char pids[100] = "";
00453 FILE *istream;
00454 char obufs[1024];
00455 int attempt = 0;
00456 char *input_buf, *runner, *buf_end;
00457 size_t input_len;
00458 struct stat sbuf;
00459 int ntok = 0;
00460 int newLineSeen = 0;
00461
00462 sprintf(pids, "%d", getpid());
00463 sprintf(obufs, "/tmp/agrep.out.%s.%d", pids, attempt);
00464
00465 CGrep_Lookup(fname, obufs, options);
00466
00467 stat(obufs, &sbuf);
00468 if ( (istream = fopen(obufs, "r")) == NULL ) {
00469 perror("fopen");
00470 exit(2);
00471 }
00472 input_len = sbuf.st_size;
00473
00474 if ( input_len ) {
00475 input_buf = (char *) mmap(NULL,input_len,PROT_READ,MAP_SHARED,
00476 fileno(istream),0);
00477 if ( (input_buf == MAP_FAILED) ) {
00478 perror("mmap");
00479 exit(2);
00480 }
00481 if ( input_buf == NULL ) {
00482 fprintf(stderr, "mmap returned NULL (but the filesize was >0)?!\n");
00483 exit(2);
00484 }
00485
00486
00487 buf_end = input_buf + input_len - 1;
00488 runner = input_buf;
00489 ntok = 0;
00490 do {
00491 char *end = runner;
00492 Hash_node *t;
00493 int hashcw;
00494 while ( *end != '\n' ) end++;
00495 if ( (end == runner) && !newLineSeen ) {
00496 while ( *(++end) == '\n' );
00497 end -= 1;
00498 if ( end - runner != 1 ) {
00499 fprintf(stderr,
00500 "Error: messed up dictionary? (incoherent '\\n', line %d)\n",
00501 ntok);
00502 exit(2);
00503 }
00504
00505 } else if ( end == runner ) {
00506 fprintf(stderr,
00507 "Error: messed up dictionary? (incoherent '\\n', line %d)\n",
00508 ntok);
00509 exit(2);
00510 }
00511 ntok++;
00512 t = HHashtable_search(runner, (end - runner),
00513 (HHash_table *) &(c->hashtable));
00514 if ( !t ) {
00515 char wrongToken[end-runner+1];
00516 memcpy(wrongToken, runner, (end-runner));
00517 wrongToken[end-runner] = '\0';
00518 fprintf(stderr,
00519 "Error: messed up dictionary? (token \"%s\" not found)\n",
00520 wrongToken);
00521 exit(2);
00522 }
00523 #ifndef WORDS_BIGENDIAN
00524
00525 for ( i = 0; i < t->cw_len; i++ )
00526 *((char *) &hashcw + i) = *((char *) &t->codeword + t->cw_len - i - 1);
00527 #endif
00528
00529 if ( filter )
00530 filter[*((unsigned char *) &hashcw)] = 1;
00531
00532 MyHashtable_insert((char *) &hashcw, t->cw_len, npattern, ht
00533 #ifdef DEBUG
00534 , runner, (end - runner)
00535 #endif
00536 );
00537
00538 runner = end + 1;
00539 } while ( runner < buf_end );
00540 munmap(input_buf, input_len);
00541 }
00542 fclose(istream);
00543 #ifndef DEBUG
00544 unlink(obufs);
00545 #else
00546 printf("N. results: %d\n", ntok);
00547 #endif
00548
00549 return(ntok);
00550 }
00551
00579 int
00580 CGrep_GetMatchingCW(MyHash_table *ht, char *filter, char **options,
00581 int npattern, const Console *c)
00582 {
00583 char fname[100];
00584 int ret;
00585
00586 CGrep_WriteDictToFile(fname, 100, &(c->dictionary));
00587 ret = CGrep_GetMatchingCWFromFile(fname, ht, filter, options, npattern, c);
00588 unlink(fname);
00589 return(ret);
00590 }
00591
00606 int *
00607 CGrep_GetCWOccurrences(int *nocc, const MyHash_table *ht, const char *filter,
00608 const char *cbody, size_t cbody_len)
00609 {
00610 int *position;
00611 const char *prevpos, *currpos = cbody;
00612 int currentResSize = 0;
00613 const char *endpos = cbody+cbody_len;
00614 register char v;
00615
00616 currentResSize = CGREP_MAX(*nocc,CGREP_INIT_VECSIZE);
00617 position = (int *) malloc(currentResSize * sizeof(int));
00618
00619 *nocc = 0;
00620 while ( currpos < endpos ) {
00621 prevpos = currpos;
00622 v = *(*((unsigned char *) prevpos) + filter);
00623
00624 do {
00625 currpos++;
00626 } while ( (currpos<endpos) && ((*currpos & 0x80) == 0) );
00627
00628 if ( v ) {
00629 if ( MyHashtable_search(prevpos, currpos-prevpos, ht) ) {
00630
00631
00632 if ( *nocc >= currentResSize ) {
00633
00634 currentResSize += (int) sqrt(currentResSize) + 1;
00635 position = (int *) realloc(position, currentResSize * sizeof(int));
00636 }
00637
00638 position[(*nocc)++] = (int) (prevpos-cbody);
00639 }
00640 }
00641 }
00642 return(position);
00643 }
00644
00660 const char *
00661 CGrep_GetNextCWOccurrence(int *len, MyHash_table *ht, const char *filter,
00662 const char *cbody, size_t remaining)
00663 {
00664 const char *endpos = cbody+remaining;
00665 const char *prevpos;
00666 register char v;
00667
00668 while ( cbody < endpos ) {
00669 prevpos = cbody;
00670 v = *(*((unsigned char *) prevpos) + filter);
00671
00672 do {
00673 cbody++;
00674 } while ( (cbody<endpos) && ((*cbody & 0x80) == 0) );
00675
00676 if ( v ) {
00677 if ( MyHashtable_search(prevpos, cbody-prevpos, ht) ) {
00678
00679 *len = cbody-prevpos;
00680 return(prevpos);
00681 }
00682 }
00683 }
00684 return(NULL);
00685 }
00686
00687
00688
/*
 * CGREP_GET_NEXT_CW()
 *
 * Fetch the next codeword from the compressed body.  Operates on the
 * variables prevpos, currpos, endpos and cw_len of the enclosing function:
 * prevpos is set to the start of the codeword (the old currpos), currpos is
 * advanced to the next byte with the high bit set (or to endpos), and
 * cw_len receives the length of the codeword.
 *
 * Wrapped in do { } while (0) so that "CGREP_GET_NEXT_CW();" is a single
 * statement and can be used safely in an unbraced if/else.
 */
#define CGREP_GET_NEXT_CW() do {                                  \
    prevpos = currpos;                                            \
    do {                                                          \
      currpos++;                                                  \
    } while ( (currpos<endpos) && ((*currpos & 0x80) == 0) );     \
    cw_len = currpos - prevpos;                                   \
  } while ( 0 )
00696
00697
00698
00699
/*
 * CGREP_CHECKIFSEPARATOR(cw, cw_len, filter, ht)
 *
 * Non-zero when the codeword cw (cw_len bytes) is contained in the table
 * ht.  The 256-entry array filter is consulted first: the table is only
 * searched when filter[first byte of cw] is non-zero.
 * Every argument is parenthesised so that expressions can be passed; cw is
 * evaluated twice.
 */
#define CGREP_CHECKIFSEPARATOR(cw, cw_len, filter, ht)                  \
  (((filter)[*((unsigned char *) (cw))]) &&                             \
   MyHashtable_search((cw), (cw_len), (ht)))
00702
00733 proximity_hit_t *
00734 CGrep_GetOccurrencesProximitySpaceless(int *nocc, int prox_window,
00735 int npatterns,
00736 const MyHash_table *ht,
00737 const char filter[],
00738 const MyHash_table *separators,
00739 const char sepFilter[],
00740 const char *cbody, size_t cbody_len)
00741 {
00742 proximity_hit_t *matches;
00743 word_hit_t patternHit[npatterns];
00744 int i, k;
00745 int patternIndexWithMinPosition;
00746 size_t minWPosition, minBytePosition;
00747 size_t currWPosition = 0;
00748 const char *prevpos, *currpos = cbody;
00749 int currentResSize = 0;
00750 const char *endpos = cbody+cbody_len;
00751 #ifdef DEBUG
00752 int nseps = 0;
00753 #endif
00754 int cw_len = 0;
00755 MyHash_node *hn;
00756
00757 currentResSize = CGREP_MAX(*nocc,CGREP_INIT_VECSIZE);
00758 matches = (proximity_hit_t *) malloc(currentResSize *
00759 sizeof(proximity_hit_t));
00760
00761 for (i = 0; i < npatterns; i++ ) {
00762 patternHit[i].byte_position = 0;
00763 patternHit[i].word_position = 0;
00764 }
00765
00766 *nocc = 0;
00767 patternIndexWithMinPosition = 0;
00768 minWPosition = 0;
00769 minBytePosition = 0;
00770
00771 while ( currpos < endpos ) {
00772 CGREP_GET_NEXT_CW();
00773
00774
00775 if ( sepFilter[*(unsigned char *) prevpos] ) {
00776 if ( MyHashtable_search(prevpos, cw_len, separators) ) {
00777
00778 #ifdef DEBUG
00779 nseps++;
00780 #endif
00781 continue;
00782 }
00783 }
00784
00785 currWPosition++;
00786
00787 hn = NULL;
00788 while ( (hn = CGrep_CheckIfIsPattern(prevpos, cw_len,
00789 &(ht[0]), hn, filter)) != NULL ) {
00790 i = hn->npattern;
00791
00792 #ifdef DEBUG
00793 printf("Found a hit for pattern %d, position %d\n", i, currWPosition);
00794 #endif
00795 patternHit[i].word_position = currWPosition;
00796 patternHit[i].byte_position = (int) (prevpos-cbody);
00797 if ( patternIndexWithMinPosition == i ) {
00798
00799 int j = 0;
00800
00801 minWPosition = currWPosition;
00802 for ( j = 0; j < npatterns; j++ ) {
00803 if ( minWPosition >= patternHit[j].word_position ) {
00804 patternIndexWithMinPosition = j;
00805 minWPosition = patternHit[j].word_position;
00806 minBytePosition = patternHit[j].byte_position;
00807 }
00808 }
00809 }
00810
00811
00812
00813 if ( (minWPosition > 0) &&
00814 ((currWPosition - minWPosition) <= (size_t) prox_window) ) {
00815
00816 #ifdef DEBUG
00817 printf("...and a proximity hit!\n");
00818 #endif
00819 if ( *nocc >= currentResSize ) {
00820
00821 currentResSize += (int) sqrt(currentResSize) + 1;
00822 matches = (proximity_hit_t *) realloc(matches,
00823 currentResSize *
00824 sizeof(proximity_hit_t));
00825 }
00826
00827 matches[*nocc].byte_position = minBytePosition;
00828 matches[*nocc].start_position = minWPosition;
00829 matches[*nocc].end_position = currWPosition;
00830 for ( k = 0; k < npatterns; k++ ) {
00831 matches[*nocc].positions[k] = patternHit[k].byte_position;
00832 matches[*nocc].ranks[k] = patternHit[k].word_position;
00833 }
00834 (*nocc)++;
00835 }
00836 }
00837 }
00838
00839
00840 if ( !(*nocc) ) {
00841 free(matches);
00842 matches = NULL;
00843 }
00844
00845 return(matches);
00846 }
00847
00848
00849
00850
/* Scanner states of CGrep_GetOccurrencesProximity(); each names the kind of
   codeword the scanner has just classified. */
enum state {
  FOUND_WORD,
  FOUND_NEWLINE,
  FOUND_SEPARATOR
};
typedef enum state state_t;
00852
00860 proximity_hit_t *
00861 CGrep_GetOccurrencesProximity(int *nocc, int prox_window,
00862 int npatterns, const MyHash_table *ht,
00863 const char filter[],
00864 const MyHash_table *separators,
00865 const char *sepFilter,
00866 const char *cbody, size_t cbody_len,
00867 const Hash_node *nl)
00868 {
00869 proximity_hit_t *matches;
00870 word_hit_t patternHit[npatterns];
00871 int i;
00872 int patternIndexWithMinPosition;
00873 size_t minWPosition, minBytePosition;
00874 size_t currWPosition = 0;
00875 const char *prevpos, *currpos = cbody;
00876 int currentResSize = 0;
00877 const char *endpos = cbody+cbody_len;
00878 int cw_len = 0;
00879 int newline_cw = 0;
00880 int newline_cw_len;
00881 state_t state;
00882 MyHash_node *hn;
00883 int k;
00884
00885 if ( !nl )
00886 newline_cw_len = 0;
00887 else {
00888 newline_cw = nl->codeword;
00889 newline_cw_len = nl->cw_len;
00890 }
00891
00892 currentResSize = CGREP_MAX(*nocc,CGREP_INIT_VECSIZE);
00893 matches = (proximity_hit_t *) malloc(currentResSize *
00894 sizeof(proximity_hit_t));
00895
00896 for (i = 0; i < npatterns; i++ ) {
00897 patternHit[i].byte_position = 0;
00898 patternHit[i].word_position = 0;
00899 }
00900
00901 *nocc = 0;
00902 patternIndexWithMinPosition = 0;
00903 minWPosition = 0;
00904 minBytePosition = 0;
00905
00906 CGREP_GET_NEXT_CW();
00907
00908 state = FOUND_NEWLINE;
00909
00910 do {
00911 switch ( state ) {
00912
00913 case FOUND_SEPARATOR:
00914
00915 if ( CGREP_CWMATCH(prevpos, cw_len, &newline_cw, newline_cw_len) ) {
00916 state = FOUND_NEWLINE;
00917 CGREP_GET_NEXT_CW();
00918 break;
00919 }
00920 state=FOUND_WORD;
00921
00922
00923 case FOUND_WORD:
00924 currWPosition++;
00925
00926 hn = NULL;
00927
00928 while ( (hn = CGrep_CheckIfIsPattern(prevpos, cw_len,
00929 ht, hn, filter)) != NULL ) {
00930 i = hn->npattern;
00931 patternHit[i].word_position = currWPosition;
00932 patternHit[i].byte_position = (int) (prevpos-cbody);
00933 if ( patternIndexWithMinPosition == i ) {
00934
00935 int j = 0;
00936
00937 minWPosition = currWPosition;
00938 for ( j = 0; j < npatterns; j++ ) {
00939 if ( minWPosition >= patternHit[j].word_position ) {
00940 patternIndexWithMinPosition = j;
00941 minWPosition = patternHit[j].word_position;
00942 minBytePosition = patternHit[j].byte_position;
00943 }
00944 }
00945 }
00946
00947
00948
00949 if ( (minWPosition > 0) &&
00950 ((currWPosition - minWPosition) <= (size_t) prox_window) ) {
00951
00952 if ( *nocc >= currentResSize ) {
00953
00954 currentResSize += (int) sqrt(currentResSize) + 1;
00955 matches = (proximity_hit_t *) realloc(matches,
00956 currentResSize *
00957 sizeof(proximity_hit_t));
00958 }
00959
00960 matches[*nocc].byte_position = minBytePosition;
00961 matches[*nocc].start_position = minWPosition;
00962 matches[*nocc].end_position = currWPosition;
00963 for ( k = 0; k < npatterns; k++ ) {
00964 matches[*nocc].positions[k] = patternHit[k].byte_position;
00965 matches[*nocc].ranks[k] = patternHit[k].word_position;
00966 }
00967 (*nocc)++;
00968 }
00969 }
00970
00971 CGREP_GET_NEXT_CW();
00972 if ( CGREP_CWMATCH(prevpos, cw_len, &newline_cw, newline_cw_len) ) {
00973 state = FOUND_NEWLINE;
00974 CGREP_GET_NEXT_CW();
00975 break;
00976 }
00977 state = FOUND_SEPARATOR;
00978 CGREP_GET_NEXT_CW();
00979 break;
00980
00981 case FOUND_NEWLINE:
00982
00983 if ( CGREP_CWMATCH(prevpos, cw_len, &newline_cw, newline_cw_len) ) {
00984 state = FOUND_NEWLINE;
00985 CGREP_GET_NEXT_CW();
00986 break;
00987 }
00988
00989 if ( CGREP_CHECKIFSEPARATOR(prevpos, cw_len, sepFilter, separators) ) {
00990 state=FOUND_SEPARATOR;
00991 CGREP_GET_NEXT_CW();
00992 break;
00993 }
00994
00995 state=FOUND_WORD;
00996 break;
00997 }
00998 } while ( (currpos < endpos) );
00999
01000 if ( state == FOUND_WORD ) {
01001
01002 }
01003
01004
01005
01006 if ( !(*nocc) ) {
01007 free(matches);
01008 matches = NULL;
01009 }
01010
01011 return(matches);
01012 }
01013
01032 MyHash_node *
01033 CGrep_CheckIfIsPattern(const char *cbody, int cw_len, const MyHash_table *ht,
01034 MyHash_node *hn, const char filter[])
01035 {
01036 if ( !hn ) {
01037 if ( filter[*(unsigned char *) cbody] ) {
01038
01039 hn = ht->table[MyHashtable_func(cbody,cw_len,ht)];
01040 while ( hn &&
01041 !CGREP_CWMATCH(cbody, cw_len, hn->str, hn->len_str) )
01042 hn = hn->next;
01043 } else
01044 return(NULL);
01045 } else {
01046 do {
01047 hn = hn->next;
01048 } while ( hn &&
01049 !CGREP_CWMATCH(cbody, cw_len, hn->str, hn->len_str) );
01050 }
01051 return(hn);
01052 }
01053
/*
 * CGrep_escapeStringConfigurable: return a copy of the `len` bytes at `s`
 * in which every byte that is smaller than `min`, greater than `max`, or
 * contained in `exceptions` is replaced by "[\<hex>]", <hex> being the
 * value of the byte (0..ff) in lower-case hexadecimal without padding.
 *
 *   s           input bytes (need not be NUL-terminated).
 *   len         number of bytes to process.
 *   min, max    range of bytes that are copied unchanged.
 *   exceptions  NUL-terminated list of bytes that are always escaped.  A
 *               NUL byte in the input is always escaped too, because
 *               strchr() finds the terminator of exceptions.
 *
 * Returns a malloc()ed, NUL-terminated string that the caller must free().
 * Running out of memory terminates the process with exit status 2.
 */
char *CGrep_escapeStringConfigurable(const char *s, size_t len, char min,
                                     char max, const char *exceptions)
{
  /* An escape is at most 5 characters ("[\ff]"), so 5*len+1 bytes always
     suffice and the buffer never has to grow. */
  size_t sz;
  char *ret;
  const char *end = s+len;
  char *curr;

  if ( len > ((size_t) -1 - 1) / 5 ) {
    fputs("CGrep_escapeStringConfigurable: string too long.\n", stderr);
    exit(2);
  }
  sz = 5 * len + 1;
  ret = (char *) malloc(sz);
  if ( !ret ) {
    perror("malloc");
    exit(2);
  }
  curr = ret;

  while ( s < end ) {
    if ( (*s < min) || (*s > max) || strchr(exceptions, *s) ) {
      /* Convert through unsigned char: a negative char would be sign
         extended and printed as "ffffffxx", overflowing the buffer. */
      curr += sprintf(curr, "[\\%x]", (unsigned int) (unsigned char) *s);
    } else
      *curr++ = *s;
    s++;
  }
  *curr = 0;
  return(ret);
}
01101
01106 char *CGrep_escapeString(const char *s, size_t len)
01107 {
01108 return(CGrep_escapeStringConfigurable(s, len, CGREP_MIN_PRINTABLE_CHAR,
01109 CGREP_MAX_PRINTABLE_CHAR,
01110 CGREP_NONPRINTABLE_CHARS));
01111 }
01112
01113
01114
01115
01116
01117
01124 void MyHashtable_init(MyHash_table *ht, int n)
01125 {
01126 int i;
01127 ht->size = n * 10;
01128 ht->table = (MyHash_nodeptr_array) malloc(ht->size * sizeof(MyHash_node*));
01129 ht->card = 0;
01130 if (ht->table == NULL) {
01131 fprintf(stderr,"Fatal Error: MyHash table allocation\n");
01132 exit(2);
01133 }
01134
01135
01136
01137 for ( i = 0; i < ht->size; ht->table[i++] = NULL );
01138 }
01139
01145 void MyHashtable_clear(MyHash_table *ht)
01146 {
01147 int i;
01148
01149 for ( i = 0; i < ht->size; i++ ) {
01150 MyHash_node *hn = ht->table[i];
01151 while ( hn ) {
01152 MyHash_node *toFree = hn;
01153 hn = hn->next;
01154 free(toFree);
01155 }
01156 }
01157 ht->card = 0;
01158 ht->size = 0;
01159 free(ht->table);
01160 }
01161
01167 int MyHashtable_func(const char *s, int len, const MyHash_table *ht)
01168 {
01169 register int hfn;
01170 int hfi;
01171 int table_size = ht->size;
01172
01173 hfn = 11;
01174 for (hfi=0; hfi<len ; hfi++)
01175 hfn = hfn ^ ((hfn<<5) + (hfn>>2) + (unsigned char) *s++);
01176 hfn = abs(hfn % table_size);
01177 return(hfn);
01178 }
01179
01186 MyHash_node *MyHashtable_search(const char *s, int slen,
01187 const MyHash_table *ht)
01188 {
01189 MyHash_node *ghsp;
01190
01191 ghsp = ht->table[MyHashtable_func(s,slen,ht)];
01192
01193 while ( ghsp ) {
01194 if ( CGREP_CWMATCH(s, slen, ghsp->str, ghsp->len_str) )
01195 return(ghsp);
01196 ghsp = ghsp->next;
01197 }
01198 return(NULL);
01199 }
01200
01211 int MyHashtable_insert(const char *s, int slen, int npattern, MyHash_table *ht
01212 #ifdef DEBUG
01213 , char *word, int len
01214 #endif
01215 )
01216
01217 {
01218 int hiv;
01219 int i;
01220 MyHash_node *hip;
01221 MyHash_node *table_head;
01222 MyHash_node *hip_pred;
01223
01224
01225 hiv = MyHashtable_func(s,slen,ht);
01226 hip = MyHashtable_search(s,slen,ht);
01227
01228 if (hip) {
01229
01230
01231 if (ht->table[hiv] != hip){
01232
01233
01234
01235 for (hip_pred = ht->table[hiv]; hip_pred->next != hip;
01236 hip_pred = hip_pred->next) ;
01237
01238 hip_pred->next = hip->next;
01239 table_head = ht->table[hiv];
01240 ht->table[hiv] = hip;
01241 hip->next = table_head;
01242 }
01243
01244 return(0);
01245
01246 } else {
01247 hip = (MyHash_node *) malloc(sizeof(MyHash_node));
01248 if (hip == NULL) {
01249 fprintf(stderr,"Error: Insert hash table\n");
01250 exit(2);
01251 }
01252
01253 hip->len_str = slen;
01254 for ( i = 0; i < slen; i++ )
01255 hip->str[i] = s[i];
01256 hip->npattern = npattern;
01257 hip->next = ht->table[hiv];
01258 #ifdef DEBUG
01259 strncpy(hip->word, word, len);
01260 hip->word[len] = 0;
01261 #endif
01262 ht->table[hiv] = hip;
01263 ht->card += 1;
01264 return(1);
01265 }
01266 }
01267