Main Page   Alphabetical List   Compound List   File List   Compound Members   File Members  

cgrep.c

Go to the documentation of this file.
00001 
00015 #include "CGrepLib.h"
00016 #include <fcntl.h>
00017 
/*
 * Command-line option state, global.  Every variable below is written by
 * parseCGrepOptions() and read by the search/output code.
 */
int count = 0;               /* -c: print the number of matches only       */
int window = 5;              /* -w: context words around a match           */
int escape = 0;              /* -x: escape non-printable characters        */
int proximity = 0;           /* -p: proximity search requested             */
int prox_window = 0;         /* -p: width of the proximity window          */
int lookup = 0;              /* -l: dictionary lookup only                 */
int highlight = 0;           /* -b / -m: mark the matching words           */
char *preHighlight = NULL;   /* text emitted before a highlighted word     */
char *postHighlight = NULL;  /* text emitted after a highlighted word      */
char *separator = NULL;      /* -s: line printed between two results       */
/* option state ends */
00030 
00031 
/*
 * Dispatch to HDecompress_previousBlock_tokens_spaceless() when `a` is
 * non-zero and to HDecompress_previousBlock_tokens() otherwise, forwarding
 * the remaining seven arguments unchanged.  Callers in this file pass the
 * first byte of the mapped input as `a`.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: it can be followed by ';' and used as the unbraced body
 * of an if/else without capturing a neighbouring `else`.
 */
#define DECOMPRESS_PREVIOUS(a, b, c, d, e, f, g, h) \
  do { \
    if ( (a) ) \
      HDecompress_previousBlock_tokens_spaceless((b), (c), (d), (e), (f), (g), (h)); \
    else \
      HDecompress_previousBlock_tokens((b), (c), (d), (e), (f), (g), (h)); \
  } while (0)
00037 
/*
 * Dispatch to HDecompress_nextBlock_tokens_spaceless() when `a` is non-zero
 * and to HDecompress_nextBlock_tokens() otherwise, forwarding the remaining
 * seven arguments unchanged.  Callers in this file pass the first byte of
 * the mapped input as `a`.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: it can be followed by ';' and used as the unbraced body
 * of an if/else without capturing a neighbouring `else`.
 */
#define DECOMPRESS_NEXT(a, b, c, d, e, f, g, h) \
  do { \
    if ( (a) ) \
      HDecompress_nextBlock_tokens_spaceless((b), (c), (d), (e), (f), (g), (h)); \
    else \
      HDecompress_nextBlock_tokens((b), (c), (d), (e), (f), (g), (h)); \
  } while (0)
00043 
00044 
/*
 * Print the command-line synopsis and the list of cgrep options on stdout.
 *
 * exname: program name shown in the synopsis line (callers pass argv[0]).
 */
void usage(char *exname)
{
  /* Fixed part of the help text, one entry per output line. */
  static const char *const helpLines[] = {
    "Search within filename (huffword-compressed file) for matches.",
    "CGREP OPTIONS:",
    "  -b  : highlight matching words in bold",
    "  -c  : count matches",
    "  -l  : lookup in the dictionary only (cannot be used with -w or -p)",
    "  -m pre post: prepend 'pre' and append 'post' to the matching words",
    "  -p n: search patterns within a proximity window of width n (n>0)",
    "  -s sep: output in format: match1\\nsep\\nmatch2... (not with -l)",
    "  -w n: return n words on left/right of the match (default 5, n>=0)",
    "  -x  : escape non-printable chars",
    "See 'man agrep' for agrep options.",
    "Options -n, -c, -s, -t, -G, -l to agrep are NOT allowed."
  };
  size_t line;

  printf("usage: %s [cgrep options] [--] [[agrep options] pattern]+ fname\n",
         exname);
  for ( line = 0; line < sizeof helpLines / sizeof helpLines[0]; line++ )
    puts(helpLines[line]);
}
00062 
/*
 * Map a file read-only into memory.
 *
 * input_sz:   out parameter, receives the size of the file in bytes.
 * fname:      path of the file to map.
 * canBeEmpty: when zero, an empty file is a fatal error; when non-zero, an
 *             empty file yields a NULL return with *input_sz set to 0.
 *
 * Returns the address of the mapping (release it with munmap), or NULL for
 * an empty file when canBeEmpty is set.  Any failure to open, stat or map
 * the file prints a diagnostic on stderr and terminates with exit(2).
 */
char *
mapFile(size_t *input_sz, const char *fname, int canBeEmpty)
{
  struct stat sbuf;
  char *input_buf;
  int fd;

  if ( (fd = open(fname, O_RDONLY)) == -1 ) {
    fprintf(stderr, "cannot open %s, exiting.\n", fname);
    exit(2);
  }
  /* Size the descriptor we actually opened, not whatever the path names
     at some other moment, and do not trust an unchecked stat buffer. */
  if ( fstat(fd, &sbuf) == -1 ) {
    perror("fstat");
    exit(2);
  }
  *input_sz = (size_t) sbuf.st_size;

  if ( *input_sz == 0 ) {
    /* A zero-length mmap fails, so handle the empty file explicitly. */
    close(fd);
    if ( !canBeEmpty ) {
      fprintf(stderr,"Fatal Error: Input file empty\n");
      exit(2);
    }
    return(NULL);
  }

  input_buf = (char *) mmap(NULL, *input_sz, PROT_READ, MAP_SHARED, fd, 0);
  if ( input_buf == MAP_FAILED ) {
    perror("mmap");
    exit(2);
  }
  /* The mapping stays valid after the descriptor is closed. */
  close(fd);
  return(input_buf);
}
00091 
00092 int
00093 parseCGrepOptions(int noptions, char **options)
00094 {
00095   int i = 1;
00096   for (; i < noptions; i++) {
00097     if ( *options[i] != '-' ) // not an option
00098       return(i);
00099     switch (options[i][1]) {
00100     case 'c':
00101       count = 1; break;
00102     case 'b':
00103       highlight = 1;
00104       preHighlight = "";
00105       postHighlight = "";
00106       break;
00107     case 'm':
00108       highlight = 1;
00109       if ( (options[i+1] && options[i+2]) && (noptions > i+3) ) {
00110         preHighlight = options[i+1];
00111         postHighlight = options[i+2];
00112         i += 2;
00113       } else {
00114         usage(options[0]);
00115         fprintf(stderr, "Error: '-m' requires two arguments\n");
00116         exit(2);
00117       }
00118       break;
00119     case 'x':
00120       escape = 1; break;
00121     case 'w':
00122       if ( options[i][2] )
00123         window = atoi(options[i]+2);
00124       else if ( i < noptions - 1 )
00125         window = atoi(options[++i]);
00126       else {
00127         usage(options[0]);
00128         fprintf(stderr, "Error: '-w' requires an argument\n");
00129         exit(2);
00130       }
00131       if ( window < 0 ) {
00132         usage(options[0]);
00133         fprintf(stderr, "Error: invalid value for '-w' (%d)\n", window);
00134         exit(2);
00135       }
00136       break;
00137     case 'l':
00138       lookup = 1;
00139       if ( proximity ) {
00140         usage(options[0]);
00141         fprintf(stderr, "You cannot perform a lookup by proximity\n");
00142         exit(2);
00143       }
00144       break;
00145     case 's':
00146       separator = options[++i];
00147       break;
00148     case 'p':
00149       proximity = 1;
00150       if ( lookup ) {
00151         usage(options[0]);
00152         fprintf(stderr, "You cannot perform a lookup by proximity\n");
00153         exit(2);
00154       }
00155       if ( options[i][2] )
00156         prox_window = atoi(options[i]+2);
00157       else if ( i < noptions - 1 )
00158         prox_window = atoi(options[++i]);
00159       else {
00160         usage(options[0]);
00161         fprintf(stderr, "Error: '-p' requires the width of the proximity "
00162                 "window\n");
00163         exit(2);
00164       }
00165       if ( !prox_window ) {
00166         usage(options[0]);
00167         fprintf(stderr, "Error: invalid width (0) for the proximity window\n");
00168         exit(2);
00169       }
00170       break;
00171     case '-':
00172       if ( !options[i][2] )
00173         return(i+1);
00174     default:
00175       usage(options[0]);
00176       fprintf(stderr, "Error: unrecognized option %s\n", options[i]);
00177       exit(2);
00178     }
00179   }
00180   usage(options[0]);
00181   exit(2);
00182   return(0); // just to avoid compiler warning
00183 }
00184 
00185 void CGrep_Lookup(const char *inf, const char *outf, char **options);
00186 
00187 int do_lookup(char *input_buf, size_t input_sz, char *pattern,
00188               char **options)
00189 {
00190   char *myOptions[CGREP_MAX_AGREPOPTIONS];
00191   int i, j;
00192   char ifname[100];
00193   char ofname[100];
00194   char pids[10];
00195   char *buf;
00196   int len;
00197   int written = 0;
00198   int total = 0;
00199   int attempt = 0;
00200   int tmpfd;
00201   char *body;
00202   int blen;
00203   int tlen;
00204   Console console;
00205   Dictionary *d;
00206 
00207   sprintf(pids, "%d", getpid());
00208 
00209   HDecompress_getBodyAndConsole(input_buf, input_sz, &tlen, &body, &blen,
00210                                 &console);
00211   
00212   d = &(console.dictionary);
00213 
00214   do
00215     sprintf(ifname, "/tmp/agrep.dict.%s.%d", pids, ++attempt);
00216   while ( ((tmpfd = open(ifname, O_RDWR|O_CREAT|O_EXCL,
00217                          S_IREAD|S_IWUSR)) == -1) &&
00218           (errno == EEXIST) );
00219     
00220   if ( tmpfd == -1 ) {
00221     perror("open");
00222     exit(2);
00223   }
00224 
00225   do {
00226     written =  write(tmpfd, d->content + written, d->length - written);
00227     if ( written == -1 ) {
00228       perror("write");
00229       exit(2);
00230     }
00231   } while ( written < d->length );
00232   close(tmpfd);
00233 
00234 
00235   sprintf(ofname, "/tmp/agrep.match.%s.%d", pids, attempt);
00236 
00237   myOptions[0] = CGREP_AGREP_EXECUTABLE;
00238   i = 1; j = 0;
00239 
00240   for ( ; options[j] && i < CGREP_MAX_AGREPOPTIONS; )
00241     myOptions[i++] = options[j++];
00242   myOptions[i++] = pattern;
00243   myOptions[i] = NULL;
00244 
00245   CGrep_Lookup(ifname, ofname, myOptions);
00246 
00247   unlink(ifname);
00248 
00249   buf = mapFile(&len, ofname, 1);
00250 
00251   written = 0;
00252   
00253   if ( count ) {
00254     if ( !buf ) puts("0");
00255     else {
00256       char *run = buf;
00257       int cnt = 0;
00258       while ( run < buf+len ) {
00259         if ( *run == '\n' ) cnt++;
00260         if ( (run > buf) && (run < buf+len-1) &&
00261              ((*run) == '\n') && (*(run-1) == '\n') &&
00262              (*(run+1) == '\n') ) cnt--;
00263         run++;
00264       }
00265       printf("%d\n", cnt);
00266     }
00267   } else if ( buf )
00268     while ( (total + (written = write(1, buf+total, len-total))) < len )
00269       total += written;
00270 
00271   if ( buf ) {
00272     munmap(buf, len);
00273     unlink(ofname);
00274     return(0);
00275   }
00276   unlink(ofname);
00277   return(1);
00278 }
00279 
00280 int isMatchingToken(int tr, proximity_hit_t ph, int npatterns)
00281 {
00282   int i = 0;
00283   for ( ; i < npatterns; i++ )
00284     if ( ph.ranks[i] == tr ) return 1;
00285   return 0;
00286 }
00287 
00288 int main(int argc, char **argv)
00289 {
00290   size_t input_sz;
00291   char *input_buf;
00292   int beginningAgrepOptions = -1;
00293   char ***options = (char ***) malloc(sizeof(char **) * argc);
00294   char **patterns = (char **) malloc(sizeof(char *) * argc);
00295   int i = 0;
00296   int npatterns;
00297   int *positions;
00298   int nresults = 0;
00299   proximity_hit_t *hits;
00300 
00301   if ( argc < 3 ) { /* at least a pattern and a file */
00302     usage(argv[0]);
00303     exit(2);
00304   }
00305 
00306   input_buf = mapFile(&input_sz, argv[argc-1], 0);
00307 
00308   beginningAgrepOptions = parseCGrepOptions(argc-1, argv);
00309 
00310   npatterns = 0;
00311   do {
00312     i = 0;
00313     options[npatterns] = (char **) malloc(sizeof(char *) * argc);
00314     while ( (*argv[beginningAgrepOptions] == '-') &&
00315             (beginningAgrepOptions < argc - 1) ) {
00316       if ( !strcmp("-n", argv[beginningAgrepOptions]) ||
00317            !strcmp("-c", argv[beginningAgrepOptions]) ) {
00318         usage(argv[0]);
00319         fprintf(stderr,
00320                 "Error: option '%s' to agrep is not allowed\n",
00321                 argv[beginningAgrepOptions]);
00322         exit(2);
00323       }
00324       options[npatterns][i++] = argv[beginningAgrepOptions++];
00325     }
00326     options[npatterns][i] = NULL;
00327     
00328     while ( (*argv[beginningAgrepOptions] != '-') &&
00329             beginningAgrepOptions < argc - 1 ) {
00330       patterns[npatterns++] = argv[beginningAgrepOptions++];
00331       options[npatterns] = NULL;
00332     }
00333   } while ( beginningAgrepOptions < argc - 1 );
00334 
00335   patterns[npatterns] = NULL;
00336   options[npatterns] = NULL;
00337   if ( !npatterns ) {
00338     usage(argv[0]);
00339     fprintf(stderr, "Error: missing pattern\n");
00340     exit(2);
00341   }
00342   if ( (npatterns > 1) && ( !proximity ) ) {
00343     usage(argv[0]);
00344     fprintf(stderr, "Error: multiple patterns allowed only"
00345             " for proximity search\n");
00346     exit(2);
00347   }
00348 
00349   if ( lookup ) {
00350     return(do_lookup(input_buf, input_sz, patterns[0], options[0]));
00351   }
00352 
00353   /* Check whether we are performing a proximity search */
00354   if ( proximity ) {
00355     hits = CGrep_SearchProximity(&nresults, input_buf, input_sz, prox_window,
00356                                  patterns, options);
00357     if ( count )
00358       printf("%d\n", nresults);
00359     else if ( nresults ) {
00360       char *body;
00361       int tlen, blen;
00362       int scanned;
00363       Console console;
00364       HDecompress_getBodyAndConsole(input_buf, input_sz, &tlen, &body, &blen,
00365                                     &console);
00366       for ( scanned = 0; scanned < nresults; scanned++ ) {
00367         int decoded, plen = 0;
00368         char *waste, *pstr;
00369         int tokens = 0;
00370         int currRank = hits[scanned].start_position;
00371         int currDispl = hits[scanned].byte_position;
00372         int needSpace = 0;
00373 
00374         if ( scanned && separator )
00375           printf("%s\n", separator);
00376 
00377         DECOMPRESS_PREVIOUS(*input_buf, body+hits[scanned].byte_position,
00378                             hits[scanned].start_position,
00379                             window, &waste, &plen, &decoded, &console);
00380         if ( plen ) {
00381           waste[plen] = '\0';
00382           if ( escape ) {
00383             pstr = CGrep_escapeString(waste, plen);
00384             free(waste);
00385           } else
00386             pstr = waste;
00387           printf(pstr);
00388           if ( *input_buf && isalnum(pstr[strlen(pstr)-1])) {
00389             // spaceless, add a space
00390             putchar(' ');
00391           }
00392           fflush(stdout); /* otherwise it's not safe to free */
00393           free(pstr);
00394         }
00395         while ( (unsigned int) tokens < hits[scanned].end_position -
00396                 hits[scanned].start_position + 1 ) {
00397           int isMatching = isMatchingToken(currRank, hits[scanned], npatterns);
00398           char *printed;
00399 
00400           DECOMPRESS_NEXT(*input_buf, body+currDispl, input_sz-currDispl, 1,
00401                           &waste, &plen, &decoded, &console);
00402           currDispl += decoded;
00403           tokens++;
00404           currRank++;
00405           if ( needSpace && plen && isalnum(waste[0]) )
00406             putchar(' ');
00407           if ( plen ) {
00408             waste[plen] = '\0';
00409             if ( escape ) {
00410               pstr = CGrep_escapeString(waste, plen);
00411               free(waste);
00412             } else
00413               pstr = waste;
00414             printed = pstr;
00415             while ( !isalnum(*printed) ) // must avoid possible separators...
00416               putchar(*(printed++));
00417             if ( isMatching && highlight && preHighlight )
00418               printf(preHighlight);
00419             printf(printed);
00420             needSpace = ( *input_buf && isalnum(pstr[strlen(pstr)-1]));
00421             fflush(stdout);
00422             free(pstr);
00423           }
00424           if ( isMatching && highlight && postHighlight )
00425             printf(postHighlight);
00426         }
00427 
00428         DECOMPRESS_NEXT(*input_buf, body+currDispl, input_sz-currDispl,
00429                         window, &waste, &plen, &decoded, &console);
00430         if ( needSpace && plen && isalnum(waste[0]) )
00431           putchar(' ');
00432         if ( plen ) {
00433           waste[plen] = '\0';
00434           if ( escape ) {
00435             pstr = CGrep_escapeString(waste, plen);
00436             free(waste);
00437           } else
00438             pstr = waste;
00439           printf(pstr);
00440           fflush(stdout);
00441           free(pstr);
00442         }
00443         puts("");
00444       }
00445       if ( hits )
00446         free(hits);
00447     }
00448   } else { /* "standard" search for one pattern */
00449     positions = CGrep_SearchPattern(&nresults, input_buf, input_sz,
00450                                     patterns[0], options[0]);
00451     if ( count ) {
00452       printf("%d\n", nresults);
00453     } else if ( nresults ) {
00454       Console console;
00455       char *body;
00456       int tlen, blen;
00457       int scanned;
00458       HDecompress_getBodyAndConsole(input_buf, input_sz, &tlen, &body, &blen,
00459                                     &console);
00460       for ( scanned = 0; scanned < nresults; scanned++ ) {
00461         int decoded, plen = 0;
00462         char *waste, *pstr;
00463         int addSpace = 0;
00464 
00465         if ( scanned && separator )
00466           printf("%s\n", separator);
00467 
00468         DECOMPRESS_PREVIOUS(*input_buf, body+positions[scanned],
00469                             positions[scanned], window,
00470                             &waste, &plen, &decoded, &console);
00471         if ( plen ) {
00472           waste[plen] = '\0';
00473           if ( escape ) {
00474             pstr = CGrep_escapeString(waste, plen);
00475             free(waste);
00476           } else
00477             pstr = waste;            
00478           printf(pstr);
00479           if ( *input_buf && isalnum(pstr[strlen(pstr)-1])) {
00480             // spaceless, add a space
00481             putchar(' ');
00482           }
00483           fflush(stdout); /* otherwise it's not safe to free */
00484           free(pstr);
00485         }
00486         if ( highlight && preHighlight )
00487           printf(preHighlight);
00488         /* now, decompress the matching token */
00489         DECOMPRESS_NEXT(*input_buf, body+positions[scanned],
00490                         input_sz - positions[scanned], 1,
00491                         &waste, &plen, &decoded, &console);
00492         if ( plen ) {
00493           waste[plen] = '\0';
00494           if ( escape ) {
00495             pstr = CGrep_escapeString(waste, plen);
00496             free(waste);
00497           } else
00498             pstr = waste;
00499           printf(pstr);
00500           if ( highlight && postHighlight )
00501             printf(postHighlight);
00502           if ( *input_buf && isalnum(pstr[strlen(pstr)-1])) {
00503             // spaceless, add a space
00504             addSpace = 1;
00505           }
00506           fflush(stdout);
00507           free(pstr);
00508         }
00509         DECOMPRESS_NEXT(*input_buf, body+positions[scanned]+decoded,
00510                         input_sz - positions[scanned] - decoded, window,
00511                         &waste, &plen, &decoded, &console);
00512         if ( plen ) {
00513           waste[plen] = '\0';
00514           if ( escape ) {
00515             pstr = CGrep_escapeString(waste, plen);
00516             free(waste);
00517           } else
00518             pstr = waste;
00519           if ( addSpace && *input_buf && isalnum(pstr[0])) {
00520             // spaceless, add a space
00521             putchar(' ');
00522           }
00523           printf(pstr);
00524           fflush(stdout);
00525           free(pstr);
00526         }
00527         puts("");
00528       }
00529     }
00530     if ( positions )
00531       free(positions);
00532   }
00533 
00534   return((nresults>0)?0:1);
00535 }

Generated on Mon Mar 31 14:44:31 2003 by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002