00001
00015 #include "CGrepLib.h"
00016 #include <fcntl.h>
00017
00018
/*
 * Command-line state shared by the whole program.  Every variable below is
 * written by parseCGrepOptions() and read by main() / do_lookup().
 */

/* On/off switches (0 = off, 1 = on). */
int count = 0;              /* -c: print only the number of matches */
int escape = 0;             /* -x: escape the decoded text before printing */
int lookup = 0;             /* -l: search the dictionary only */
int proximity = 0;          /* -p: proximity search over several patterns */
int highlight = 0;          /* -b / -m: decorate the matching words */

/* Numeric parameters. */
int window = 5;             /* -w: words of context on each side of a match */
int prox_window = 0;        /* -p: width of the proximity window */

/* String parameters; NULL until the corresponding option is seen. */
char *preHighlight = NULL;  /* printed right before a matching word */
char *postHighlight = NULL; /* printed right after a matching word */
char *separator = NULL;     /* -s: line printed between two results */
00029
00030
00031
/*
 * Dispatch to one of the two "previous block" decoders.
 *
 * a    : selector; non-zero picks HDecompress_previousBlock_tokens_spaceless,
 *        zero picks HDecompress_previousBlock_tokens.
 * b..h : forwarded unchanged, in order, to the selected function.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement: it can be followed by ';' and used inside an unbraced if/else
 * without capturing the caller's 'else'.
 */
#define DECOMPRESS_PREVIOUS(a, b, c, d, e, f, g, h) \
  do { \
    if ( a ) \
      HDecompress_previousBlock_tokens_spaceless(b, c, d, e, f, g, h); \
    else \
      HDecompress_previousBlock_tokens(b, c, d, e, f, g, h); \
  } while (0)
00037
/*
 * Dispatch to one of the two "next block" decoders.
 *
 * a    : selector; non-zero picks HDecompress_nextBlock_tokens_spaceless,
 *        zero picks HDecompress_nextBlock_tokens.
 * b..h : forwarded unchanged, in order, to the selected function.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement: it can be followed by ';' and used inside an unbraced if/else
 * without capturing the caller's 'else'.
 */
#define DECOMPRESS_NEXT(a, b, c, d, e, f, g, h) \
  do { \
    if ( a ) \
      HDecompress_nextBlock_tokens_spaceless(b, c, d, e, f, g, h); \
    else \
      HDecompress_nextBlock_tokens(b, c, d, e, f, g, h); \
  } while (0)
00043
00044
/*
 * Print the command synopsis followed by the list of cgrep options on stdout.
 *
 * exname: program name substituted into the synopsis line.
 */
void usage(char *exname)
{
  /* Fixed help text, one entry per output line. */
  static const char *const helpText[] = {
    "Search within filename (huffword-compressed file) for matches.",
    "CGREP OPTIONS:",
    " -b : highlight matching words in bold",
    " -c : count matches",
    " -l : lookup in the dictionary only (cannot be used with -w or -p)",
    " -m pre post: prepend 'pre' and append 'post' to the matching words",
    " -p n: search patterns within a proximity window of width n (n>0)",
    " -s sep: output in format: match1\\nsep\\nmatch2... (not with -l)",
    " -w n: return n words on left/right of the match (default 5, n>=0)",
    " -x : escape non-printable chars",
    "See 'man agrep' for agrep options.",
    "Options -n, -c, -s, -t, -G, -l to agrep are NOT allowed."
  };
  size_t row;

  printf("usage: %s [cgrep options] [--] [[agrep options] pattern]+ fname\n",
         exname);
  for ( row = 0; row < sizeof helpText / sizeof helpText[0]; row++ )
    puts(helpText[row]);
}
00062
/*
 * Map a whole file read-only into memory.
 *
 * input_sz   : out; receives the size of the file in bytes.
 * fname      : path of the file to map.
 * canBeEmpty : when zero, an empty file is a fatal error; when non-zero, an
 *              empty file yields a NULL return with *input_sz set to 0.
 *
 * Returns the address of the mapping (release it with munmap(ptr, *input_sz)),
 * or NULL for an empty file when canBeEmpty is set.  Any other failure prints
 * a diagnostic on stderr and terminates the process with status 2.
 */
char *
mapFile(size_t *input_sz, const char *fname, int canBeEmpty)
{
  struct stat sbuf;
  char *input_buf;
  int fd;

  if ( (fd = open(fname, O_RDONLY)) == -1 ) {
    fprintf(stderr, "cannot open %s, exiting.\n", fname);
    exit(2);
  }
  /* Query the descriptor we just opened, so the size belongs to the very
     file that gets mapped and a failure is not silently ignored. */
  if ( fstat(fd, &sbuf) == -1 ) {
    perror("fstat");
    exit(2);
  }
  *input_sz = (size_t) sbuf.st_size;

  if ( *input_sz == 0 ) {
    close(fd);
    if ( !canBeEmpty ) {
      fprintf(stderr,"Fatal Error: Input file empty\n");
      exit(2);
    }
    /* mmap() rejects a zero length; report "nothing to map" instead. */
    return(NULL);
  }

  input_buf = (char *) mmap(NULL, *input_sz, PROT_READ, MAP_SHARED, fd, 0);
  if ( input_buf == MAP_FAILED ) {
    perror("mmap");
    exit(2);
  }
  /* The mapping stays valid after the descriptor is closed. */
  close(fd);
  return(input_buf);
}
00091
00092 int
00093 parseCGrepOptions(int noptions, char **options)
00094 {
00095 int i = 1;
00096 for (; i < noptions; i++) {
00097 if ( *options[i] != '-' )
00098 return(i);
00099 switch (options[i][1]) {
00100 case 'c':
00101 count = 1; break;
00102 case 'b':
00103 highlight = 1;
00104 preHighlight = "[1m";
00105 postHighlight = "[0m";
00106 break;
00107 case 'm':
00108 highlight = 1;
00109 if ( (options[i+1] && options[i+2]) && (noptions > i+3) ) {
00110 preHighlight = options[i+1];
00111 postHighlight = options[i+2];
00112 i += 2;
00113 } else {
00114 usage(options[0]);
00115 fprintf(stderr, "Error: '-m' requires two arguments\n");
00116 exit(2);
00117 }
00118 break;
00119 case 'x':
00120 escape = 1; break;
00121 case 'w':
00122 if ( options[i][2] )
00123 window = atoi(options[i]+2);
00124 else if ( i < noptions - 1 )
00125 window = atoi(options[++i]);
00126 else {
00127 usage(options[0]);
00128 fprintf(stderr, "Error: '-w' requires an argument\n");
00129 exit(2);
00130 }
00131 if ( window < 0 ) {
00132 usage(options[0]);
00133 fprintf(stderr, "Error: invalid value for '-w' (%d)\n", window);
00134 exit(2);
00135 }
00136 break;
00137 case 'l':
00138 lookup = 1;
00139 if ( proximity ) {
00140 usage(options[0]);
00141 fprintf(stderr, "You cannot perform a lookup by proximity\n");
00142 exit(2);
00143 }
00144 break;
00145 case 's':
00146 separator = options[++i];
00147 break;
00148 case 'p':
00149 proximity = 1;
00150 if ( lookup ) {
00151 usage(options[0]);
00152 fprintf(stderr, "You cannot perform a lookup by proximity\n");
00153 exit(2);
00154 }
00155 if ( options[i][2] )
00156 prox_window = atoi(options[i]+2);
00157 else if ( i < noptions - 1 )
00158 prox_window = atoi(options[++i]);
00159 else {
00160 usage(options[0]);
00161 fprintf(stderr, "Error: '-p' requires the width of the proximity "
00162 "window\n");
00163 exit(2);
00164 }
00165 if ( !prox_window ) {
00166 usage(options[0]);
00167 fprintf(stderr, "Error: invalid width (0) for the proximity window\n");
00168 exit(2);
00169 }
00170 break;
00171 case '-':
00172 if ( !options[i][2] )
00173 return(i+1);
00174 default:
00175 usage(options[0]);
00176 fprintf(stderr, "Error: unrecognized option %s\n", options[i]);
00177 exit(2);
00178 }
00179 }
00180 usage(options[0]);
00181 exit(2);
00182 return(0);
00183 }
00184
00185 void CGrep_Lookup(const char *inf, const char *outf, char **options);
00186
00187 int do_lookup(char *input_buf, size_t input_sz, char *pattern,
00188 char **options)
00189 {
00190 char *myOptions[CGREP_MAX_AGREPOPTIONS];
00191 int i, j;
00192 char ifname[100];
00193 char ofname[100];
00194 char pids[10];
00195 char *buf;
00196 int len;
00197 int written = 0;
00198 int total = 0;
00199 int attempt = 0;
00200 int tmpfd;
00201 char *body;
00202 int blen;
00203 int tlen;
00204 Console console;
00205 Dictionary *d;
00206
00207 sprintf(pids, "%d", getpid());
00208
00209 HDecompress_getBodyAndConsole(input_buf, input_sz, &tlen, &body, &blen,
00210 &console);
00211
00212 d = &(console.dictionary);
00213
00214 do
00215 sprintf(ifname, "/tmp/agrep.dict.%s.%d", pids, ++attempt);
00216 while ( ((tmpfd = open(ifname, O_RDWR|O_CREAT|O_EXCL,
00217 S_IREAD|S_IWUSR)) == -1) &&
00218 (errno == EEXIST) );
00219
00220 if ( tmpfd == -1 ) {
00221 perror("open");
00222 exit(2);
00223 }
00224
00225 do {
00226 written = write(tmpfd, d->content + written, d->length - written);
00227 if ( written == -1 ) {
00228 perror("write");
00229 exit(2);
00230 }
00231 } while ( written < d->length );
00232 close(tmpfd);
00233
00234
00235 sprintf(ofname, "/tmp/agrep.match.%s.%d", pids, attempt);
00236
00237 myOptions[0] = CGREP_AGREP_EXECUTABLE;
00238 i = 1; j = 0;
00239
00240 for ( ; options[j] && i < CGREP_MAX_AGREPOPTIONS; )
00241 myOptions[i++] = options[j++];
00242 myOptions[i++] = pattern;
00243 myOptions[i] = NULL;
00244
00245 CGrep_Lookup(ifname, ofname, myOptions);
00246
00247 unlink(ifname);
00248
00249 buf = mapFile(&len, ofname, 1);
00250
00251 written = 0;
00252
00253 if ( count ) {
00254 if ( !buf ) puts("0");
00255 else {
00256 char *run = buf;
00257 int cnt = 0;
00258 while ( run < buf+len ) {
00259 if ( *run == '\n' ) cnt++;
00260 if ( (run > buf) && (run < buf+len-1) &&
00261 ((*run) == '\n') && (*(run-1) == '\n') &&
00262 (*(run+1) == '\n') ) cnt--;
00263 run++;
00264 }
00265 printf("%d\n", cnt);
00266 }
00267 } else if ( buf )
00268 while ( (total + (written = write(1, buf+total, len-total))) < len )
00269 total += written;
00270
00271 if ( buf ) {
00272 munmap(buf, len);
00273 unlink(ofname);
00274 return(0);
00275 }
00276 unlink(ofname);
00277 return(1);
00278 }
00279
00280 int isMatchingToken(int tr, proximity_hit_t ph, int npatterns)
00281 {
00282 int i = 0;
00283 for ( ; i < npatterns; i++ )
00284 if ( ph.ranks[i] == tr ) return 1;
00285 return 0;
00286 }
00287
00288 int main(int argc, char **argv)
00289 {
00290 size_t input_sz;
00291 char *input_buf;
00292 int beginningAgrepOptions = -1;
00293 char ***options = (char ***) malloc(sizeof(char **) * argc);
00294 char **patterns = (char **) malloc(sizeof(char *) * argc);
00295 int i = 0;
00296 int npatterns;
00297 int *positions;
00298 int nresults = 0;
00299 proximity_hit_t *hits;
00300
00301 if ( argc < 3 ) {
00302 usage(argv[0]);
00303 exit(2);
00304 }
00305
00306 input_buf = mapFile(&input_sz, argv[argc-1], 0);
00307
00308 beginningAgrepOptions = parseCGrepOptions(argc-1, argv);
00309
00310 npatterns = 0;
00311 do {
00312 i = 0;
00313 options[npatterns] = (char **) malloc(sizeof(char *) * argc);
00314 while ( (*argv[beginningAgrepOptions] == '-') &&
00315 (beginningAgrepOptions < argc - 1) ) {
00316 if ( !strcmp("-n", argv[beginningAgrepOptions]) ||
00317 !strcmp("-c", argv[beginningAgrepOptions]) ) {
00318 usage(argv[0]);
00319 fprintf(stderr,
00320 "Error: option '%s' to agrep is not allowed\n",
00321 argv[beginningAgrepOptions]);
00322 exit(2);
00323 }
00324 options[npatterns][i++] = argv[beginningAgrepOptions++];
00325 }
00326 options[npatterns][i] = NULL;
00327
00328 while ( (*argv[beginningAgrepOptions] != '-') &&
00329 beginningAgrepOptions < argc - 1 ) {
00330 patterns[npatterns++] = argv[beginningAgrepOptions++];
00331 options[npatterns] = NULL;
00332 }
00333 } while ( beginningAgrepOptions < argc - 1 );
00334
00335 patterns[npatterns] = NULL;
00336 options[npatterns] = NULL;
00337 if ( !npatterns ) {
00338 usage(argv[0]);
00339 fprintf(stderr, "Error: missing pattern\n");
00340 exit(2);
00341 }
00342 if ( (npatterns > 1) && ( !proximity ) ) {
00343 usage(argv[0]);
00344 fprintf(stderr, "Error: multiple patterns allowed only"
00345 " for proximity search\n");
00346 exit(2);
00347 }
00348
00349 if ( lookup ) {
00350 return(do_lookup(input_buf, input_sz, patterns[0], options[0]));
00351 }
00352
00353
00354 if ( proximity ) {
00355 hits = CGrep_SearchProximity(&nresults, input_buf, input_sz, prox_window,
00356 patterns, options);
00357 if ( count )
00358 printf("%d\n", nresults);
00359 else if ( nresults ) {
00360 char *body;
00361 int tlen, blen;
00362 int scanned;
00363 Console console;
00364 HDecompress_getBodyAndConsole(input_buf, input_sz, &tlen, &body, &blen,
00365 &console);
00366 for ( scanned = 0; scanned < nresults; scanned++ ) {
00367 int decoded, plen = 0;
00368 char *waste, *pstr;
00369 int tokens = 0;
00370 int currRank = hits[scanned].start_position;
00371 int currDispl = hits[scanned].byte_position;
00372 int needSpace = 0;
00373
00374 if ( scanned && separator )
00375 printf("%s\n", separator);
00376
00377 DECOMPRESS_PREVIOUS(*input_buf, body+hits[scanned].byte_position,
00378 hits[scanned].start_position,
00379 window, &waste, &plen, &decoded, &console);
00380 if ( plen ) {
00381 waste[plen] = '\0';
00382 if ( escape ) {
00383 pstr = CGrep_escapeString(waste, plen);
00384 free(waste);
00385 } else
00386 pstr = waste;
00387 printf(pstr);
00388 if ( *input_buf && isalnum(pstr[strlen(pstr)-1])) {
00389
00390 putchar(' ');
00391 }
00392 fflush(stdout);
00393 free(pstr);
00394 }
00395 while ( (unsigned int) tokens < hits[scanned].end_position -
00396 hits[scanned].start_position + 1 ) {
00397 int isMatching = isMatchingToken(currRank, hits[scanned], npatterns);
00398 char *printed;
00399
00400 DECOMPRESS_NEXT(*input_buf, body+currDispl, input_sz-currDispl, 1,
00401 &waste, &plen, &decoded, &console);
00402 currDispl += decoded;
00403 tokens++;
00404 currRank++;
00405 if ( needSpace && plen && isalnum(waste[0]) )
00406 putchar(' ');
00407 if ( plen ) {
00408 waste[plen] = '\0';
00409 if ( escape ) {
00410 pstr = CGrep_escapeString(waste, plen);
00411 free(waste);
00412 } else
00413 pstr = waste;
00414 printed = pstr;
00415 while ( !isalnum(*printed) )
00416 putchar(*(printed++));
00417 if ( isMatching && highlight && preHighlight )
00418 printf(preHighlight);
00419 printf(printed);
00420 needSpace = ( *input_buf && isalnum(pstr[strlen(pstr)-1]));
00421 fflush(stdout);
00422 free(pstr);
00423 }
00424 if ( isMatching && highlight && postHighlight )
00425 printf(postHighlight);
00426 }
00427
00428 DECOMPRESS_NEXT(*input_buf, body+currDispl, input_sz-currDispl,
00429 window, &waste, &plen, &decoded, &console);
00430 if ( needSpace && plen && isalnum(waste[0]) )
00431 putchar(' ');
00432 if ( plen ) {
00433 waste[plen] = '\0';
00434 if ( escape ) {
00435 pstr = CGrep_escapeString(waste, plen);
00436 free(waste);
00437 } else
00438 pstr = waste;
00439 printf(pstr);
00440 fflush(stdout);
00441 free(pstr);
00442 }
00443 puts("");
00444 }
00445 if ( hits )
00446 free(hits);
00447 }
00448 } else {
00449 positions = CGrep_SearchPattern(&nresults, input_buf, input_sz,
00450 patterns[0], options[0]);
00451 if ( count ) {
00452 printf("%d\n", nresults);
00453 } else if ( nresults ) {
00454 Console console;
00455 char *body;
00456 int tlen, blen;
00457 int scanned;
00458 HDecompress_getBodyAndConsole(input_buf, input_sz, &tlen, &body, &blen,
00459 &console);
00460 for ( scanned = 0; scanned < nresults; scanned++ ) {
00461 int decoded, plen = 0;
00462 char *waste, *pstr;
00463 int addSpace = 0;
00464
00465 if ( scanned && separator )
00466 printf("%s\n", separator);
00467
00468 DECOMPRESS_PREVIOUS(*input_buf, body+positions[scanned],
00469 positions[scanned], window,
00470 &waste, &plen, &decoded, &console);
00471 if ( plen ) {
00472 waste[plen] = '\0';
00473 if ( escape ) {
00474 pstr = CGrep_escapeString(waste, plen);
00475 free(waste);
00476 } else
00477 pstr = waste;
00478 printf(pstr);
00479 if ( *input_buf && isalnum(pstr[strlen(pstr)-1])) {
00480
00481 putchar(' ');
00482 }
00483 fflush(stdout);
00484 free(pstr);
00485 }
00486 if ( highlight && preHighlight )
00487 printf(preHighlight);
00488
00489 DECOMPRESS_NEXT(*input_buf, body+positions[scanned],
00490 input_sz - positions[scanned], 1,
00491 &waste, &plen, &decoded, &console);
00492 if ( plen ) {
00493 waste[plen] = '\0';
00494 if ( escape ) {
00495 pstr = CGrep_escapeString(waste, plen);
00496 free(waste);
00497 } else
00498 pstr = waste;
00499 printf(pstr);
00500 if ( highlight && postHighlight )
00501 printf(postHighlight);
00502 if ( *input_buf && isalnum(pstr[strlen(pstr)-1])) {
00503
00504 addSpace = 1;
00505 }
00506 fflush(stdout);
00507 free(pstr);
00508 }
00509 DECOMPRESS_NEXT(*input_buf, body+positions[scanned]+decoded,
00510 input_sz - positions[scanned] - decoded, window,
00511 &waste, &plen, &decoded, &console);
00512 if ( plen ) {
00513 waste[plen] = '\0';
00514 if ( escape ) {
00515 pstr = CGrep_escapeString(waste, plen);
00516 free(waste);
00517 } else
00518 pstr = waste;
00519 if ( addSpace && *input_buf && isalnum(pstr[0])) {
00520
00521 putchar(' ');
00522 }
00523 printf(pstr);
00524 fflush(stdout);
00525 free(pstr);
00526 }
00527 puts("");
00528 }
00529 }
00530 if ( positions )
00531 free(positions);
00532 }
00533
00534 return((nresults>0)?0:1);
00535 }