/* * sgrep version 1.0 * * Copyright 2009 Stephen C. Losen. Distributed under the terms * of the GNU General Public License (GPL) * * Sgrep (sorted grep) is a much faster alternative to traditional Unix * grep, but with significant restrictions. 1) All input files must * be sorted regular files. 2) The sort key must start at the beginning * of the line. 3) The search key matches only at the beginning of * the line. 4) No regular expression support. * * Sgrep uses a binary search algorithm, which is very fast, but * requires sorted input. Each iteration of the search eliminates * half of the remaining input. In other words, doubling the size * of the file adds just one iteration. * * Sgrep seeks to the center of the file and then reads characters * until it encounters a newline, which places the file pointer at * the start of the next line. Sgrep compares the search key with the * beginning of the line. If the key is greater than the line, then * the process repeats with the second half of the file. If less than, * then the process repeats with the first half of the file. If equal, * then the line matches, but it may not be the earliest match, so the * process repeats with the first half of the file. Eventually all * of the input is eliminated and sgrep finds either no matching line * or the first matching line. Sgrep outputs matching lines until it * encounters a non matching line. * * Usage: sgrep [ -i | -n ] [ -c ] [ -b ] [ -r ] key [ sorted_file ... ] * * If no input file is specified, then sgrep uses stdin. * * The -i flag uses case insensitive byte comparison. The file must * be sorted with "sort -f". * * The -n flag uses numeric comparison. The file must be sorted * with "sort -n". * * The -b flag causes sgrep to ignore white space at the beginning * of lines and at the beginning of the search key. The file must * be sorted with "sort -b" * * The -c flag outputs the number of matching lines instead of the * lines themselves. * * The -r flag specifies that the file is sorted in reverse * (descending) order using "sort -r". * * Author: Stephen C. Losen University of Virginia */ /* large file support */ #ifdef _AIX #define _LARGE_FILES #else #define _FILE_OFFSET_BITS 64 #endif #include <sys/stat.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <ctype.h> #include <stdio.h> /* We need different comparison functions for different sort orderings */ /* exact comparison */ static int cmp_exact(const char *key, FILE *fp) { const unsigned char *k = (unsigned char *) key; int c; while (*k != 0 && (c = getc(fp)) == *k) { k++; } return *k == 0 ? 0 : c == '\n' ? 1 : c == EOF ? -1 : *k - c; } /* case insensitive comparison */ static int cmp_case(const char *key, FILE *fp) { const unsigned char *k = (unsigned char *) key; int c; while (*k != 0 && tolower(c = getc(fp)) == tolower(*k)) { k++; } return *k == 0 ? 0 : c == '\n' ? 1 : c == EOF ? -1 : tolower(*k) - tolower(c); } /* exact comparison ignoring leading white space */ static int cmp_exact_white(const char *key, FILE *fp) { const unsigned char *k = (unsigned char *) key; int c; while (isspace(*k)) { k++; } while ((c = getc(fp)) != '\n' && isspace(c)) ; while (*k != 0 && c == *k) { k++; c = getc(fp); } return *k == 0 ? 0 : c == '\n' ? 1 : c == EOF ? -1 : *k - c; } /* case insensitive comparison ignoring leading white space */ static int cmp_case_white(const char *key, FILE *fp) { const unsigned char *k = (unsigned char *) key; int c; while (isspace(*k)) { k++; } while ((c = getc(fp)) != '\n' && isspace(c)) ; while (*k != 0 && tolower(c) == tolower(*k)) { k++; c = getc(fp); } return *k == 0 ? 0 : c == '\n' ? 1 : c == EOF ? -1 : tolower(*k) - tolower(c); } /* numeric comparison */ static int cmp_num(const char *key, FILE *fp) { int c, i = 0; char buf[128], *cp = 0; double low, high, x; /* read numeric string into buf */ while((c = getc(fp)) != '\n' && c != EOF) { if (i == 0 && isspace(c)) { continue; } if (i + 1 >= sizeof(buf) || (c != '-' && c != '.' && !isdigit(c))) { break; } buf[i++] = c; } buf[i] = 0; if (c == EOF && i == 0) { return -1; } /* convert to double and use numeric comparison */ x = strtod(buf, 0); low = high = strtod(key, &cp); if (*cp == ':') { high = strtod(cp + 1, 0); } return high < x ? -1 : low > x ? 1 : 0; } static int (*compare)(const char *key, FILE *fp); /* * Use binary search to find the first matching line and return * its byte position. */ static off_t binsrch(const char *key, FILE *fp, int reverse) { off_t low, med, high, start, prev = -1, ret = -1; int cmp, c; struct stat st; fstat(fileno(fp), &st); high = st.st_size - 1; low = 0; while (low <= high) { med = (high + low) / 2; fseeko(fp, med, SEEK_SET); /* scan to start of next line if not at beginning of file */ if ((start = med) != 0) { do { start++; } while ((c = getc(fp)) != '\n' && c != EOF); } /* compare key with current line */ if (start != prev) { /* avoid unnecessary compares */ cmp = compare(key, fp); if (reverse != 0) { cmp = -cmp; } prev = start; } /* eliminate half of input */ if (cmp < 0) { high = med - 1; } else if (cmp > 0) { low = start + 1; } else { /* success, look for earlier match */ ret = start; high = med - 1; } } return ret; } /* print all lines that match the key or else just the number of matches */ static void printmatch(const char *key, FILE *fp, off_t start, const char *fname, int cflag) { int c, count; if (start >= 0) { fseeko(fp, start, SEEK_SET); } for (count = 0; start >= 0 && compare(key, fp) == 0; count++) { fseeko(fp, start, SEEK_SET); if (cflag == 0 && fname != 0) { fputs(fname, stdout); fputc(':', stdout); } while ((c = getc(fp)) != EOF) { start++; if (cflag == 0) { fputc(c, stdout); } if (c == '\n') { break; } } if (c == EOF) { break; } } if (cflag != 0) { if (fname != 0) { fputs(fname, stdout); fputc(':', stdout); } printf("%d\n", count); } } int main(int argc, char **argv) { FILE *fp; const char *key = 0; int i, numfile, status; int bflag = 0, cflag = 0, iflag = 0, nflag = 0, rflag = 0; off_t where; struct stat st; extern int optind, opterr; /* parse command line options */ opterr = 0; while ((i = getopt(argc, argv, "bcfinr")) > 0 && i != '?') { switch(i) { case 'b': bflag++; break; case 'c': cflag++; break; case 'f': case 'i': iflag++; nflag = 0; break; case 'n': nflag++; iflag = 0; break; case 'r': rflag++; break; } } if (i == '?' || optind >= argc) { fputs ("Usage: sgrep [ -i | -n ] [ -c ] [ -b ] [ -r ] key " "[ sorted_file ... ]\n", stderr); exit(2); } i = optind; key = argv[i++]; /* select the comparison function */ if (iflag != 0) { compare = bflag == 0 ? cmp_case : cmp_case_white; } else if (nflag != 0) { compare = cmp_num; } else { compare = bflag == 0 ? cmp_exact : cmp_exact_white; } /* if no input files, then search stdin */ if ((numfile = argc - i) == 0) { fstat(fileno(stdin), &st); if ((st.st_mode & S_IFREG) == 0) { fputs("sgrep: STDIN is not a regular file\n", stderr); exit(2); } where = binsrch(key, stdin, rflag); printmatch(key, stdin, where, 0, cflag); exit(where < 0); } /* search each input file */ for (status = 1; i < argc; i++) { if ((fp = fopen(argv[i], "r")) == 0) { fprintf(stderr, "sgrep: could not open %s\n", argv[i]); status = 2; continue; } fstat(fileno(fp), &st); if ((st.st_mode & S_IFREG) == 0) { fprintf(stderr, "sgrep: %s is not a regular file\n", argv[i]); status = 2; fclose(fp); continue; } where = binsrch(key, fp, rflag); printmatch(key, fp, where, numfile == 1 ? 0 : argv[i], cflag); if (status == 1 && where >= 0) { status = 0; } fclose(fp); } exit(status); }