diff options
Diffstat (limited to 'scripts/sgrep.c')
-rw-r--r-- | scripts/sgrep.c | 370 |
1 files changed, 370 insertions, 0 deletions
diff --git a/scripts/sgrep.c b/scripts/sgrep.c new file mode 100644 index 000000000..d4f7d4544 --- /dev/null +++ b/scripts/sgrep.c @@ -0,0 +1,370 @@ +/* + * sgrep version 1.0 + * + * Copyright 2009 Stephen C. Losen. Distributed under the terms + * of the GNU General Public License (GPL) + * + * Sgrep (sorted grep) is a much faster alternative to traditional Unix + * grep, but with significant restrictions. 1) All input files must + * be sorted regular files. 2) The sort key must start at the beginning + * of the line. 3) The search key matches only at the beginning of + * the line. 4) No regular expression support. + * + * Sgrep uses a binary search algorithm, which is very fast, but + * requires sorted input. Each iteration of the search eliminates + * half of the remaining input. In other words, doubling the size + * of the file adds just one iteration. + * + * Sgrep seeks to the center of the file and then reads characters + * until it encounters a newline, which places the file pointer at + * the start of the next line. Sgrep compares the search key with the + * beginning of the line. If the key is greater than the line, then + * the process repeats with the second half of the file. If less than, + * then the process repeats with the first half of the file. If equal, + * then the line matches, but it may not be the earliest match, so the + * process repeats with the first half of the file. Eventually all + * of the input is eliminated and sgrep finds either no matching line + * or the first matching line. Sgrep outputs matching lines until it + * encounters a non matching line. + * + * Usage: sgrep [ -i | -n ] [ -c ] [ -b ] [ -r ] key [ sorted_file ... ] + * + * If no input file is specified, then sgrep uses stdin. + * + * The -i flag uses case insensitive byte comparison. The file must + * be sorted with "sort -f". + * + * The -n flag uses numeric comparison. The file must be sorted + * with "sort -n". + * + * The -b flag causes sgrep to ignore white space at the beginning + * of lines and at the beginning of the search key. The file must + * be sorted with "sort -b" + * + * The -c flag outputs the number of matching lines instead of the + * lines themselves. + * + * The -r flag specifies that the file is sorted in reverse + * (descending) order using "sort -r". + * + * Author: Stephen C. Losen University of Virginia + */ + +/* large file support */ + +#ifdef _AIX +#define _LARGE_FILES +#else +#define _FILE_OFFSET_BITS 64 +#endif + +#include <sys/stat.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <ctype.h> +#include <stdio.h> + +/* We need different comparison functions for different sort orderings */ + +/* exact comparison */ + +static int +cmp_exact(const char *key, FILE *fp) { + const unsigned char *k = (unsigned char *) key; + int c; + + while (*k != 0 && (c = getc(fp)) == *k) { + k++; + } + return + *k == 0 ? 0 : + c == '\n' ? 1 : + c == EOF ? -1 : *k - c; +} + +/* case insensitive comparison */ + +static int +cmp_case(const char *key, FILE *fp) { + const unsigned char *k = (unsigned char *) key; + int c; + + while (*k != 0 && tolower(c = getc(fp)) == tolower(*k)) { + k++; + } + return + *k == 0 ? 0 : + c == '\n' ? 1 : + c == EOF ? -1 : + tolower(*k) - tolower(c); +} + +/* exact comparison ignoring leading white space */ + +static int +cmp_exact_white(const char *key, FILE *fp) { + const unsigned char *k = (unsigned char *) key; + int c; + + while (isspace(*k)) { + k++; + } + while ((c = getc(fp)) != '\n' && isspace(c)) + ; + while (*k != 0 && c == *k) { + k++; + c = getc(fp); + } + return + *k == 0 ? 0 : + c == '\n' ? 1 : + c == EOF ? -1 : *k - c; +} + +/* case insensitive comparison ignoring leading white space */ + +static int +cmp_case_white(const char *key, FILE *fp) { + const unsigned char *k = (unsigned char *) key; + int c; + + while (isspace(*k)) { + k++; + } + while ((c = getc(fp)) != '\n' && isspace(c)) + ; + while (*k != 0 && tolower(c) == tolower(*k)) { + k++; + c = getc(fp); + } + return + *k == 0 ? 0 : + c == '\n' ? 1 : + c == EOF ? -1 : + tolower(*k) - tolower(c); +} + +/* numeric comparison */ + +static int +cmp_num(const char *key, FILE *fp) { + int c, i = 0; + char buf[128], *cp = 0; + double low, high, x; + + /* read numeric string into buf */ + + while((c = getc(fp)) != '\n' && c != EOF) { + if (i == 0 && isspace(c)) { + continue; + } + if (i + 1 >= sizeof(buf) || + (c != '-' && c != '.' && !isdigit(c))) + { + break; + } + buf[i++] = c; + } + buf[i] = 0; + if (c == EOF && i == 0) { + return -1; + } + + /* convert to double and use numeric comparison */ + + x = strtod(buf, 0); + low = high = strtod(key, &cp); + if (*cp == ':') { + high = strtod(cp + 1, 0); + } + return + high < x ? -1 : + low > x ? 1 : 0; +} + +static int (*compare)(const char *key, FILE *fp); + +/* + * Use binary search to find the first matching line and return + * its byte position. + */ + +static off_t +binsrch(const char *key, FILE *fp, int reverse) { + off_t low, med, high, start, prev = -1, ret = -1; + int cmp, c; + struct stat st; + + fstat(fileno(fp), &st); + high = st.st_size - 1; + low = 0; + while (low <= high) { + med = (high + low) / 2; + fseeko(fp, med, SEEK_SET); + + /* scan to start of next line if not at beginning of file */ + + if ((start = med) != 0) { + do { + start++; + } while ((c = getc(fp)) != '\n' && c != EOF); + } + + /* compare key with current line */ + + if (start != prev) { /* avoid unnecessary compares */ + cmp = compare(key, fp); + if (reverse != 0) { + cmp = -cmp; + } + prev = start; + } + + /* eliminate half of input */ + + if (cmp < 0) { + high = med - 1; + } + else if (cmp > 0) { + low = start + 1; + } + else { /* success, look for earlier match */ + ret = start; + high = med - 1; + } + } + return ret; +} + +/* print all lines that match the key or else just the number of matches */ + +static void +printmatch(const char *key, FILE *fp, off_t start, + const char *fname, int cflag) +{ + int c, count; + + if (start >= 0) { + fseeko(fp, start, SEEK_SET); + } + for (count = 0; start >= 0 && compare(key, fp) == 0; count++) { + fseeko(fp, start, SEEK_SET); + if (cflag == 0 && fname != 0) { + fputs(fname, stdout); + fputc(':', stdout); + } + while ((c = getc(fp)) != EOF) { + start++; + if (cflag == 0) { + fputc(c, stdout); + } + if (c == '\n') { + break; + } + } + if (c == EOF) { + break; + } + } + if (cflag != 0) { + if (fname != 0) { + fputs(fname, stdout); + fputc(':', stdout); + } + printf("%d\n", count); + } +} + +int +main(int argc, char **argv) { + FILE *fp; + const char *key = 0; + int i, numfile, status; + int bflag = 0, cflag = 0, iflag = 0, nflag = 0, rflag = 0; + off_t where; + struct stat st; + extern int optind, opterr; + + /* parse command line options */ + + opterr = 0; + while ((i = getopt(argc, argv, "bcfinr")) > 0 && i != '?') { + switch(i) { + case 'b': + bflag++; + break; + case 'c': + cflag++; + break; + case 'f': + case 'i': + iflag++; + nflag = 0; + break; + case 'n': + nflag++; + iflag = 0; + break; + case 'r': + rflag++; + break; + } + } + if (i == '?' || optind >= argc) { + fputs ("Usage: sgrep [ -i | -n ] [ -c ] [ -b ] [ -r ] key " + "[ sorted_file ... ]\n", stderr); + exit(2); + } + i = optind; + key = argv[i++]; + + /* select the comparison function */ + + if (iflag != 0) { + compare = bflag == 0 ? cmp_case : cmp_case_white; + } + else if (nflag != 0) { + compare = cmp_num; + } + else { + compare = bflag == 0 ? cmp_exact : cmp_exact_white; + } + + /* if no input files, then search stdin */ + + if ((numfile = argc - i) == 0) { + fstat(fileno(stdin), &st); + if ((st.st_mode & S_IFREG) == 0) { + fputs("sgrep: STDIN is not a regular file\n", stderr); + exit(2); + } + where = binsrch(key, stdin, rflag); + printmatch(key, stdin, where, 0, cflag); + exit(where < 0); + } + + /* search each input file */ + + for (status = 1; i < argc; i++) { + if ((fp = fopen(argv[i], "r")) == 0) { + fprintf(stderr, "sgrep: could not open %s\n", argv[i]); + status = 2; + continue; + } + fstat(fileno(fp), &st); + if ((st.st_mode & S_IFREG) == 0) { + fprintf(stderr, "sgrep: %s is not a regular file\n", argv[i]); + status = 2; + fclose(fp); + continue; + } + where = binsrch(key, fp, rflag); + printmatch(key, fp, where, numfile == 1 ? 0 : argv[i], cflag); + if (status == 1 && where >= 0) { + status = 0; + } + fclose(fp); + } + exit(status); +} |