/*
* sgrep version 1.0
*
* Copyright 2009 Stephen C. Losen. Distributed under the terms
* of the GNU General Public License (GPL)
*
* Sgrep (sorted grep) is a much faster alternative to traditional Unix
* grep, but with significant restrictions. 1) All input files must
* be sorted regular files. 2) The sort key must start at the beginning
* of the line. 3) The search key matches only at the beginning of
* the line. 4) No regular expression support.
*
* Sgrep uses a binary search algorithm, which is very fast, but
* requires sorted input. Each iteration of the search eliminates
* half of the remaining input. In other words, doubling the size
* of the file adds just one iteration.
*
* Sgrep seeks to the center of the file and then reads characters
* until it encounters a newline, which places the file pointer at
* the start of the next line. Sgrep compares the search key with the
* beginning of the line. If the key is greater than the line, then
* the process repeats with the second half of the file. If less than,
* then the process repeats with the first half of the file. If equal,
* then the line matches, but it may not be the earliest match, so the
* process repeats with the first half of the file. Eventually all
* of the input is eliminated and sgrep finds either no matching line
* or the first matching line. Sgrep outputs matching lines until it
* encounters a non matching line.
*
* Usage: sgrep [ -i | -n ] [ -c ] [ -b ] [ -r ] key [ sorted_file ... ]
*
* If no input file is specified, then sgrep uses stdin.
*
* The -i flag uses case insensitive byte comparison. The file must
* be sorted with "sort -f".
*
* The -n flag uses numeric comparison. The file must be sorted
* with "sort -n".
*
* The -b flag causes sgrep to ignore white space at the beginning
* of lines and at the beginning of the search key. The file must
* be sorted with "sort -b"
*
* The -c flag outputs the number of matching lines instead of the
* lines themselves.
*
* The -r flag specifies that the file is sorted in reverse
* (descending) order using "sort -r".
*
* Author: Stephen C. Losen University of Virginia
*/
/* large file support */
#ifdef _AIX
#define _LARGE_FILES
#else
#define _FILE_OFFSET_BITS 64
#endif
#include <sys/stat.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
/* We need different comparison functions for different sort orderings */
/* exact comparison */
static int
cmp_exact(const char *key, FILE *fp) {
const unsigned char *k = (unsigned char *) key;
int c;
while (*k != 0 && (c = getc(fp)) == *k) {
k++;
}
return
*k == 0 ? 0 :
c == '\n' ? 1 :
c == EOF ? -1 : *k - c;
}
/* case insensitive comparison */
static int
cmp_case(const char *key, FILE *fp) {
const unsigned char *k = (unsigned char *) key;
int c;
while (*k != 0 && tolower(c = getc(fp)) == tolower(*k)) {
k++;
}
return
*k == 0 ? 0 :
c == '\n' ? 1 :
c == EOF ? -1 :
tolower(*k) - tolower(c);
}
/* exact comparison ignoring leading white space */
static int
cmp_exact_white(const char *key, FILE *fp) {
const unsigned char *k = (unsigned char *) key;
int c;
while (isspace(*k)) {
k++;
}
while ((c = getc(fp)) != '\n' && isspace(c))
;
while (*k != 0 && c == *k) {
k++;
c = getc(fp);
}
return
*k == 0 ? 0 :
c == '\n' ? 1 :
c == EOF ? -1 : *k - c;
}
/* case insensitive comparison ignoring leading white space */
static int
cmp_case_white(const char *key, FILE *fp) {
const unsigned char *k = (unsigned char *) key;
int c;
while (isspace(*k)) {
k++;
}
while ((c = getc(fp)) != '\n' && isspace(c))
;
while (*k != 0 && tolower(c) == tolower(*k)) {
k++;
c = getc(fp);
}
return
*k == 0 ? 0 :
c == '\n' ? 1 :
c == EOF ? -1 :
tolower(*k) - tolower(c);
}
/* numeric comparison */
static int
cmp_num(const char *key, FILE *fp) {
int c, i = 0;
char buf[128], *cp = 0;
double low, high, x;
/* read numeric string into buf */
while((c = getc(fp)) != '\n' && c != EOF) {
if (i == 0 && isspace(c)) {
continue;
}
if (i + 1 >= sizeof(buf) ||
(c != '-' && c != '.' && !isdigit(c)))
{
break;
}
buf[i++] = c;
}
buf[i] = 0;
if (c == EOF && i == 0) {
return -1;
}
/* convert to double and use numeric comparison */
x = strtod(buf, 0);
low = high = strtod(key, &cp);
if (*cp == ':') {
high = strtod(cp + 1, 0);
}
return
high < x ? -1 :
low > x ? 1 : 0;
}
static int (*compare)(const char *key, FILE *fp);
/*
* Use binary search to find the first matching line and return
* its byte position.
*/
static off_t
binsrch(const char *key, FILE *fp, int reverse) {
off_t low, med, high, start, prev = -1, ret = -1;
int cmp, c;
struct stat st;
fstat(fileno(fp), &st);
high = st.st_size - 1;
low = 0;
while (low <= high) {
med = (high + low) / 2;
fseeko(fp, med, SEEK_SET);
/* scan to start of next line if not at beginning of file */
if ((start = med) != 0) {
do {
start++;
} while ((c = getc(fp)) != '\n' && c != EOF);
}
/* compare key with current line */
if (start != prev) { /* avoid unnecessary compares */
cmp = compare(key, fp);
if (reverse != 0) {
cmp = -cmp;
}
prev = start;
}
/* eliminate half of input */
if (cmp < 0) {
high = med - 1;
}
else if (cmp > 0) {
low = start + 1;
}
else { /* success, look for earlier match */
ret = start;
high = med - 1;
}
}
return ret;
}
/* print all lines that match the key or else just the number of matches */
static void
printmatch(const char *key, FILE *fp, off_t start,
const char *fname, int cflag)
{
int c, count;
if (start >= 0) {
fseeko(fp, start, SEEK_SET);
}
for (count = 0; start >= 0 && compare(key, fp) == 0; count++) {
fseeko(fp, start, SEEK_SET);
if (cflag == 0 && fname != 0) {
fputs(fname, stdout);
fputc(':', stdout);
}
while ((c = getc(fp)) != EOF) {
start++;
if (cflag == 0) {
fputc(c, stdout);
}
if (c == '\n') {
break;
}
}
if (c == EOF) {
break;
}
}
if (cflag != 0) {
if (fname != 0) {
fputs(fname, stdout);
fputc(':', stdout);
}
printf("%d\n", count);
}
}
int
main(int argc, char **argv) {
FILE *fp;
const char *key = 0;
int i, numfile, status;
int bflag = 0, cflag = 0, iflag = 0, nflag = 0, rflag = 0;
off_t where;
struct stat st;
extern int optind, opterr;
/* parse command line options */
opterr = 0;
while ((i = getopt(argc, argv, "bcfinr")) > 0 && i != '?') {
switch(i) {
case 'b':
bflag++;
break;
case 'c':
cflag++;
break;
case 'f':
case 'i':
iflag++;
nflag = 0;
break;
case 'n':
nflag++;
iflag = 0;
break;
case 'r':
rflag++;
break;
}
}
if (i == '?' || optind >= argc) {
fputs ("Usage: sgrep [ -i | -n ] [ -c ] [ -b ] [ -r ] key "
"[ sorted_file ... ]\n", stderr);
exit(2);
}
i = optind;
key = argv[i++];
/* select the comparison function */
if (iflag != 0) {
compare = bflag == 0 ? cmp_case : cmp_case_white;
}
else if (nflag != 0) {
compare = cmp_num;
}
else {
compare = bflag == 0 ? cmp_exact : cmp_exact_white;
}
/* if no input files, then search stdin */
if ((numfile = argc - i) == 0) {
fstat(fileno(stdin), &st);
if ((st.st_mode & S_IFREG) == 0) {
fputs("sgrep: STDIN is not a regular file\n", stderr);
exit(2);
}
where = binsrch(key, stdin, rflag);
printmatch(key, stdin, where, 0, cflag);
exit(where < 0);
}
/* search each input file */
for (status = 1; i < argc; i++) {
if ((fp = fopen(argv[i], "r")) == 0) {
fprintf(stderr, "sgrep: could not open %s\n", argv[i]);
status = 2;
continue;
}
fstat(fileno(fp), &st);
if ((st.st_mode & S_IFREG) == 0) {
fprintf(stderr, "sgrep: %s is not a regular file\n", argv[i]);
status = 2;
fclose(fp);
continue;
}
where = binsrch(key, fp, rflag);
printmatch(key, fp, where, numfile == 1 ? 0 : argv[i], cflag);
if (status == 1 && where >= 0) {
status = 0;
}
fclose(fp);
}
exit(status);
}