summaryrefslogtreecommitdiff
path: root/usr.bin/sort/msort.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr.bin/sort/msort.c')
-rw-r--r--usr.bin/sort/msort.c431
1 files changed, 431 insertions, 0 deletions
diff --git a/usr.bin/sort/msort.c b/usr.bin/sort/msort.c
new file mode 100644
index 0000000..6cbcf90
--- /dev/null
+++ b/usr.bin/sort/msort.c
@@ -0,0 +1,431 @@
+/* $NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $ */
+
+/*-
+ * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Ben Harris and Jaromir Dolecek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Peter McIlroy.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "sort.h"
+#include "fsort.h"
+
+__RCSID("$NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $");
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <util.h>
+
+/* Subroutines using comparisons: merge sort and check order */
+#define DELETE (1)
+
+typedef struct mfile {
+ FILE *fp;
+ get_func_t get;
+ RECHEADER *rec;
+ u_char *end;
+} MFILE;
+
+static int cmp(RECHEADER *, RECHEADER *);
+static int insert(struct mfile **, struct mfile *, int, int);
+static void merge_sort_fstack(FILE *, put_func_t, struct field *);
+
+/*
+ * Number of files merge() can merge in one pass.
+ */
+#define MERGE_FNUM 16
+
+static struct mfile fstack[MERGE_FNUM];
+static struct mfile fstack_1[MERGE_FNUM];
+static struct mfile fstack_2[MERGE_FNUM];
+static int fstack_count, fstack_1_count, fstack_2_count;
+
+void
+save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
+{
+ FILE *mfp, *mfp1, *mfp2;
+
+ if (fstack_count == MERGE_FNUM) {
+ /* Must reduce the number of temporary files */
+ mfp = ftmp();
+ merge_sort_fstack(mfp, putrec, ftbl);
+ /* Save output in next layer */
+ if (fstack_1_count == MERGE_FNUM) {
+ mfp1 = ftmp();
+ memcpy(fstack, fstack_1, sizeof fstack);
+ merge_sort_fstack(mfp1, putrec, ftbl);
+ if (fstack_2_count == MERGE_FNUM) {
+ /* More than 4096 files! */
+ mfp2 = ftmp();
+ memcpy(fstack, fstack_2, sizeof fstack);
+ merge_sort_fstack(mfp2, putrec, ftbl);
+ fstack_2[0].fp = mfp2;
+ fstack_2_count = 1;
+ }
+ fstack_2[fstack_2_count].fp = mfp1;
+ fstack_2[fstack_2_count].get = geteasy;
+ fstack_2_count++;
+ fstack_1_count = 0;
+ }
+ fstack_1[fstack_1_count].fp = mfp;
+ fstack_1[fstack_1_count].get = geteasy;
+ fstack_1_count++;
+ fstack_count = 0;
+ }
+
+ fstack[fstack_count].fp = fp;
+ fstack[fstack_count++].get = get;
+}
+
+void
+fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
+{
+ get_func_t get = SINGL_FLD ? makeline : makekey;
+ FILE *fp;
+ int i;
+
+ for (i = 0; i < nfiles; i++) {
+ fp = fopen(filelist->names[i], "r");
+ if (fp == NULL)
+ err(2, "%s", filelist->names[i]);
+ save_for_merge(fp, get, ftbl);
+ }
+
+ merge_sort(outfp, putline, ftbl);
+}
+
+void
+merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
+{
+ int count = fstack_1_count + fstack_2_count;
+ FILE *mfp;
+ int i;
+
+ if (count == 0) {
+ /* All files in initial array */
+ merge_sort_fstack(outfp, put, ftbl);
+ return;
+ }
+
+ count += fstack_count;
+
+ /* Too many files for one merge sort */
+ for (;;) {
+ /* Sort latest 16 files */
+ i = count;
+ if (i > MERGE_FNUM)
+ i = MERGE_FNUM;
+ while (fstack_count > 0)
+ fstack[--i] = fstack[--fstack_count];
+ while (i > 0 && fstack_1_count > 0)
+ fstack[--i] = fstack_1[--fstack_1_count];
+ while (i > 0)
+ fstack[--i] = fstack_2[--fstack_2_count];
+ if (count <= MERGE_FNUM) {
+ /* Got all the data */
+ fstack_count = count;
+ merge_sort_fstack(outfp, put, ftbl);
+ return;
+ }
+ mfp = ftmp();
+ fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
+ merge_sort_fstack(mfp, putrec, ftbl);
+ fstack[0].fp = mfp;
+ fstack[0].get = geteasy;
+ fstack_count = 1;
+ count -= MERGE_FNUM - 1;
+ }
+}
+
+static void
+merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
+{
+ struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
+ RECHEADER *new_rec;
+ u_char *new_end;
+ void *tmp;
+ int c, i, nfiles;
+ size_t sz;
+
+ /* Read one record from each file (read again if a duplicate) */
+ for (nfiles = i = 0; i < fstack_count; i++) {
+ cfile = &fstack[i];
+ if (cfile->rec == NULL) {
+ cfile->rec = allocrec(NULL, DEFLLEN);
+ cfile->end = (u_char *)cfile->rec + DEFLLEN;
+ }
+ rewind(cfile->fp);
+
+ for (;;) {
+ c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
+ if (c == EOF)
+ break;
+
+ if (c == BUFFEND) {
+ /* Double buffer size */
+ sz = (cfile->end - (u_char *)cfile->rec) * 2;
+ cfile->rec = allocrec(cfile->rec, sz);
+ cfile->end = (u_char *)cfile->rec + sz;
+ continue;
+ }
+
+ if (nfiles != 0) {
+ if (insert(flist, cfile, nfiles, !DELETE))
+ /* Duplicate removed */
+ continue;
+ } else
+ flist[0] = cfile;
+ nfiles++;
+ break;
+ }
+ }
+
+ if (nfiles == 0)
+ return;
+
+ /*
+ * We now loop reading a new record from the file with the
+ * 'sorted first' existing record.
+ * As each record is added, the 'first' record is written to the
+ * output file - maintaining one record from each file in the sorted
+ * list.
+ */
+ new_rec = allocrec(NULL, DEFLLEN);
+ new_end = (u_char *)new_rec + DEFLLEN;
+ for (;;) {
+ cfile = flist[0];
+ c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
+ if (c == EOF) {
+ /* Write out last record from now-empty input */
+ put(cfile->rec, outfp);
+ if (--nfiles == 0)
+ break;
+ /* Replace from file with now-first sorted record. */
+ /* (Moving base 'flist' saves copying everything!) */
+ flist++;
+ continue;
+ }
+ if (c == BUFFEND) {
+ /* Buffer not large enough - double in size */
+ sz = (new_end - (u_char *)new_rec) * 2;
+ new_rec = allocrec(new_rec, sz);
+ new_end = (u_char *)new_rec +sz;
+ continue;
+ }
+
+ /* Swap in new buffer, saving old */
+ tmp = cfile->rec;
+ cfile->rec = new_rec;
+ new_rec = tmp;
+ tmp = cfile->end;
+ cfile->end = new_end;
+ new_end = tmp;
+
+ /* Add into sort, removing the original first entry */
+ c = insert(flist, cfile, nfiles, DELETE);
+ if (c != 0 || (UNIQUE && cfile == flist[0]
+ && cmp(new_rec, cfile->rec) == 0)) {
+ /* Was an unwanted duplicate, restore buffer */
+ tmp = cfile->rec;
+ cfile->rec = new_rec;
+ new_rec = tmp;
+ tmp = cfile->end;
+ cfile->end = new_end;
+ new_end = tmp;
+ continue;
+ }
+
+ /* Write out 'old' record */
+ put(new_rec, outfp);
+ }
+
+ free(new_rec);
+}
+
+/*
+ * if delete: inserts rec in flist, deletes flist[0];
+ * otherwise just inserts *rec in flist.
+ * Returns 1 if record is a duplicate to be ignored.
+ */
+static int
+insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
+{
+ int mid, top = ttop, bot = 0, cmpv = 1;
+
+ for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
+ cmpv = cmp(rec->rec, flist[mid]->rec);
+ if (cmpv == 0 ) {
+ if (UNIQUE)
+ /* Duplicate key, read another record */
+ /* NB: This doesn't guarantee to keep any
+ * particular record. */
+ return 1;
+ /*
+ * Apply sort by input file order.
+ * We could truncate the sort is the fileno are
+ * adjacent - but that is all too hard!
+ * The fileno cannot be equal, since we only have one
+ * record from each file (+ flist[0] which never
+ * comes here).
+ */
+ cmpv = rec < flist[mid] ? -1 : 1;
+ if (REVERSE)
+ cmpv = -cmpv;
+ }
+ if (cmpv < 0)
+ top = mid;
+ else
+ bot = mid;
+ }
+
+ /* At this point we haven't yet compared against flist[0] */
+
+ if (delete) {
+ /* flist[0] is ourselves, only the caller knows the old data */
+ if (bot != 0) {
+ memmove(flist, flist + 1, bot * sizeof(MFILE *));
+ flist[bot] = rec;
+ }
+ return 0;
+ }
+
+ /* Inserting original set of records */
+
+ if (bot == 0 && cmpv != 0) {
+ /* Doesn't match flist[1], must compare with flist[0] */
+ cmpv = cmp(rec->rec, flist[0]->rec);
+ if (cmpv == 0 && UNIQUE)
+ return 1;
+ /* Add matching keys in file order (ie new is later) */
+ if (cmpv < 0)
+ bot = -1;
+ }
+ bot++;
+ memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
+ flist[bot] = rec;
+ return 0;
+}
+
+/*
+ * check order on one file
+ */
+void
+order(struct filelist *filelist, struct field *ftbl, int quiet)
+{
+ get_func_t get = SINGL_FLD ? makeline : makekey;
+ RECHEADER *crec, *prec, *trec;
+ u_char *crec_end, *prec_end, *trec_end;
+ FILE *fp;
+ int c;
+
+ fp = fopen(filelist->names[0], "r");
+ if (fp == NULL)
+ err(2, "%s", filelist->names[0]);
+
+ crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
+ crec_end = crec->data + DEFLLEN;
+ prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
+ prec_end = prec->data + DEFLLEN;
+
+ /* XXX this does exit(0) for overlong lines */
+ if (get(fp, prec, prec_end, ftbl) != 0)
+ exit(0);
+ while (get(fp, crec, crec_end, ftbl) == 0) {
+ if (0 < (c = cmp(prec, crec))) {
+ if (quiet)
+ exit(1);
+ crec->data[crec->length-1] = 0;
+ errx(1, "found disorder: %s", crec->data+crec->offset);
+ }
+ if (UNIQUE && !c) {
+ if (quiet)
+ exit(1);
+ crec->data[crec->length-1] = 0;
+ errx(1, "found non-uniqueness: %s",
+ crec->data+crec->offset);
+ }
+ /*
+ * Swap pointers so that this record is on place pointed
+ * to by prec and new record is read to place pointed to by
+ * crec.
+ */
+ trec = prec;
+ prec = crec;
+ crec = trec;
+ trec_end = prec_end;
+ prec_end = crec_end;
+ crec_end = trec_end;
+ }
+ exit(0);
+}
+
+static int
+cmp(RECHEADER *rec1, RECHEADER *rec2)
+{
+ int len;
+ int r;
+
+ /* key is weights */
+ len = min(rec1->keylen, rec2->keylen);
+ r = memcmp(rec1->data, rec2->data, len);
+ if (r == 0)
+ r = rec1->keylen - rec2->keylen;
+ if (REVERSE)
+ r = -r;
+ return r;
+}