diff options
author | Kiyoshi Aman <kiyoshi.aman+adelie@gmail.com> | 2019-02-01 22:55:37 +0000 |
---|---|---|
committer | Kiyoshi Aman <kiyoshi.aman+adelie@gmail.com> | 2019-02-03 18:22:05 -0600 |
commit | 5b57d28ffb6e1ef86b50f7d05d977826eae89bfe (patch) | |
tree | 154a22fe556b49e6927197336f8bf91b12eacd5e /usr.bin/sort/msort.c | |
download | userland-5b57d28ffb6e1ef86b50f7d05d977826eae89bfe.tar.gz userland-5b57d28ffb6e1ef86b50f7d05d977826eae89bfe.tar.bz2 userland-5b57d28ffb6e1ef86b50f7d05d977826eae89bfe.tar.xz userland-5b57d28ffb6e1ef86b50f7d05d977826eae89bfe.zip |
initial population
Diffstat (limited to 'usr.bin/sort/msort.c')
-rw-r--r-- | usr.bin/sort/msort.c | 431 |
1 files changed, 431 insertions, 0 deletions
diff --git a/usr.bin/sort/msort.c b/usr.bin/sort/msort.c new file mode 100644 index 0000000..6cbcf90 --- /dev/null +++ b/usr.bin/sort/msort.c @@ -0,0 +1,431 @@ +/* $NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $ */ + +/*- + * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Ben Harris and Jaromir Dolecek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "sort.h" +#include "fsort.h" + +__RCSID("$NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $"); + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <util.h> + +/* Subroutines using comparisons: merge sort and check order */ +#define DELETE (1) + +typedef struct mfile { + FILE *fp; + get_func_t get; + RECHEADER *rec; + u_char *end; +} MFILE; + +static int cmp(RECHEADER *, RECHEADER *); +static int insert(struct mfile **, struct mfile *, int, int); +static void merge_sort_fstack(FILE *, put_func_t, struct field *); + +/* + * Number of files merge() can merge in one pass. + */ +#define MERGE_FNUM 16 + +static struct mfile fstack[MERGE_FNUM]; +static struct mfile fstack_1[MERGE_FNUM]; +static struct mfile fstack_2[MERGE_FNUM]; +static int fstack_count, fstack_1_count, fstack_2_count; + +void +save_for_merge(FILE *fp, get_func_t get, struct field *ftbl) +{ + FILE *mfp, *mfp1, *mfp2; + + if (fstack_count == MERGE_FNUM) { + /* Must reduce the number of temporary files */ + mfp = ftmp(); + merge_sort_fstack(mfp, putrec, ftbl); + /* Save output in next layer */ + if (fstack_1_count == MERGE_FNUM) { + mfp1 = ftmp(); + memcpy(fstack, fstack_1, sizeof fstack); + merge_sort_fstack(mfp1, putrec, ftbl); + if (fstack_2_count == MERGE_FNUM) { + /* More than 4096 files! */ + mfp2 = ftmp(); + memcpy(fstack, fstack_2, sizeof fstack); + merge_sort_fstack(mfp2, putrec, ftbl); + fstack_2[0].fp = mfp2; + fstack_2_count = 1; + } + fstack_2[fstack_2_count].fp = mfp1; + fstack_2[fstack_2_count].get = geteasy; + fstack_2_count++; + fstack_1_count = 0; + } + fstack_1[fstack_1_count].fp = mfp; + fstack_1[fstack_1_count].get = geteasy; + fstack_1_count++; + fstack_count = 0; + } + + fstack[fstack_count].fp = fp; + fstack[fstack_count++].get = get; +} + +void +fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl) +{ + get_func_t get = SINGL_FLD ? makeline : makekey; + FILE *fp; + int i; + + for (i = 0; i < nfiles; i++) { + fp = fopen(filelist->names[i], "r"); + if (fp == NULL) + err(2, "%s", filelist->names[i]); + save_for_merge(fp, get, ftbl); + } + + merge_sort(outfp, putline, ftbl); +} + +void +merge_sort(FILE *outfp, put_func_t put, struct field *ftbl) +{ + int count = fstack_1_count + fstack_2_count; + FILE *mfp; + int i; + + if (count == 0) { + /* All files in initial array */ + merge_sort_fstack(outfp, put, ftbl); + return; + } + + count += fstack_count; + + /* Too many files for one merge sort */ + for (;;) { + /* Sort latest 16 files */ + i = count; + if (i > MERGE_FNUM) + i = MERGE_FNUM; + while (fstack_count > 0) + fstack[--i] = fstack[--fstack_count]; + while (i > 0 && fstack_1_count > 0) + fstack[--i] = fstack_1[--fstack_1_count]; + while (i > 0) + fstack[--i] = fstack_2[--fstack_2_count]; + if (count <= MERGE_FNUM) { + /* Got all the data */ + fstack_count = count; + merge_sort_fstack(outfp, put, ftbl); + return; + } + mfp = ftmp(); + fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count; + merge_sort_fstack(mfp, putrec, ftbl); + fstack[0].fp = mfp; + fstack[0].get = geteasy; + fstack_count = 1; + count -= MERGE_FNUM - 1; + } +} + +static void +merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl) +{ + struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile; + RECHEADER *new_rec; + u_char *new_end; + void *tmp; + int c, i, nfiles; + size_t sz; + + /* Read one record from each file (read again if a duplicate) */ + for (nfiles = i = 0; i < fstack_count; i++) { + cfile = &fstack[i]; + if (cfile->rec == NULL) { + cfile->rec = allocrec(NULL, DEFLLEN); + cfile->end = (u_char *)cfile->rec + DEFLLEN; + } + rewind(cfile->fp); + + for (;;) { + c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl); + if (c == EOF) + break; + + if (c == BUFFEND) { + /* Double buffer size */ + sz = (cfile->end - (u_char *)cfile->rec) * 2; + cfile->rec = allocrec(cfile->rec, sz); + cfile->end = (u_char *)cfile->rec + sz; + continue; + } + + if (nfiles != 0) { + if (insert(flist, cfile, nfiles, !DELETE)) + /* Duplicate removed */ + continue; + } else + flist[0] = cfile; + nfiles++; + break; + } + } + + if (nfiles == 0) + return; + + /* + * We now loop reading a new record from the file with the + * 'sorted first' existing record. + * As each record is added, the 'first' record is written to the + * output file - maintaining one record from each file in the sorted + * list. + */ + new_rec = allocrec(NULL, DEFLLEN); + new_end = (u_char *)new_rec + DEFLLEN; + for (;;) { + cfile = flist[0]; + c = cfile->get(cfile->fp, new_rec, new_end, ftbl); + if (c == EOF) { + /* Write out last record from now-empty input */ + put(cfile->rec, outfp); + if (--nfiles == 0) + break; + /* Replace from file with now-first sorted record. */ + /* (Moving base 'flist' saves copying everything!) */ + flist++; + continue; + } + if (c == BUFFEND) { + /* Buffer not large enough - double in size */ + sz = (new_end - (u_char *)new_rec) * 2; + new_rec = allocrec(new_rec, sz); + new_end = (u_char *)new_rec +sz; + continue; + } + + /* Swap in new buffer, saving old */ + tmp = cfile->rec; + cfile->rec = new_rec; + new_rec = tmp; + tmp = cfile->end; + cfile->end = new_end; + new_end = tmp; + + /* Add into sort, removing the original first entry */ + c = insert(flist, cfile, nfiles, DELETE); + if (c != 0 || (UNIQUE && cfile == flist[0] + && cmp(new_rec, cfile->rec) == 0)) { + /* Was an unwanted duplicate, restore buffer */ + tmp = cfile->rec; + cfile->rec = new_rec; + new_rec = tmp; + tmp = cfile->end; + cfile->end = new_end; + new_end = tmp; + continue; + } + + /* Write out 'old' record */ + put(new_rec, outfp); + } + + free(new_rec); +} + +/* + * if delete: inserts rec in flist, deletes flist[0]; + * otherwise just inserts *rec in flist. + * Returns 1 if record is a duplicate to be ignored. + */ +static int +insert(struct mfile **flist, struct mfile *rec, int ttop, int delete) +{ + int mid, top = ttop, bot = 0, cmpv = 1; + + for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) { + cmpv = cmp(rec->rec, flist[mid]->rec); + if (cmpv == 0 ) { + if (UNIQUE) + /* Duplicate key, read another record */ + /* NB: This doesn't guarantee to keep any + * particular record. */ + return 1; + /* + * Apply sort by input file order. + * We could truncate the sort is the fileno are + * adjacent - but that is all too hard! + * The fileno cannot be equal, since we only have one + * record from each file (+ flist[0] which never + * comes here). + */ + cmpv = rec < flist[mid] ? -1 : 1; + if (REVERSE) + cmpv = -cmpv; + } + if (cmpv < 0) + top = mid; + else + bot = mid; + } + + /* At this point we haven't yet compared against flist[0] */ + + if (delete) { + /* flist[0] is ourselves, only the caller knows the old data */ + if (bot != 0) { + memmove(flist, flist + 1, bot * sizeof(MFILE *)); + flist[bot] = rec; + } + return 0; + } + + /* Inserting original set of records */ + + if (bot == 0 && cmpv != 0) { + /* Doesn't match flist[1], must compare with flist[0] */ + cmpv = cmp(rec->rec, flist[0]->rec); + if (cmpv == 0 && UNIQUE) + return 1; + /* Add matching keys in file order (ie new is later) */ + if (cmpv < 0) + bot = -1; + } + bot++; + memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *)); + flist[bot] = rec; + return 0; +} + +/* + * check order on one file + */ +void +order(struct filelist *filelist, struct field *ftbl, int quiet) +{ + get_func_t get = SINGL_FLD ? makeline : makekey; + RECHEADER *crec, *prec, *trec; + u_char *crec_end, *prec_end, *trec_end; + FILE *fp; + int c; + + fp = fopen(filelist->names[0], "r"); + if (fp == NULL) + err(2, "%s", filelist->names[0]); + + crec = malloc(offsetof(RECHEADER, data[DEFLLEN])); + crec_end = crec->data + DEFLLEN; + prec = malloc(offsetof(RECHEADER, data[DEFLLEN])); + prec_end = prec->data + DEFLLEN; + + /* XXX this does exit(0) for overlong lines */ + if (get(fp, prec, prec_end, ftbl) != 0) + exit(0); + while (get(fp, crec, crec_end, ftbl) == 0) { + if (0 < (c = cmp(prec, crec))) { + if (quiet) + exit(1); + crec->data[crec->length-1] = 0; + errx(1, "found disorder: %s", crec->data+crec->offset); + } + if (UNIQUE && !c) { + if (quiet) + exit(1); + crec->data[crec->length-1] = 0; + errx(1, "found non-uniqueness: %s", + crec->data+crec->offset); + } + /* + * Swap pointers so that this record is on place pointed + * to by prec and new record is read to place pointed to by + * crec. + */ + trec = prec; + prec = crec; + crec = trec; + trec_end = prec_end; + prec_end = crec_end; + crec_end = trec_end; + } + exit(0); +} + +static int +cmp(RECHEADER *rec1, RECHEADER *rec2) +{ + int len; + int r; + + /* key is weights */ + len = min(rec1->keylen, rec2->keylen); + r = memcmp(rec1->data, rec2->data, len); + if (r == 0) + r = rec1->keylen - rec2->keylen; + if (REVERSE) + r = -r; + return r; +} |