From 98a96822d9dac92de719fa340fe326e1fe0427fe Mon Sep 17 00:00:00 2001 From: Bo Borgerson Date: Sun, 20 Apr 2008 21:24:16 -0400 Subject: [PATCH] comm: ensure that input files are sorted * NEWS: List new behavior. * doc/coreutils.texi (checkOrderOption): New macro for describing `--check-order' and `--nocheck-order', used in both join and comm. * src/comm.c (main): Initialize new options. (usage): Describe new options. (compare_files): Keep an extra pair of buffers for the previous line from each file to check the internal order. (check_order): If an order-check is required, compare and handle the result appropriately. (copylinebuffer): Copy a linebuffer; used for copy before read. * tests/Makefile.am: List new test. * tests/misc/comm: Tests for the comm program, including the new order-checking functionality and attendant command-line options. --- NEWS | 3 + doc/coreutils.texi | 39 +++++++---- src/comm.c | 168 +++++++++++++++++++++++++++++++++++++++------ tests/Makefile.am | 1 + tests/misc/comm | 124 +++++++++++++++++++++++++++++++++ 5 files changed, 303 insertions(+), 32 deletions(-) create mode 100755 tests/misc/comm diff --git a/NEWS b/NEWS index 97f3162a6..ba39d2faf 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,9 @@ GNU coreutils NEWS -*- outline -*- ** New features + comm now verifies that the inputs are in sorted order. This check can + be turned off with the --nocheck-order option. + md5sum now accepts the new option, --quiet, to suppress the printing of 'OK' messages. sha1sum, sha224sum, sha384sum, and sha512sum accept it, too. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index a626b4563..3bedd73c7 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -4449,6 +4449,32 @@ status that does not depend on the result of the comparison. Upon normal completion @command{comm} produces an exit code of zero. If there is an error it exits with nonzero status. 
+@macro checkOrderOption{cmd} +If the @option{--check-order} option is given, unsorted inputs will +cause a fatal error message. If the option @option{--nocheck-order} +is given, unsorted inputs will never cause an error message. If +neither of these options is given, wrongly sorted inputs are diagnosed +only if an input file is found to contain unpairable lines. If an +input file is diagnosed as being unsorted, the @command{\cmd\} command +will exit with a nonzero status (and the output should not be used). + +Forcing @command{\cmd\} to process wrongly sorted input files +containing unpairable lines by specifying @option{--nocheck-order} is +not guaranteed to produce any particular output. The output will +probably not correspond with whatever you hoped it would be. +@end macro +@checkOrderOption{comm} + +@table @samp + +@item --check-order +Fail with an error message if either input file is wrongly ordered. + +@item --nocheck-order +Do not check that both input files are in sorted order. + +@end table + @node tsort invocation @section @command{tsort}: Topological sort @@ -5290,18 +5316,7 @@ c c1 c2 b b1 b2 @end example -If the @option{--check-order} option is given, unsorted inputs will -cause a fatal error message. If the option @option{--nocheck-order} -is given, unsorted inputs will never cause an error message. If -neither of these options is given, wrongly sorted inputs are diagnosed -only if an input file is found to contain unpairable lines. If an -input file is diagnosed as being unsorted, the @command{join} command -will exit with a nonzero status (and the output should not be used). - -Forcing @command{join} to process wrongly sorted input files -containing unpairable lines by specifying @option{--nocheck-order} is -not guaranteed to produce any particular output. The output will -probably not correspond with whatever you hoped it would be. 
+@checkOrderOption{join} The defaults are: @itemize diff --git a/src/comm.c b/src/comm.c index a71d61afc..01c0b8c61 100644 --- a/src/comm.c +++ b/src/comm.c @@ -51,8 +51,31 @@ static bool only_file_2; /* If true, print lines that are found in both files. */ static bool both; +/* If true, we have seen at least one unpairable line. */ +static bool seen_unpairable; + +/* If true, we have warned about disorder in that file. */ +static bool issued_disorder_warning[2]; + +/* Whether to check that the input is correctly ordered. */ +static enum + { + CHECK_ORDER_DEFAULT, + CHECK_ORDER_ENABLED, + CHECK_ORDER_DISABLED + } check_input_order; + +enum +{ + CHECK_ORDER_OPTION = CHAR_MAX + 1, + NOCHECK_ORDER_OPTION +}; + + static struct option const long_options[] = { + {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, + {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -86,6 +109,12 @@ and column three contains lines common to both files.\n\ -1 suppress lines unique to FILE1\n\ -2 suppress lines unique to FILE2\n\ -3 suppress lines that appear in both files\n\ +"), stdout); + fputs (_("\ +\n\ + --check-order check that the input is correctly sorted, even\n\ + if all input lines are pairable\n\ + --nocheck-order do not check that the input is correctly sorted\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -132,6 +161,53 @@ writeline (const struct linebuffer *line, FILE *stream, int class) fwrite (line->buffer, sizeof (char), line->length, stream); } +/* Check that successive input lines PREV and CURRENT from input file + WHATFILE are presented in order. + + If the user specified --nocheck-order, the check is not made. + If the user specified --check-order, the problem is fatal. + Otherwise (the default), the message is simply a warning. + + A message is printed at most once per input file. 
+ + This function was copied (nearly) verbatim from `src/join.c'. */ + +static void +check_order (const struct linebuffer *prev, + const struct linebuffer *current, + int whatfile) +{ + + if (check_input_order != CHECK_ORDER_DISABLED + && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable)) + { + if (!issued_disorder_warning[whatfile - 1]) + { + int order; + + if (hard_LC_COLLATE) + order = xmemcoll (prev->buffer, prev->length - 1, + current->buffer, current->length - 1); + else + { + size_t len = min (prev->length, current->length) - 1; + order = memcmp (prev->buffer, current->buffer, len); + } + + if (order > 0) + { + error ((check_input_order == CHECK_ORDER_ENABLED + ? EXIT_FAILURE : 0), + 0, _("file %d is not in sorted order"), whatfile); + + /* If we get to here, the message was just a warning, but we + want only to issue it once. */ + issued_disorder_warning[whatfile - 1] = true; + } + } + } +} + /* Compare INFILES[0] and INFILES[1]. If either is "-", use the standard input for that file. Assume that each input file is sorted; @@ -140,28 +216,42 @@ writeline (const struct linebuffer *line, FILE *stream, int class) static void compare_files (char **infiles) { - /* For each file, we have one linebuffer in lb1. */ - struct linebuffer lb1[2]; + /* For each file, we have four linebuffers in lba. */ + struct linebuffer lba[2][4]; /* thisline[i] points to the linebuffer holding the next available line in file i, or is NULL if there are no lines left in that file. */ struct linebuffer *thisline[2]; + /* all_line[i][alt[i][0]] also points to the linebuffer holding the + current line in file i. We keep two buffers of history around so we + can look two lines back when we get to the end of a file. */ + struct linebuffer *all_line[2][4]; + + /* This is used to rotate through the buffers for each input file. */ + int alt[2][3]; + /* streams[i] holds the input stream for file i. */ FILE *streams[2]; - int i; + int i, j; /* Initialize the storage. 
*/ for (i = 0; i < 2; i++) { - initbuffer (&lb1[i]); - thisline[i] = &lb1[i]; + for (j = 0; j < 4; j++) + { + initbuffer (&lba[i][j]); + all_line[i][j] = &lba[i][j]; + } + alt[i][0] = 0; + alt[i][1] = 0; + alt[i][2] = 0; streams[i] = (STREQ (infiles[i], "-") ? stdin : fopen (infiles[i], "r")); if (!streams[i]) error (EXIT_FAILURE, errno, "%s", infiles[i]); - thisline[i] = readlinebuffer (thisline[i], streams[i]); + thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]); if (ferror (streams[i])) error (EXIT_FAILURE, errno, "%s", infiles[i]); } @@ -169,6 +259,7 @@ compare_files (char **infiles) while (thisline[0] || thisline[1]) { int order; + bool fill_up[2] = { false, false }; /* Compare the next available lines of the two files. */ @@ -195,25 +286,47 @@ compare_files (char **infiles) /* Output the line that is lesser. */ if (order == 0) writeline (thisline[1], stdout, 3); - else if (order > 0) - writeline (thisline[1], stdout, 2); else - writeline (thisline[0], stdout, 1); + { + seen_unpairable = true; + if (order > 0) + writeline (thisline[1], stdout, 2); + else + writeline (thisline[0], stdout, 1); + } /* Step the file the line came from. If the files match, step both files. */ if (order >= 0) - { - thisline[1] = readlinebuffer (thisline[1], streams[1]); - if (ferror (streams[1])) - error (EXIT_FAILURE, errno, "%s", infiles[1]); - } + fill_up[1] = true; if (order <= 0) - { - thisline[0] = readlinebuffer (thisline[0], streams[0]); - if (ferror (streams[0])) - error (EXIT_FAILURE, errno, "%s", infiles[0]); - } + fill_up[0] = true; + + for (i = 0; i < 2; i++) + if (fill_up[i]) + { + /* Rotate the buffers for this file. 
*/ + alt[i][2] = alt[i][1]; + alt[i][1] = alt[i][0]; + alt[i][0] = (alt[i][0] + 1) & 0x03; + + thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]); + + if (thisline[i]) + check_order (all_line[i][alt[i][1]], thisline[i], i + 1); + + /* If this is the end of the file we may need to re-check + the order of the previous two lines, since we might have + discovered an unpairable match since we checked before. */ + else if (all_line[i][alt[i][2]]->buffer) + check_order (all_line[i][alt[i][2]], + all_line[i][alt[i][1]], i + 1); + + if (ferror (streams[i])) + error (EXIT_FAILURE, errno, "%s", infiles[i]); + + fill_up[i] = false; + } } for (i = 0; i < 2; i++) @@ -239,6 +352,10 @@ main (int argc, char **argv) only_file_2 = true; both = true; + seen_unpairable = false; + issued_disorder_warning[0] = issued_disorder_warning[1] = false; + check_input_order = CHECK_ORDER_DEFAULT; + while ((c = getopt_long (argc, argv, "123", long_options, NULL)) != -1) switch (c) { @@ -254,6 +371,14 @@ main (int argc, char **argv) both = false; break; + case NOCHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_DISABLED; + break; + + case CHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_ENABLED; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); @@ -279,5 +404,8 @@ main (int argc, char **argv) compare_files (argv + optind); - exit (EXIT_SUCCESS); + if (issued_disorder_warning[0] || issued_disorder_warning[1]) + exit (EXIT_FAILURE); + else + exit (EXIT_SUCCESS); } diff --git a/tests/Makefile.am b/tests/Makefile.am index d33005a5f..f07837b5d 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -146,6 +146,7 @@ TESTS = \ misc/base64 \ misc/basename \ misc/close-stdout \ + misc/comm \ misc/csplit \ misc/date-sec \ misc/dircolors \ diff --git a/tests/misc/comm b/tests/misc/comm new file mode 100755 index 000000000..39557d2c1 --- /dev/null +++ b/tests/misc/comm @@ -0,0 +1,124 @@ +#!/usr/bin/perl +# -*- perl -*- +# Test comm + +# Copyright (C) 
2008 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +require 5.003; +use strict; + +(my $program_name = $0) =~ s|.*/||; + +my $prog = 'comm'; + +# Turn off localization of executable's output. +@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; + +my @inputs = ({IN=>{a=>"1\n3"}}, {IN=>{b=>"2\n3"}}); + +my @Tests = + ( + # basic operation + ['basic', @inputs, {OUT=>"1\n\t2\n\t\t3\n"} ], + + # suppress lines unique to file 1 + ['opt-1', '-1', @inputs, {OUT=>"2\n\t3\n"} ], + + # suppress lines unique to file 2 + ['opt-2', '-2', @inputs, {OUT=>"1\n\t3\n"} ], + + # suppress lines that appear in both files + ['opt-3', '-3', @inputs, {OUT=>"1\n\t2\n"} ], + + # suppress lines unique to file 1 and lines unique to file 2 + ['opt-12', '-1', '-2', @inputs, {OUT=>"3\n"} ], + + # suppress lines unique to file 1 and those that appear in both files + ['opt-13', '-1', '-3', @inputs, {OUT=>"2\n"} ], + + # suppress lines unique to file 2 and those that appear in both files + ['opt-23', '-2', '-3', @inputs, {OUT=>"1\n"} ], + + # suppress all output (really?) + ['opt-123', '-1', '-2', '-3', @inputs, {OUT=>""} ], + + # invalid missing command line argument (1) + ['missing-arg1', $inputs[0], {EXIT=>1}, + {ERR => "$prog: missing operand after `a'\n" + . 
"Try `$prog --help' for more information.\n"}], + + # invalid missing command line argument (both) + ['missing-arg2', {EXIT=>1}, + {ERR => "$prog: missing operand\n" + . "Try `$prog --help' for more information.\n"}], + + # invalid extra command line argument + ['extra-arg', @inputs, 'no-such', {EXIT=>1}, + {ERR => "$prog: extra operand `no-such'\n" + . "Try `$prog --help' for more information.\n"}], + + # out-of-order input + ['ooo', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1}, + {OUT => "1\n\t\t3\n\t2\n"}, + {ERR => "$prog: file 2 is not in sorted order\n"}], + + # out-of-order input, fatal + ['ooo2', '--check-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1}, + {OUT => "1\n\t\t3\n"}, + {ERR => "$prog: file 2 is not in sorted order\n"}], + + # out-of-order input, ignored + ['ooo3', '--nocheck-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, + {OUT => "1\n\t\t3\n\t2\n"}], + + # both inputs out-of-order + ['ooo4', {IN=>{a=>"3\n1\n0"}}, {IN=>{b=>"3\n2\n0"}}, {EXIT=>1}, + {OUT => "\t\t3\n1\n0\n\t2\n\t0\n"}, + {ERR => "$prog: file 1 is not in sorted order\n". + "$prog: file 2 is not in sorted order\n" }], + + # both inputs out-of-order on last pair + ['ooo5', {IN=>{a=>"3\n1"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1}, + {OUT => "\t\t3\n1\n\t2\n"}, + {ERR => "$prog: file 1 is not in sorted order\n". 
+ "$prog: file 2 is not in sorted order\n" }], + + # first input out-of-order extended + ['ooo5b', {IN=>{a=>"0\n3\n1"}}, {IN=>{b=>"2\n3"}}, {EXIT=>1}, + {OUT => "0\n\t2\n\t\t3\n1\n"}, + {ERR => "$prog: file 1 is not in sorted order\n"}], + + # second input out-of-order extended + ['ooo5c', {IN=>{a=>"0\n3"}}, {IN=>{b=>"2\n3\n1"}}, {EXIT=>1}, + {OUT => "0\n\t2\n\t\t3\n\t1\n"}, + {ERR => "$prog: file 2 is not in sorted order\n"}], + + # both inputs out-of-order, but fully pairable + ['ooo6', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}}, {EXIT=>0}, + {OUT => "\t\t2\n\t\t1\n\t\t0\n"}], + + # both inputs out-of-order, fully pairable, but forced to fail + ['ooo7', '--check-order', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}}, + {EXIT=>1}, + {OUT => "\t\t2\n"}, + {ERR => "$prog: file 1 is not in sorted order\n"}], + ); + +my $save_temps = $ENV{DEBUG}; +my $verbose = $ENV{VERBOSE}; + +my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); +exit $fail;