comm: ensure that input files are sorted

* NEWS: List new behavior.
* doc/coreutils.texi (checkOrderOption) New macro for
describing `--check-order' and `--nocheck-order', used in
both join and comm.
* src/comm.c (main): Initialize new options.
(usage): Describe new options.
(compare_files): Keep an extra pair of buffers for the previous
line from each file to check the internal order.
(check_order): If an order-check is required, compare and handle
the result appropriately.
(copylinebuffer): Copy a linebuffer; used for copy before read.
* tests/misc/Makefile.am: List new test.
* tests/misc/comm: Tests for the comm program, including the
new order-checking functionality and attendant command-line options.
This commit is contained in:
Bo Borgerson 2008-04-20 21:24:16 -04:00 committed by Jim Meyering
parent 5f47278372
commit 98a96822d9
5 changed files with 303 additions and 32 deletions

3
NEWS
View File

@ -9,6 +9,9 @@ GNU coreutils NEWS -*- outline -*-
** New features
comm now verifies that the inputs are in sorted order. This check can
be turned off with the --nocheck-order option.
md5sum now accepts the new option, --quiet, to suppress the printing of
'OK' messages. sha1sum, sha224sum, sha384sum, and sha512sum accept it, too.

View File

@ -4449,6 +4449,32 @@ status that does not depend on the result of the comparison.
Upon normal completion @command{comm} produces an exit code of zero.
If there is an error it exits with nonzero status.
@macro checkOrderOption{cmd}
If the @option{--check-order} option is given, unsorted inputs will
cause a fatal error message. If the option @option{--nocheck-order}
is given, unsorted inputs will never cause an error message. If
neither of these options is given, wrongly sorted inputs are diagnosed
only if an input file is found to contain unpairable lines. If an
input file is diagnosed as being unsorted, the @command{\cmd\} command
will exit with a nonzero status (and the output should not be used).
Forcing @command{\cmd\} to process wrongly sorted input files
containing unpairable lines by specifying @option{--nocheck-order} is
not guaranteed to produce any particular output. The output will
probably not correspond with whatever you hoped it would be.
@end macro
@checkOrderOption{comm}
@table @samp
@item --check-order
Fail with an error message if either input file is wrongly ordered.
@item --nocheck-order
Do not check that both input files are in sorted order.
@end table
@node tsort invocation
@section @command{tsort}: Topological sort
@ -5290,18 +5316,7 @@ c c1 c2
b b1 b2
@end example
If the @option{--check-order} option is given, unsorted inputs will
cause a fatal error message. If the option @option{--nocheck-order}
is given, unsorted inputs will never cause an error message. If
neither of these options is given, wrongly sorted inputs are diagnosed
only if an input file is found to contain unpairable lines. If an
input file is diagnosed as being unsorted, the @command{join} command
will exit with a nonzero status (and the output should not be used).
Forcing @command{join} to process wrongly sorted input files
containing unpairable lines by specifying @option{--nocheck-order} is
not guaranteed to produce any particular output. The output will
probably not correspond with whatever you hoped it would be.
@checkOrderOption{join}
The defaults are:
@itemize

View File

@ -51,8 +51,31 @@ static bool only_file_2;
/* If true, print lines that are found in both files. */
static bool both;
/* If nonzero, we have seen at least one unpairable line. */
static bool seen_unpairable;
/* If nonzero, we have warned about disorder in that file. */
static bool issued_disorder_warning[2];
/* If nonzero, check that the input is correctly ordered. */
static enum
{
CHECK_ORDER_DEFAULT,
CHECK_ORDER_ENABLED,
CHECK_ORDER_DISABLED
} check_input_order;
enum
{
CHECK_ORDER_OPTION = CHAR_MAX + 1,
NOCHECK_ORDER_OPTION
};
static struct option const long_options[] =
{
{"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
{"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@ -86,6 +109,12 @@ and column three contains lines common to both files.\n\
-1 suppress lines unique to FILE1\n\
-2 suppress lines unique to FILE2\n\
-3 suppress lines that appear in both files\n\
"), stdout);
fputs (_("\
\n\
--check-order check that the input is correctly sorted, even\n\
if all input lines are pairable\n\
--nocheck-order do not check that the input is correctly sorted\n\
"), stdout);
fputs (HELP_OPTION_DESCRIPTION, stdout);
fputs (VERSION_OPTION_DESCRIPTION, stdout);
@ -132,6 +161,53 @@ writeline (const struct linebuffer *line, FILE *stream, int class)
fwrite (line->buffer, sizeof (char), line->length, stream);
}
/* Check that successive input lines PREV and CURRENT from input file
WHATFILE are presented in order.
If the user specified --nocheck-order, the check is not made.
If the user specified --check-order, the problem is fatal.
Otherwise (the default), the message is simply a warning.
A message is printed at most once per input file.
This funtion was copied (nearly) verbatim from `src/join.c'. */
static void
check_order (const struct linebuffer *prev,
const struct linebuffer *current,
int whatfile)
{
if (check_input_order != CHECK_ORDER_DISABLED
&& ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
{
if (!issued_disorder_warning[whatfile - 1])
{
int order;
if (hard_LC_COLLATE)
order = xmemcoll (prev->buffer, prev->length - 1,
current->buffer, current->length - 1);
else
{
size_t len = min (prev->length, current->length) - 1;
order = memcmp (prev->buffer, current->buffer, len);
}
if (order > 0)
{
error ((check_input_order == CHECK_ORDER_ENABLED
? EXIT_FAILURE : 0),
0, _("file %d is not in sorted order"), whatfile);
/* If we get to here, the message was just a warning, but we
want only to issue it once. */
issued_disorder_warning[whatfile - 1] = true;
}
}
}
}
/* Compare INFILES[0] and INFILES[1].
If either is "-", use the standard input for that file.
Assume that each input file is sorted;
@ -140,28 +216,42 @@ writeline (const struct linebuffer *line, FILE *stream, int class)
static void
compare_files (char **infiles)
{
/* For each file, we have one linebuffer in lb1. */
struct linebuffer lb1[2];
/* For each file, we have four linebuffers in lba. */
struct linebuffer lba[2][4];
/* thisline[i] points to the linebuffer holding the next available line
in file i, or is NULL if there are no lines left in that file. */
struct linebuffer *thisline[2];
/* all_line[i][alt[i][0]] also points to the linebuffer holding the
current line in file i. We keep two buffers of history around so we
can look two lines back when we get to the end of a file. */
struct linebuffer *all_line[2][4];
/* This is used to rotate through the buffers for each input file. */
int alt[2][3];
/* streams[i] holds the input stream for file i. */
FILE *streams[2];
int i;
int i, j;
/* Initialize the storage. */
for (i = 0; i < 2; i++)
{
initbuffer (&lb1[i]);
thisline[i] = &lb1[i];
for (j = 0; j < 4; j++)
{
initbuffer (&lba[i][j]);
all_line[i][j] = &lba[i][j];
}
alt[i][0] = 0;
alt[i][1] = 0;
alt[i][2] = 0;
streams[i] = (STREQ (infiles[i], "-") ? stdin : fopen (infiles[i], "r"));
if (!streams[i])
error (EXIT_FAILURE, errno, "%s", infiles[i]);
thisline[i] = readlinebuffer (thisline[i], streams[i]);
thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]);
if (ferror (streams[i]))
error (EXIT_FAILURE, errno, "%s", infiles[i]);
}
@ -169,6 +259,7 @@ compare_files (char **infiles)
while (thisline[0] || thisline[1])
{
int order;
bool fill_up[2] = { false, false };
/* Compare the next available lines of the two files. */
@ -195,25 +286,47 @@ compare_files (char **infiles)
/* Output the line that is lesser. */
if (order == 0)
writeline (thisline[1], stdout, 3);
else if (order > 0)
writeline (thisline[1], stdout, 2);
else
writeline (thisline[0], stdout, 1);
{
seen_unpairable = true;
if (order > 0)
writeline (thisline[1], stdout, 2);
else
writeline (thisline[0], stdout, 1);
}
/* Step the file the line came from.
If the files match, step both files. */
if (order >= 0)
{
thisline[1] = readlinebuffer (thisline[1], streams[1]);
if (ferror (streams[1]))
error (EXIT_FAILURE, errno, "%s", infiles[1]);
}
fill_up[1] = true;
if (order <= 0)
{
thisline[0] = readlinebuffer (thisline[0], streams[0]);
if (ferror (streams[0]))
error (EXIT_FAILURE, errno, "%s", infiles[0]);
}
fill_up[0] = true;
for (i = 0; i < 2; i++)
if (fill_up[i])
{
/* Rotate the buffers for this file. */
alt[i][2] = alt[i][1];
alt[i][1] = alt[i][0];
alt[i][0] = (alt[i][0] + 1) & 0x03;
thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]);
if (thisline[i])
check_order (all_line[i][alt[i][1]], thisline[i], i + 1);
/* If this is the end of the file we may need to re-check
the order of the previous two lines, since we might have
discovered an unpairable match since we checked before. */
else if (all_line[i][alt[i][2]]->buffer)
check_order (all_line[i][alt[i][2]],
all_line[i][alt[i][1]], i + 1);
if (ferror (streams[i]))
error (EXIT_FAILURE, errno, "%s", infiles[i]);
fill_up[i] = false;
}
}
for (i = 0; i < 2; i++)
@ -239,6 +352,10 @@ main (int argc, char **argv)
only_file_2 = true;
both = true;
seen_unpairable = false;
issued_disorder_warning[0] = issued_disorder_warning[1] = false;
check_input_order = CHECK_ORDER_DEFAULT;
while ((c = getopt_long (argc, argv, "123", long_options, NULL)) != -1)
switch (c)
{
@ -254,6 +371,14 @@ main (int argc, char **argv)
both = false;
break;
case NOCHECK_ORDER_OPTION:
check_input_order = CHECK_ORDER_DISABLED;
break;
case CHECK_ORDER_OPTION:
check_input_order = CHECK_ORDER_ENABLED;
break;
case_GETOPT_HELP_CHAR;
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
@ -279,5 +404,8 @@ main (int argc, char **argv)
compare_files (argv + optind);
exit (EXIT_SUCCESS);
if (issued_disorder_warning[0] || issued_disorder_warning[1])
exit (EXIT_FAILURE);
else
exit (EXIT_SUCCESS);
}

View File

@ -146,6 +146,7 @@ TESTS = \
misc/base64 \
misc/basename \
misc/close-stdout \
misc/comm \
misc/csplit \
misc/date-sec \
misc/dircolors \

124
tests/misc/comm Executable file
View File

@ -0,0 +1,124 @@
#!/usr/bin/perl
# -*- perl -*-
# Test comm
# Copyright (C) 2008 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
require 5.003;
use strict;
(my $program_name = $0) =~ s|.*/||;
my $prog = 'comm';
# Turn off localization of executable's ouput.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
my @inputs = ({IN=>{a=>"1\n3"}}, {IN=>{b=>"2\n3"}});
my @Tests =
(
# basic operation
['basic', @inputs, {OUT=>"1\n\t2\n\t\t3\n"} ],
# supress lines unique to file 1
['opt-1', '-1', @inputs, {OUT=>"2\n\t3\n"} ],
# supress lines unique to file 2
['opt-2', '-2', @inputs, {OUT=>"1\n\t3\n"} ],
# supress lines that appear in both files
['opt-3', '-3', @inputs, {OUT=>"1\n\t2\n"} ],
# supress lines unique to file 1 and lines unique to file 2
['opt-12', '-1', '-2', @inputs, {OUT=>"3\n"} ],
# supress lines unique to file 1 and those that appear in both files
['opt-13', '-1', '-3', @inputs, {OUT=>"2\n"} ],
# supress lines unique to file 2 and those that appear in both files
['opt-23', '-2', '-3', @inputs, {OUT=>"1\n"} ],
# supress all output (really?)
['opt-123', '-1', '-2', '-3', @inputs, {OUT=>""} ],
# invalid missing command line argument (1)
['missing-arg1', $inputs[0], {EXIT=>1},
{ERR => "$prog: missing operand after `a'\n"
. "Try `$prog --help' for more information.\n"}],
# invalid missing command line argument (both)
['missing-arg2', {EXIT=>1},
{ERR => "$prog: missing operand\n"
. "Try `$prog --help' for more information.\n"}],
# invalid extra command line argument
['extra-arg', @inputs, 'no-such', {EXIT=>1},
{ERR => "$prog: extra operand `no-such'\n"
. "Try `$prog --help' for more information.\n"}],
# out-of-order input
['ooo', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1},
{OUT => "1\n\t\t3\n\t2\n"},
{ERR => "$prog: file 2 is not in sorted order\n"}],
# out-of-order input, fatal
['ooo2', '--check-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1},
{OUT => "1\n\t\t3\n"},
{ERR => "$prog: file 2 is not in sorted order\n"}],
# out-of-order input, ignored
['ooo3', '--nocheck-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}},
{OUT => "1\n\t\t3\n\t2\n"}],
# both inputs out-of-order
['ooo4', {IN=>{a=>"3\n1\n0"}}, {IN=>{b=>"3\n2\n0"}}, {EXIT=>1},
{OUT => "\t\t3\n1\n0\n\t2\n\t0\n"},
{ERR => "$prog: file 1 is not in sorted order\n".
"$prog: file 2 is not in sorted order\n" }],
# both inputs out-of-order on last pair
['ooo5', {IN=>{a=>"3\n1"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1},
{OUT => "\t\t3\n1\n\t2\n"},
{ERR => "$prog: file 1 is not in sorted order\n".
"$prog: file 2 is not in sorted order\n" }],
# first input out-of-order extended
['ooo5b', {IN=>{a=>"0\n3\n1"}}, {IN=>{b=>"2\n3"}}, {EXIT=>1},
{OUT => "0\n\t2\n\t\t3\n1\n"},
{ERR => "$prog: file 1 is not in sorted order\n"}],
# second input out-of-order extended
['ooo5c', {IN=>{a=>"0\n3"}}, {IN=>{b=>"2\n3\n1"}}, {EXIT=>1},
{OUT => "0\n\t2\n\t\t3\n\t1\n"},
{ERR => "$prog: file 2 is not in sorted order\n"}],
# both inputs out-of-order, but fully pairable
['ooo6', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}}, {EXIT=>0},
{OUT => "\t\t2\n\t\t1\n\t\t0\n"}],
# both inputs out-of-order, fully pairable, but forced to fail
['ooo7', '--check-order', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}},
{EXIT=>1},
{OUT => "\t\t2\n"},
{ERR => "$prog: file 1 is not in sorted order\n"}],
);
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
exit $fail;