mirror of
https://github.com/coreutils/coreutils.git
synced 2024-11-28 04:24:45 +08:00
comm: ensure that input files are sorted
* NEWS: List new behavior. * doc/coreutils.texi (checkOrderOption): New macro for describing `--check-order' and `--nocheck-order', used in both join and comm. * src/comm.c (main): Initialize new options. (usage): Describe new options. (compare_files): Keep an extra pair of buffers for the previous line from each file to check the internal order. (check_order): If an order-check is required, compare and handle the result appropriately. (copylinebuffer): Copy a linebuffer; used for copy before read. * tests/misc/Makefile.am: List new test. * tests/misc/comm: Tests for the comm program, including the new order-checking functionality and attendant command-line options.
This commit is contained in:
parent
5f47278372
commit
98a96822d9
3
NEWS
3
NEWS
@ -9,6 +9,9 @@ GNU coreutils NEWS -*- outline -*-
|
||||
|
||||
** New features
|
||||
|
||||
comm now verifies that the inputs are in sorted order. This check can
|
||||
be turned off with the --nocheck-order option.
|
||||
|
||||
md5sum now accepts the new option, --quiet, to suppress the printing of
|
||||
'OK' messages. sha1sum, sha224sum, sha384sum, and sha512sum accept it, too.
|
||||
|
||||
|
@ -4449,6 +4449,32 @@ status that does not depend on the result of the comparison.
|
||||
Upon normal completion @command{comm} produces an exit code of zero.
|
||||
If there is an error it exits with nonzero status.
|
||||
|
||||
@macro checkOrderOption{cmd}
|
||||
If the @option{--check-order} option is given, unsorted inputs will
|
||||
cause a fatal error message. If the option @option{--nocheck-order}
|
||||
is given, unsorted inputs will never cause an error message. If
|
||||
neither of these options is given, wrongly sorted inputs are diagnosed
|
||||
only if an input file is found to contain unpairable lines. If an
|
||||
input file is diagnosed as being unsorted, the @command{\cmd\} command
|
||||
will exit with a nonzero status (and the output should not be used).
|
||||
|
||||
Forcing @command{\cmd\} to process wrongly sorted input files
|
||||
containing unpairable lines by specifying @option{--nocheck-order} is
|
||||
not guaranteed to produce any particular output. The output will
|
||||
probably not correspond with whatever you hoped it would be.
|
||||
@end macro
|
||||
@checkOrderOption{comm}
|
||||
|
||||
@table @samp
|
||||
|
||||
@item --check-order
|
||||
Fail with an error message if either input file is wrongly ordered.
|
||||
|
||||
@item --nocheck-order
|
||||
Do not check that both input files are in sorted order.
|
||||
|
||||
@end table
|
||||
|
||||
|
||||
@node tsort invocation
|
||||
@section @command{tsort}: Topological sort
|
||||
@ -5290,18 +5316,7 @@ c c1 c2
|
||||
b b1 b2
|
||||
@end example
|
||||
|
||||
If the @option{--check-order} option is given, unsorted inputs will
|
||||
cause a fatal error message. If the option @option{--nocheck-order}
|
||||
is given, unsorted inputs will never cause an error message. If
|
||||
neither of these options is given, wrongly sorted inputs are diagnosed
|
||||
only if an input file is found to contain unpairable lines. If an
|
||||
input file is diagnosed as being unsorted, the @command{join} command
|
||||
will exit with a nonzero status (and the output should not be used).
|
||||
|
||||
Forcing @command{join} to process wrongly sorted input files
|
||||
containing unpairable lines by specifying @option{--nocheck-order} is
|
||||
not guaranteed to produce any particular output. The output will
|
||||
probably not correspond with whatever you hoped it would be.
|
||||
@checkOrderOption{join}
|
||||
|
||||
The defaults are:
|
||||
@itemize
|
||||
|
168
src/comm.c
168
src/comm.c
@ -51,8 +51,31 @@ static bool only_file_2;
|
||||
/* If true, print lines that are found in both files. */
|
||||
static bool both;
|
||||
|
||||
/* If nonzero, we have seen at least one unpairable line. */
|
||||
static bool seen_unpairable;
|
||||
|
||||
/* If nonzero, we have warned about disorder in that file. */
|
||||
static bool issued_disorder_warning[2];
|
||||
|
||||
/* How to check input order: by default, only after an unpairable line has been seen; always if enabled; never if disabled. */
|
||||
static enum
|
||||
{
|
||||
CHECK_ORDER_DEFAULT,
|
||||
CHECK_ORDER_ENABLED,
|
||||
CHECK_ORDER_DISABLED
|
||||
} check_input_order;
|
||||
|
||||
enum
|
||||
{
|
||||
CHECK_ORDER_OPTION = CHAR_MAX + 1,
|
||||
NOCHECK_ORDER_OPTION
|
||||
};
|
||||
|
||||
|
||||
static struct option const long_options[] =
|
||||
{
|
||||
{"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
|
||||
{"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
|
||||
{GETOPT_HELP_OPTION_DECL},
|
||||
{GETOPT_VERSION_OPTION_DECL},
|
||||
{NULL, 0, NULL, 0}
|
||||
@ -86,6 +109,12 @@ and column three contains lines common to both files.\n\
|
||||
-1 suppress lines unique to FILE1\n\
|
||||
-2 suppress lines unique to FILE2\n\
|
||||
-3 suppress lines that appear in both files\n\
|
||||
"), stdout);
|
||||
fputs (_("\
|
||||
\n\
|
||||
--check-order check that the input is correctly sorted, even\n\
|
||||
if all input lines are pairable\n\
|
||||
--nocheck-order do not check that the input is correctly sorted\n\
|
||||
"), stdout);
|
||||
fputs (HELP_OPTION_DESCRIPTION, stdout);
|
||||
fputs (VERSION_OPTION_DESCRIPTION, stdout);
|
||||
@ -132,6 +161,53 @@ writeline (const struct linebuffer *line, FILE *stream, int class)
|
||||
fwrite (line->buffer, sizeof (char), line->length, stream);
|
||||
}
|
||||
|
||||
/* Check that successive input lines PREV and CURRENT from input file
|
||||
WHATFILE are presented in order.
|
||||
|
||||
If the user specified --nocheck-order, the check is not made.
|
||||
If the user specified --check-order, the problem is fatal.
|
||||
Otherwise (the default), the message is simply a warning.
|
||||
|
||||
A message is printed at most once per input file.
|
||||
|
||||
This function was copied (nearly) verbatim from `src/join.c'. */
|
||||
|
||||
static void
|
||||
check_order (const struct linebuffer *prev,
|
||||
const struct linebuffer *current,
|
||||
int whatfile)
|
||||
{
|
||||
|
||||
if (check_input_order != CHECK_ORDER_DISABLED
|
||||
&& ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
|
||||
{
|
||||
if (!issued_disorder_warning[whatfile - 1])
|
||||
{
|
||||
int order;
|
||||
|
||||
if (hard_LC_COLLATE)
|
||||
order = xmemcoll (prev->buffer, prev->length - 1,
|
||||
current->buffer, current->length - 1);
|
||||
else
|
||||
{
|
||||
size_t len = min (prev->length, current->length) - 1;
|
||||
order = memcmp (prev->buffer, current->buffer, len);
|
||||
}
|
||||
|
||||
if (order > 0)
|
||||
{
|
||||
error ((check_input_order == CHECK_ORDER_ENABLED
|
||||
? EXIT_FAILURE : 0),
|
||||
0, _("file %d is not in sorted order"), whatfile);
|
||||
|
||||
/* If we get to here, the message was just a warning, but we
|
||||
want only to issue it once. */
|
||||
issued_disorder_warning[whatfile - 1] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Compare INFILES[0] and INFILES[1].
|
||||
If either is "-", use the standard input for that file.
|
||||
Assume that each input file is sorted;
|
||||
@ -140,28 +216,42 @@ writeline (const struct linebuffer *line, FILE *stream, int class)
|
||||
static void
|
||||
compare_files (char **infiles)
|
||||
{
|
||||
/* For each file, we have one linebuffer in lb1. */
|
||||
struct linebuffer lb1[2];
|
||||
/* For each file, we have four linebuffers in lba. */
|
||||
struct linebuffer lba[2][4];
|
||||
|
||||
/* thisline[i] points to the linebuffer holding the next available line
|
||||
in file i, or is NULL if there are no lines left in that file. */
|
||||
struct linebuffer *thisline[2];
|
||||
|
||||
/* all_line[i][alt[i][0]] also points to the linebuffer holding the
|
||||
current line in file i. We keep two buffers of history around so we
|
||||
can look two lines back when we get to the end of a file. */
|
||||
struct linebuffer *all_line[2][4];
|
||||
|
||||
/* This is used to rotate through the buffers for each input file. */
|
||||
int alt[2][3];
|
||||
|
||||
/* streams[i] holds the input stream for file i. */
|
||||
FILE *streams[2];
|
||||
|
||||
int i;
|
||||
int i, j;
|
||||
|
||||
/* Initialize the storage. */
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
initbuffer (&lb1[i]);
|
||||
thisline[i] = &lb1[i];
|
||||
for (j = 0; j < 4; j++)
|
||||
{
|
||||
initbuffer (&lba[i][j]);
|
||||
all_line[i][j] = &lba[i][j];
|
||||
}
|
||||
alt[i][0] = 0;
|
||||
alt[i][1] = 0;
|
||||
alt[i][2] = 0;
|
||||
streams[i] = (STREQ (infiles[i], "-") ? stdin : fopen (infiles[i], "r"));
|
||||
if (!streams[i])
|
||||
error (EXIT_FAILURE, errno, "%s", infiles[i]);
|
||||
|
||||
thisline[i] = readlinebuffer (thisline[i], streams[i]);
|
||||
thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]);
|
||||
if (ferror (streams[i]))
|
||||
error (EXIT_FAILURE, errno, "%s", infiles[i]);
|
||||
}
|
||||
@ -169,6 +259,7 @@ compare_files (char **infiles)
|
||||
while (thisline[0] || thisline[1])
|
||||
{
|
||||
int order;
|
||||
bool fill_up[2] = { false, false };
|
||||
|
||||
/* Compare the next available lines of the two files. */
|
||||
|
||||
@ -195,25 +286,47 @@ compare_files (char **infiles)
|
||||
/* Output the line that is lesser. */
|
||||
if (order == 0)
|
||||
writeline (thisline[1], stdout, 3);
|
||||
else if (order > 0)
|
||||
writeline (thisline[1], stdout, 2);
|
||||
else
|
||||
writeline (thisline[0], stdout, 1);
|
||||
{
|
||||
seen_unpairable = true;
|
||||
if (order > 0)
|
||||
writeline (thisline[1], stdout, 2);
|
||||
else
|
||||
writeline (thisline[0], stdout, 1);
|
||||
}
|
||||
|
||||
/* Step the file the line came from.
|
||||
If the files match, step both files. */
|
||||
if (order >= 0)
|
||||
{
|
||||
thisline[1] = readlinebuffer (thisline[1], streams[1]);
|
||||
if (ferror (streams[1]))
|
||||
error (EXIT_FAILURE, errno, "%s", infiles[1]);
|
||||
}
|
||||
fill_up[1] = true;
|
||||
if (order <= 0)
|
||||
{
|
||||
thisline[0] = readlinebuffer (thisline[0], streams[0]);
|
||||
if (ferror (streams[0]))
|
||||
error (EXIT_FAILURE, errno, "%s", infiles[0]);
|
||||
}
|
||||
fill_up[0] = true;
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
if (fill_up[i])
|
||||
{
|
||||
/* Rotate the buffers for this file. */
|
||||
alt[i][2] = alt[i][1];
|
||||
alt[i][1] = alt[i][0];
|
||||
alt[i][0] = (alt[i][0] + 1) & 0x03;
|
||||
|
||||
thisline[i] = readlinebuffer (all_line[i][alt[i][0]], streams[i]);
|
||||
|
||||
if (thisline[i])
|
||||
check_order (all_line[i][alt[i][1]], thisline[i], i + 1);
|
||||
|
||||
/* If this is the end of the file we may need to re-check
|
||||
the order of the previous two lines, since we might have
|
||||
discovered an unpairable match since we checked before. */
|
||||
else if (all_line[i][alt[i][2]]->buffer)
|
||||
check_order (all_line[i][alt[i][2]],
|
||||
all_line[i][alt[i][1]], i + 1);
|
||||
|
||||
if (ferror (streams[i]))
|
||||
error (EXIT_FAILURE, errno, "%s", infiles[i]);
|
||||
|
||||
fill_up[i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
@ -239,6 +352,10 @@ main (int argc, char **argv)
|
||||
only_file_2 = true;
|
||||
both = true;
|
||||
|
||||
seen_unpairable = false;
|
||||
issued_disorder_warning[0] = issued_disorder_warning[1] = false;
|
||||
check_input_order = CHECK_ORDER_DEFAULT;
|
||||
|
||||
while ((c = getopt_long (argc, argv, "123", long_options, NULL)) != -1)
|
||||
switch (c)
|
||||
{
|
||||
@ -254,6 +371,14 @@ main (int argc, char **argv)
|
||||
both = false;
|
||||
break;
|
||||
|
||||
case NOCHECK_ORDER_OPTION:
|
||||
check_input_order = CHECK_ORDER_DISABLED;
|
||||
break;
|
||||
|
||||
case CHECK_ORDER_OPTION:
|
||||
check_input_order = CHECK_ORDER_ENABLED;
|
||||
break;
|
||||
|
||||
case_GETOPT_HELP_CHAR;
|
||||
|
||||
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
|
||||
@ -279,5 +404,8 @@ main (int argc, char **argv)
|
||||
|
||||
compare_files (argv + optind);
|
||||
|
||||
exit (EXIT_SUCCESS);
|
||||
if (issued_disorder_warning[0] || issued_disorder_warning[1])
|
||||
exit (EXIT_FAILURE);
|
||||
else
|
||||
exit (EXIT_SUCCESS);
|
||||
}
|
||||
|
@ -146,6 +146,7 @@ TESTS = \
|
||||
misc/base64 \
|
||||
misc/basename \
|
||||
misc/close-stdout \
|
||||
misc/comm \
|
||||
misc/csplit \
|
||||
misc/date-sec \
|
||||
misc/dircolors \
|
||||
|
124
tests/misc/comm
Executable file
124
tests/misc/comm
Executable file
@ -0,0 +1,124 @@
|
||||
#!/usr/bin/perl
|
||||
# -*- perl -*-
|
||||
# Test comm
|
||||
|
||||
# Copyright (C) 2008 Free Software Foundation, Inc.
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
require 5.003;
|
||||
use strict;
|
||||
|
||||
(my $program_name = $0) =~ s|.*/||;
|
||||
|
||||
my $prog = 'comm';
|
||||
|
||||
# Turn off localization of executable's output.
|
||||
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
|
||||
|
||||
my @inputs = ({IN=>{a=>"1\n3"}}, {IN=>{b=>"2\n3"}});
|
||||
|
||||
my @Tests =
|
||||
(
|
||||
# basic operation
|
||||
['basic', @inputs, {OUT=>"1\n\t2\n\t\t3\n"} ],
|
||||
|
||||
# suppress lines unique to file 1
|
||||
['opt-1', '-1', @inputs, {OUT=>"2\n\t3\n"} ],
|
||||
|
||||
# suppress lines unique to file 2
|
||||
['opt-2', '-2', @inputs, {OUT=>"1\n\t3\n"} ],
|
||||
|
||||
# suppress lines that appear in both files
|
||||
['opt-3', '-3', @inputs, {OUT=>"1\n\t2\n"} ],
|
||||
|
||||
# suppress lines unique to file 1 and lines unique to file 2
|
||||
['opt-12', '-1', '-2', @inputs, {OUT=>"3\n"} ],
|
||||
|
||||
# suppress lines unique to file 1 and those that appear in both files
|
||||
['opt-13', '-1', '-3', @inputs, {OUT=>"2\n"} ],
|
||||
|
||||
# suppress lines unique to file 2 and those that appear in both files
|
||||
['opt-23', '-2', '-3', @inputs, {OUT=>"1\n"} ],
|
||||
|
||||
# suppress all output (really?)
|
||||
['opt-123', '-1', '-2', '-3', @inputs, {OUT=>""} ],
|
||||
|
||||
# invalid missing command line argument (1)
|
||||
['missing-arg1', $inputs[0], {EXIT=>1},
|
||||
{ERR => "$prog: missing operand after `a'\n"
|
||||
. "Try `$prog --help' for more information.\n"}],
|
||||
|
||||
# invalid missing command line argument (both)
|
||||
['missing-arg2', {EXIT=>1},
|
||||
{ERR => "$prog: missing operand\n"
|
||||
. "Try `$prog --help' for more information.\n"}],
|
||||
|
||||
# invalid extra command line argument
|
||||
['extra-arg', @inputs, 'no-such', {EXIT=>1},
|
||||
{ERR => "$prog: extra operand `no-such'\n"
|
||||
. "Try `$prog --help' for more information.\n"}],
|
||||
|
||||
# out-of-order input
|
||||
['ooo', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1},
|
||||
{OUT => "1\n\t\t3\n\t2\n"},
|
||||
{ERR => "$prog: file 2 is not in sorted order\n"}],
|
||||
|
||||
# out-of-order input, fatal
|
||||
['ooo2', '--check-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1},
|
||||
{OUT => "1\n\t\t3\n"},
|
||||
{ERR => "$prog: file 2 is not in sorted order\n"}],
|
||||
|
||||
# out-of-order input, ignored
|
||||
['ooo3', '--nocheck-order', {IN=>{a=>"1\n3"}}, {IN=>{b=>"3\n2"}},
|
||||
{OUT => "1\n\t\t3\n\t2\n"}],
|
||||
|
||||
# both inputs out-of-order
|
||||
['ooo4', {IN=>{a=>"3\n1\n0"}}, {IN=>{b=>"3\n2\n0"}}, {EXIT=>1},
|
||||
{OUT => "\t\t3\n1\n0\n\t2\n\t0\n"},
|
||||
{ERR => "$prog: file 1 is not in sorted order\n".
|
||||
"$prog: file 2 is not in sorted order\n" }],
|
||||
|
||||
# both inputs out-of-order on last pair
|
||||
['ooo5', {IN=>{a=>"3\n1"}}, {IN=>{b=>"3\n2"}}, {EXIT=>1},
|
||||
{OUT => "\t\t3\n1\n\t2\n"},
|
||||
{ERR => "$prog: file 1 is not in sorted order\n".
|
||||
"$prog: file 2 is not in sorted order\n" }],
|
||||
|
||||
# first input out-of-order extended
|
||||
['ooo5b', {IN=>{a=>"0\n3\n1"}}, {IN=>{b=>"2\n3"}}, {EXIT=>1},
|
||||
{OUT => "0\n\t2\n\t\t3\n1\n"},
|
||||
{ERR => "$prog: file 1 is not in sorted order\n"}],
|
||||
|
||||
# second input out-of-order extended
|
||||
['ooo5c', {IN=>{a=>"0\n3"}}, {IN=>{b=>"2\n3\n1"}}, {EXIT=>1},
|
||||
{OUT => "0\n\t2\n\t\t3\n\t1\n"},
|
||||
{ERR => "$prog: file 2 is not in sorted order\n"}],
|
||||
|
||||
# both inputs out-of-order, but fully pairable
|
||||
['ooo6', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}}, {EXIT=>0},
|
||||
{OUT => "\t\t2\n\t\t1\n\t\t0\n"}],
|
||||
|
||||
# both inputs out-of-order, fully pairable, but forced to fail
|
||||
['ooo7', '--check-order', {IN=>{a=>"2\n1\n0"}}, {IN=>{b=>"2\n1\n0"}},
|
||||
{EXIT=>1},
|
||||
{OUT => "\t\t2\n"},
|
||||
{ERR => "$prog: file 1 is not in sorted order\n"}],
|
||||
);
|
||||
|
||||
my $save_temps = $ENV{DEBUG};
|
||||
my $verbose = $ENV{VERBOSE};
|
||||
|
||||
my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
|
||||
exit $fail;
|
Loading…
Reference in New Issue
Block a user