join: improve memory management

* src/join.c (struct seq): Use a (struct line **) for `lines' rather than
one long (struct line *).  This allows individual lines to be swapped out
if necessary.
(reset_line): Get a line ready for new input.
(init_linep): Create a new line and assign it to the the pointer passed in.
(spareline[2]): Hold a spare line for each input file.
(free_spareline): Clean up.
(get_line): Take a (struct line **) instead of a (struct line *).  If the
line to be overwritten is the previous line for the current file then swap
it out for the spare.
(join): Accomodate new structure of SEQs and new parameters to get_line;
Don't free stale lines until the end -- they're re-usable now.
(dup_line): Remove function.
* NEWS: Mention the performance improvement.
This commit is contained in:
Bo Borgerson 2008-04-22 16:19:58 -04:00 committed by Jim Meyering
parent 58db1bb942
commit 94cecb5cf6
2 changed files with 90 additions and 80 deletions

2
NEWS
View File

@ -31,6 +31,8 @@ GNU coreutils NEWS -*- outline -*-
HP-UX 11, Tru64, AIX, IRIX 6.5, and Cygwin, "ls -l" now displays the presence
of an ACL on a file via a '+' sign after the mode, and "cp -p" copies ACLs.
join has significantly better performance due to better memory management
od now aligns fields across lines when printing multiple -t
specifiers, and no longer prints fields that resulted entirely from
padding the input out to the least common multiple width.

View File

@ -40,6 +40,12 @@
#define join system_join
#define SWAPLINES(a, b) do { \
struct line *tmp = a; \
a = b; \
b = tmp; \
} while (0);
/* An element of the list identifying which fields to print for each
output line. */
struct outlist
@ -76,11 +82,16 @@ struct seq
{
size_t count; /* Elements used in `lines'. */
size_t alloc; /* Elements allocated in `lines'. */
struct line *lines;
struct line **lines;
};
/* The previous line read from each file. */
static struct line *prevline[2];
static struct line *prevline[2] = {NULL, NULL};
/* This provides an extra line buffer for each file. We need these if we
try to read two consecutive lines into the same buffer, since we don't
want to overwrite the previous buffer before we check order. */
static struct line *spareline[2] = {NULL, NULL};
/* True if the LC_COLLATE locale is hard. */
static bool hard_LC_COLLATE;
@ -257,33 +268,6 @@ xfields (struct line *line)
extract_field (line, ptr, lim - ptr);
}
static struct line *
dup_line (const struct line *old)
{
struct line *newline = xmalloc (sizeof *newline);
size_t i;
/* Duplicate the buffer. */
initbuffer (&newline->buf);
newline->buf.buffer = xmalloc (old->buf.size);
newline->buf.size = old->buf.size;
memcpy (newline->buf.buffer, old->buf.buffer, old->buf.length);
newline->buf.length = old->buf.length;
/* Duplicate the field positions. */
newline->fields = xnmalloc (old->nfields_allocated, sizeof *newline->fields);
newline->nfields = old->nfields;
newline->nfields_allocated = old->nfields_allocated;
for (i = 0; i < old->nfields; i++)
{
newline->fields[i].len = old->fields[i].len;
newline->fields[i].beg = newline->buf.buffer + (old->fields[i].beg
- old->buf.buffer);
}
return newline;
}
static void
freeline (struct line *line)
{
@ -390,49 +374,69 @@ check_order (const struct line *prev,
}
}
static inline void
reset_line (struct line *line)
{
line->nfields = 0;
}
static struct line *
init_linep (struct line **linep)
{
struct line *line = xmalloc (sizeof *line);
memset (line, '\0', sizeof *line);
*linep = line;
return line;
}
/* Read a line from FP into LINE and split it into fields.
Return true if successful. */
static bool
get_line (FILE *fp, struct line *line, int which)
get_line (FILE *fp, struct line **linep, int which)
{
initbuffer (&line->buf);
struct line *line = *linep;
if (line == prevline[which - 1])
{
SWAPLINES (line, spareline[which - 1]);
*linep = line;
}
if (line)
reset_line (line);
else
line = init_linep (linep);
if (! readlinebuffer (&line->buf, fp))
{
if (ferror (fp))
error (EXIT_FAILURE, errno, _("read error"));
free (line->buf.buffer);
line->buf.buffer = NULL;
freeline (line);
return false;
}
line->nfields_allocated = 0;
line->nfields = 0;
line->fields = NULL;
xfields (line);
if (prevline[which - 1])
{
check_order (prevline[which - 1], line, which);
freeline (prevline[which - 1]);
free (prevline[which - 1]);
}
prevline[which - 1] = dup_line (line);
check_order (prevline[which - 1], line, which);
prevline[which - 1] = line;
return true;
}
static void
free_prevline (void)
free_spareline (void)
{
size_t i;
for (i = 0; i < ARRAY_CARDINALITY (prevline); i++)
for (i = 0; i < ARRAY_CARDINALITY (spareline); i++)
{
if (prevline[i])
freeline (prevline[i]);
free (prevline[i]);
prevline[i] = NULL;
if (spareline[i])
{
freeline (spareline[i]);
free (spareline[i]);
}
}
}
@ -450,7 +454,12 @@ static bool
getseq (FILE *fp, struct seq *seq, int whichfile)
{
if (seq->count == seq->alloc)
seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
{
size_t i;
seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
for (i = seq->count; i < seq->alloc; i++)
seq->lines[i] = NULL;
}
if (get_line (fp, &seq->lines[seq->count], whichfile))
{
@ -466,10 +475,8 @@ static bool
advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
{
if (first)
{
freeline (&seq->lines[0]);
seq->count = 0;
}
seq->count = 0;
return getseq (fp, seq, whichfile);
}
@ -477,9 +484,13 @@ static void
delseq (struct seq *seq)
{
size_t i;
for (i = 0; i < seq->count; i++)
if (seq->lines[i].buf.buffer)
freeline (&seq->lines[i]);
for (i = 0; i < seq->alloc; i++)
if (seq->lines[i])
{
if (seq->lines[i]->buf.buffer)
freeline (seq->lines[i]);
free (seq->lines[i]);
}
free (seq->lines);
}
@ -592,10 +603,12 @@ static void
join (FILE *fp1, FILE *fp2)
{
struct seq seq1, seq2;
struct line line;
struct line **linep = xmalloc (sizeof *linep);
int diff;
bool eof1, eof2, checktail;
*linep = NULL;
/* Read the first line of each file. */
initseq (&seq1);
getseq (fp1, &seq1, 1);
@ -605,12 +618,12 @@ join (FILE *fp1, FILE *fp2)
while (seq1.count && seq2.count)
{
size_t i;
diff = keycmp (&seq1.lines[0], &seq2.lines[0],
diff = keycmp (seq1.lines[0], seq2.lines[0],
join_field_1, join_field_2);
if (diff < 0)
{
if (print_unpairables_1)
prjoin (&seq1.lines[0], &uni_blank);
prjoin (seq1.lines[0], &uni_blank);
advance_seq (fp1, &seq1, true, 1);
seen_unpairable = true;
continue;
@ -618,7 +631,7 @@ join (FILE *fp1, FILE *fp2)
if (diff > 0)
{
if (print_unpairables_2)
prjoin (&uni_blank, &seq2.lines[0]);
prjoin (&uni_blank, seq2.lines[0]);
advance_seq (fp2, &seq2, true, 2);
seen_unpairable = true;
continue;
@ -634,7 +647,7 @@ join (FILE *fp1, FILE *fp2)
++seq1.count;
break;
}
while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0],
while (!keycmp (seq1.lines[seq1.count - 1], seq2.lines[0],
join_field_1, join_field_2));
/* Keep reading lines from file2 as long as they continue to
@ -647,7 +660,7 @@ join (FILE *fp1, FILE *fp2)
++seq2.count;
break;
}
while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1],
while (!keycmp (seq1.lines[0], seq2.lines[seq2.count - 1],
join_field_1, join_field_2));
if (print_pairables)
@ -656,25 +669,21 @@ join (FILE *fp1, FILE *fp2)
{
size_t j;
for (j = 0; j < seq2.count - 1; ++j)
prjoin (&seq1.lines[i], &seq2.lines[j]);
prjoin (seq1.lines[i], seq2.lines[j]);
}
}
for (i = 0; i < seq1.count - 1; ++i)
freeline (&seq1.lines[i]);
if (!eof1)
{
seq1.lines[0] = seq1.lines[seq1.count - 1];
SWAPLINES (seq1.lines[0], seq1.lines[seq1.count - 1]);
seq1.count = 1;
}
else
seq1.count = 0;
for (i = 0; i < seq2.count - 1; ++i)
freeline (&seq2.lines[i]);
if (!eof2)
{
seq2.lines[0] = seq2.lines[seq2.count - 1];
SWAPLINES (seq2.lines[0], seq2.lines[seq2.count - 1]);
seq2.count = 1;
}
else
@ -694,14 +703,12 @@ join (FILE *fp1, FILE *fp2)
if ((print_unpairables_1 || checktail) && seq1.count)
{
if (print_unpairables_1)
prjoin (&seq1.lines[0], &uni_blank);
freeline (&seq1.lines[0]);
prjoin (seq1.lines[0], &uni_blank);
seen_unpairable = true;
while (get_line (fp1, &line, 1))
while (get_line (fp1, linep, 1))
{
if (print_unpairables_1)
prjoin (&line, &uni_blank);
freeline (&line);
prjoin (*linep, &uni_blank);
if (issued_disorder_warning[0] && !print_unpairables_1)
break;
}
@ -710,19 +717,20 @@ join (FILE *fp1, FILE *fp2)
if ((print_unpairables_2 || checktail) && seq2.count)
{
if (print_unpairables_2)
prjoin (&uni_blank, &seq2.lines[0]);
freeline (&seq2.lines[0]);
prjoin (&uni_blank, seq2.lines[0]);
seen_unpairable = true;
while (get_line (fp2, &line, 2))
while (get_line (fp2, linep, 2))
{
if (print_unpairables_2)
prjoin (&uni_blank, &line);
freeline (&line);
prjoin (&uni_blank, *linep);
if (issued_disorder_warning[1] && !print_unpairables_2)
break;
}
}
free (*linep);
free (linep);
delseq (&seq1);
delseq (&seq2);
}
@ -938,7 +946,7 @@ main (int argc, char **argv)
hard_LC_COLLATE = hard_locale (LC_COLLATE);
atexit (close_stdout);
atexit (free_prevline);
atexit (free_spareline);
print_pairables = true;
seen_unpairable = false;