awk: fix subst code to handle "start of word" pattern correctly (needs REG_STARTEND)

function                                             old     new   delta
awk_sub                                              637     714     +77

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2023-06-08 10:42:39 +02:00
parent 113685fbcd
commit 2ca39ffd44
2 changed files with 51 additions and 26 deletions

View File

@ -2504,17 +2504,46 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
regex_t sreg, *regex;
/* True only if called to implement gensub(): */
int subexp = (src != dest);
#if defined(REG_STARTEND)
const char *src_string;
size_t src_strlen;
regexec_flags = REG_STARTEND;
#else
regexec_flags = 0;
#endif
resbuf = NULL;
residx = 0;
match_no = 0;
regexec_flags = 0;
regex = as_regex(rn, &sreg);
sp = getvar_s(src ? src : intvar[F0]);
#if defined(REG_STARTEND)
src_string = sp;
src_strlen = strlen(src_string);
#endif
replen = strlen(repl);
while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) {
int so = pmatch[0].rm_so;
int eo = pmatch[0].rm_eo;
for (;;) {
int so, eo;
#if defined(REG_STARTEND)
// REG_STARTEND: "This flag is a BSD extension, not present in POSIX"
size_t start_ofs = sp - src_string;
pmatch[0].rm_so = start_ofs;
pmatch[0].rm_eo = src_strlen;
if (regexec(regex, src_string, 10, pmatch, regexec_flags) != 0)
break;
eo = pmatch[0].rm_eo - start_ofs;
so = pmatch[0].rm_so - start_ofs;
#else
// BUG:
// gsub(/\<b*/,"") on "abc" matches empty string at "a...",
// advances sp one char (see "Empty match" comment later) to "bc"
// ... and erroneously matches "b" even though it is NOT at the word start.
enum { start_ofs = 0 };
if (regexec(regex, sp, 10, pmatch, regexec_flags) != 0)
break;
so = pmatch[0].rm_so;
eo = pmatch[0].rm_eo;
#endif
//bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);
resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);
@ -2543,7 +2572,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
}
n = pmatch[j].rm_eo - pmatch[j].rm_so;
resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
memcpy(resbuf + residx, sp + pmatch[j].rm_so, n);
memcpy(resbuf + residx, sp + pmatch[j].rm_so - start_ofs, n);
residx += n;
} else
resbuf[residx++] = c;
@ -2557,12 +2586,6 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
if (eo == so) {
/* Empty match (e.g. "b*" will match anywhere).
* Advance by one char. */
//BUG (bug 1333):
//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc"
//... and will erroneously match "b" even though it is NOT at the word start.
//we need REG_NOTBOW but it does not exist...
//TODO: if EXTRA_COMPAT=y, use GNU matching and re_search,
//it should be able to do it correctly.
/* Subtle: this is safe only because
* qrealloc allocated at least one extra byte */
resbuf[residx] = *sp;
@ -2571,7 +2594,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in
sp++;
residx++;
}
regexec_flags = REG_NOTBOL;
regexec_flags |= REG_NOTBOL;
}
resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);

View File

@ -557,14 +557,12 @@ testing 'awk gensub backslashes \' \
'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
's=\\
\\|\\
' \
'' ''
' '' ''
testing 'awk gensub backslashes \\' \
'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
's=\\\\
\\|\\
' \
'' ''
' '' ''
# gawk 5.1.1 handles trailing unpaired \ inconsistently.
# If replace string is single \, it is used verbatim,
# but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect.
@ -572,31 +570,35 @@ testing 'awk gensub backslashes \\\' \
'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
's=\\\\\\
\\\\|\\\\
' \
'' ''
' '' ''
testing 'awk gensub backslashes \\\\' \
'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
's=\\\\\\\\
\\\\|\\\\
' \
'' ''
' '' ''
testing 'awk gensub backslashes \&' \
'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
's=\\&
&|&
' \
'' ''
' '' ''
testing 'awk gensub backslashes \0' \
'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
's=\\0
a|a
' \
'' ''
' '' ''
testing 'awk gensub backslashes \\0' \
'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
's=\\\\0
\\0|\\0
' \
' '' ''
# The "b" in "abc" should not match <b* pattern.
# Currently we use REG_STARTEND ("This flag is a BSD extension, not present in POSIX")
# to implement the code to handle this correctly, but if your libc has no REG_STARTEND,
# the alternative code mishandles this case.
testing 'awk gsub erroneous word start match' \
"awk 'BEGIN { a=\"abc\"; gsub(/\<b*/,\"\",a); print a }'" \
'abc\n' \
'' ''
exit $FAILCOUNT