Improve performance of memmem

This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size or slowing down
the common case, inlining is disabled).

The performance gain is 6.6 times on English text on AArch64 using random
needles with average size 8.

Tested against GLIBC testsuite and randomized tests.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

	* string/memmem.c (__memmem): Rewrite to improve performance.

(cherry picked from commit 680942b016)
This commit is contained in:
Wilco Dijkstra 2019-06-12 11:42:34 +01:00
parent 354f52e984
commit b58bace7ce
2 changed files with 87 additions and 40 deletions

View File

@ -1,5 +1,9 @@
2019-09-13 Wilco Dijkstra <wdijkstr@arm.com>
* string/memmem.c (__memmem): Rewrite to improve performance.
2019-06-12 Wilco Dijkstra <wdijkstr@arm.com>
* string/str-two-way.h (two_way_short_needle): Add inline to avoid
warning.
(two_way_long_needle): Block inlining.

View File

@ -15,17 +15,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* This particular implementation was written by Eric Blake, 2008. */
#ifndef _LIBC
# include <config.h>
#endif
/* Specification of memmem. */
#include <string.h>
#ifndef _LIBC
# define __builtin_expect(expr, val) (expr)
# define __memmem memmem
#endif
@ -36,51 +32,98 @@
#undef memmem
/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
HAYSTACK. */
/* Hash character pairs so a small shift table can be used. All bits of
p[0] are included, but not all bits from p[-1]. So if two equal hashes
match on p[-1], p[0] matches too. Hash collisions are harmless and result
in smaller shifts. */
#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
/* Fast memmem algorithm with guaranteed linear-time performance.
Small needles up to size 2 use a dedicated linear search. Longer needles
up to size 256 use a novel modified Horspool algorithm. It hashes pairs
of characters to quickly skip past mismatches. The main search loop only
exits if the last 2 characters match, avoiding unnecessary calls to memcmp
and allowing for a larger skip if there is no match. A self-adapting
filtering check is used to quickly detect mismatches in long needles.
By limiting the needle length to 256, the shift table can be reduced to 8
bits per entry, lowering preprocessing overhead and minimizing cache effects.
The limit also implies worst-case performance is linear.
Needles larger than 256 characters use the linear-time Two-Way algorithm. */
void *
__memmem (const void *haystack_start, size_t haystack_len,
const void *needle_start, size_t needle_len)
__memmem (const void *haystack, size_t hs_len,
const void *needle, size_t ne_len)
{
/* Abstract memory is considered to be an array of 'unsigned char' values,
not an array of 'char' values. See ISO C 99 section 6.2.6.1. */
const unsigned char *haystack = (const unsigned char *) haystack_start;
const unsigned char *needle = (const unsigned char *) needle_start;
const unsigned char *hs = (const unsigned char *) haystack;
const unsigned char *ne = (const unsigned char *) needle;
if (needle_len == 0)
/* The first occurrence of the empty string is deemed to occur at
the beginning of the string. */
return (void *) haystack;
if (ne_len == 0)
return (void *) hs;
if (ne_len == 1)
return (void *) memchr (hs, ne[0], hs_len);
/* Sanity check, otherwise the loop might search through the whole
memory. */
if (__glibc_unlikely (haystack_len < needle_len))
/* Ensure haystack length is >= needle length. */
if (hs_len < ne_len)
return NULL;
/* Use optimizations in memchr when possible, to reduce the search
size of haystack using a linear algorithm with a smaller
coefficient. However, avoid memchr for long needles, since we
can often achieve sublinear performance. */
if (needle_len < LONG_NEEDLE_THRESHOLD)
const unsigned char *end = hs + hs_len - ne_len;
if (ne_len == 2)
{
haystack = memchr (haystack, *needle, haystack_len);
if (!haystack || __builtin_expect (needle_len == 1, 0))
return (void *) haystack;
haystack_len -= haystack - (const unsigned char *) haystack_start;
if (haystack_len < needle_len)
return NULL;
/* Check whether we have a match. This improves performance since we
avoid the initialization overhead of the two-way algorithm. */
if (memcmp (haystack, needle, needle_len) == 0)
return (void *) haystack;
return two_way_short_needle (haystack, haystack_len, needle, needle_len);
uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
for (hs++; hs <= end && hw != nw; )
hw = hw << 16 | *++hs;
return hw == nw ? (void *)hs - 1 : NULL;
}
else
return two_way_long_needle (haystack, haystack_len, needle, needle_len);
/* Use Two-Way algorithm for very long needles. */
if (__builtin_expect (ne_len > 256, 0))
return two_way_long_needle (hs, hs_len, ne, ne_len);
uint8_t shift[256];
size_t tmp, shift1;
size_t m1 = ne_len - 1;
size_t offset = 0;
memset (shift, 0, sizeof (shift));
for (int i = 1; i < m1; i++)
shift[hash2 (ne + i)] = i;
/* Shift1 is the amount we can skip after matching the hash of the
needle end but not the full needle. */
shift1 = m1 - shift[hash2 (ne + m1)];
shift[hash2 (ne + m1)] = m1;
for ( ; hs <= end; )
{
/* Skip past character pairs not in the needle. */
do
{
hs += m1;
tmp = shift[hash2 (hs)];
}
while (tmp == 0 && hs <= end);
/* If the match is not at the end of the needle, shift to the end
and continue until we match the hash of the needle end. */
hs -= tmp;
if (tmp < m1)
continue;
/* Hash of the last 2 characters matches. If the needle is long,
try to quickly filter out mismatches. */
if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0)
{
if (memcmp (hs, ne, m1) == 0)
return (void *) hs;
/* Adjust filter offset when it doesn't find the mismatch. */
offset = (offset >= 8 ? offset : m1) - 8;
}
/* Skip based on matching the hash of the needle end. */
hs += shift1;
}
return NULL;
}
libc_hidden_def (__memmem)
weak_alias (__memmem, memmem)
libc_hidden_weak (memmem)
#undef LONG_NEEDLE_THRESHOLD