* Beef-up testing of str.__contains__() and str.find().

* Speed-up "x in y" where x has more than one character.

The existing code made excessive calls to the expensive memcmp() function.
The new code uses memchr() to rapidly find a start point for memcmp().
In addition to knowing that the first character is a match, the new code
also checks that the last character is a match.  This significantly reduces
the incidence of false starts (saving memcmp() calls and making quadratic
behavior less likely).

Improves the timings on:
    python -m timeit -r7 -s"x='a'*1000" "'ab' in x"
    python -m timeit -r7 -s"x='a'*1000" "'bc' in x"

Once this code has proven itself, then string_find_internal() should refer
to it rather than running its own version.  Also, something similar may
apply to unicode objects.
This commit is contained in:
Raymond Hettinger 2005-02-20 04:07:08 +00:00
parent 54c273c703
commit 7cbf1bcb3e
2 changed files with 50 additions and 13 deletions

View File

@ -122,6 +122,30 @@ class CommonTest(unittest.TestCase):
self.checkraises(TypeError, 'hello', 'find') self.checkraises(TypeError, 'hello', 'find')
self.checkraises(TypeError, 'hello', 'find', 42) self.checkraises(TypeError, 'hello', 'find', 42)
# For a variety of combinations,
# verify that str.find() matches __contains__
# and that the found substring is really at that location
charset = ['', 'a', 'b', 'c']
digits = 5
base = len(charset)
teststrings = set()
for i in xrange(base ** digits):
entry = []
for j in xrange(digits):
i, m = divmod(i, base)
entry.append(charset[m])
teststrings.add(''.join(entry))
for i in teststrings:
i = self.fixtype(i)
for j in teststrings:
loc = i.find(j)
r1 = (loc != -1)
r2 = j in i
if r1 != r2:
self.assertEqual(r1, r2)
if loc != -1:
self.assertEqual(i[loc:loc+len(j)], j)
def test_rfind(self): def test_rfind(self):
self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc')
self.checkequal(12, 'abcdefghiabc', 'rfind', '') self.checkequal(12, 'abcdefghiabc', 'rfind', '')

View File

@ -1002,8 +1002,12 @@ string_slice(register PyStringObject *a, register int i, register int j)
static int static int
string_contains(PyObject *a, PyObject *el) string_contains(PyObject *a, PyObject *el)
{ {
const char *lhs, *rhs, *end; char *s = PyString_AS_STRING(a);
int size; const char *sub = PyString_AS_STRING(el);
char *last;
int len_sub = PyString_GET_SIZE(el);
int shortsub;
char firstchar, lastchar;
if (!PyString_CheckExact(el)) { if (!PyString_CheckExact(el)) {
#ifdef Py_USING_UNICODE #ifdef Py_USING_UNICODE
@ -1016,20 +1020,29 @@ string_contains(PyObject *a, PyObject *el)
return -1; return -1;
} }
} }
size = PyString_GET_SIZE(el);
rhs = PyString_AS_STRING(el);
lhs = PyString_AS_STRING(a);
/* optimize for a single character */ if (len_sub == 0)
if (size == 1) return 1;
return memchr(lhs, *rhs, PyString_GET_SIZE(a)) != NULL; /* last points to one char beyond the start of the rightmost
substring. When s<last, there is still room for a possible match
end = lhs + (PyString_GET_SIZE(a) - size); and s[0] through s[len_sub-1] will be in bounds.
while (lhs <= end) { shortsub is len_sub minus the last character which is checked
if (memcmp(lhs++, rhs, size) == 0) separately just before the memcmp(). That check helps prevent
false starts and saves the setup time for memcmp().
*/
firstchar = sub[0];
shortsub = len_sub - 1;
lastchar = sub[shortsub];
last = s + PyString_GET_SIZE(a) - len_sub + 1;
while (s < last) {
s = memchr(s, firstchar, last-s);
if (s == NULL)
return 0;
assert(s < last);
if (s[shortsub] == lastchar && memcmp(s, sub, shortsub) == 0)
return 1; return 1;
s++;
} }
return 0; return 0;
} }