* Beef-up testing of str.__contains__() and str.find().

* Speed-up "x in y" where x has more than one character. The existing code made excessive calls to the expensive memcmp() function. The new code uses memchr() to rapidly find a start point for memcmp(). In addition to knowing that the first character is a match, the new code also checks that the last character is a match. This significantly reduces the incidence of false starts (saving memcmp() calls and making quadratic behavior less likely). Improves the timings on: `python -m timeit -r7 -s"x='a'*1000" "'ab' in x"` and `python -m timeit -r7 -s"x='a'*1000" "'bc' in x"`. Once this code has proven itself, string_find_internal() should refer to it rather than running its own version. Also, something similar may apply to unicode objects.
2024-11-24 18:34:43 +08:00 · 2005-02-20 04:07:08 +00:00 · 2005-02-20 04:07:08 +00:00 · 7cbf1bcb3e
commit 7cbf1bcb3e
parent 54c273c703
2 changed files with 50 additions and 13 deletions
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@ -122,6 +122,30 @@ class CommonTest(unittest.TestCase):
        self.checkraises(TypeError, 'hello', 'find')
        self.checkraises(TypeError, 'hello', 'find', 42)
        # For a variety of combinations,
        #    verify that str.find() matches __contains__
        #    and that the found substring is really at that location
        charset = ['', 'a', 'b', 'c']
        digits = 5
        base = len(charset)
        teststrings = set()
        for i in xrange(base ** digits):
            entry = []
            for j in xrange(digits):
                i, m = divmod(i, base)
                entry.append(charset[m])
            teststrings.add(''.join(entry))
        for i in teststrings:
            i = self.fixtype(i)
            for j in teststrings:
                loc = i.find(j)
                r1 = (loc != -1)
                r2 = j in i
                if r1 != r2:
                    self.assertEqual(r1, r2)
                if loc != -1:
                    self.assertEqual(i[loc:loc+len(j)], j)
    def test_rfind(self):
        self.checkequal(9,  'abcdefghiabc', 'rfind', 'abc')
        self.checkequal(12, 'abcdefghiabc', 'rfind', '')
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@ -1002,8 +1002,12 @@ string_slice(register PyStringObject *a, register int i, register int j)
 static int
 string_contains(PyObject *a, PyObject *el)
 {
-	const char *lhs, *rhs, *end;
+	char *s = PyString_AS_STRING(a);
-	int size;
+	const char *sub = PyString_AS_STRING(el);
 	char *last;
 	int len_sub = PyString_GET_SIZE(el);
 	int shortsub;
 	char firstchar, lastchar;
 	if (!PyString_CheckExact(el)) {
 #ifdef Py_USING_UNICODE
@ -1016,20 +1020,29 @@ string_contains(PyObject *a, PyObject *el)
 			return -1;
 		}
 	}
 	size = PyString_GET_SIZE(el);
 	rhs = PyString_AS_STRING(el);
 	lhs = PyString_AS_STRING(a);
-	/* optimize for a single character */
+	if (len_sub == 0)
-	if (size == 1)
+		return 1;
-		return memchr(lhs, *rhs, PyString_GET_SIZE(a)) != NULL;
+	/* last points to one char beyond the start of the rightmost 
-
+	   substring.  When s<last, there is still room for a possible match
-	end = lhs + (PyString_GET_SIZE(a) - size);
+	   and s[0] through s[len_sub-1] will be in bounds.
-	while (lhs <= end) {
+	   shortsub is len_sub minus the last character which is checked
-		if (memcmp(lhs++, rhs, size) == 0)
+	   separately just before the memcmp().  That check helps prevent
 	   false starts and saves the setup time for memcmp().
 	*/
 	firstchar = sub[0];
 	shortsub = len_sub - 1;
 	lastchar = sub[shortsub];
 	last = s + PyString_GET_SIZE(a) - len_sub + 1;
 	while (s < last) {
 		s = memchr(s, firstchar, last-s);
 		if (s == NULL)
 			return 0;
 		assert(s < last);
 		if (s[shortsub] == lastchar && memcmp(s, sub, shortsub) == 0)
 			return 1;
 		s++;
 	}
 	return 0;
 }