gh-119879: str.find(): Utilize last character gap for two-way periodic needles (#119880)

This commit is contained in:
d.grigonis 2024-06-04 10:44:49 +03:00 committed by GitHub
parent 8d63c8d47b
commit a8f1152b70
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 36 additions and 28 deletions

View File

@ -0,0 +1 @@
String search is now slightly faster in certain cases. It now utilizes the last character gap (good suffix rule) for two-way periodic needles.

View File

@ -256,7 +256,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle,
The local period of the cut is the minimal length of a string w The local period of the cut is the minimal length of a string w
such that (left endswith w or w endswith left) such that (left endswith w or w endswith left)
and (right startswith w or w startswith left). and (right startswith w or w startswith right).
The Critical Factorization Theorem says that this maximal local The Critical Factorization Theorem says that this maximal local
period is the global period of the string. period is the global period of the string.
@ -337,21 +337,20 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
if (p->is_periodic) { if (p->is_periodic) {
assert(p->cut <= len_needle/2); assert(p->cut <= len_needle/2);
assert(p->cut < p->period); assert(p->cut < p->period);
p->gap = 0; // unused
} }
else { else {
// A lower bound on the period // A lower bound on the period
p->period = Py_MAX(p->cut, len_needle - p->cut) + 1; p->period = Py_MAX(p->cut, len_needle - p->cut) + 1;
// The gap between the last character and the previous }
// occurrence of an equivalent character (modulo TABLE_SIZE) // The gap between the last character and the previous
p->gap = len_needle; // occurrence of an equivalent character (modulo TABLE_SIZE)
STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; p->gap = len_needle;
for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK;
STRINGLIB_CHAR x = needle[i] & TABLE_MASK; for (Py_ssize_t i = len_needle - 2; i >= 0; i--) {
if (x == last) { STRINGLIB_CHAR x = needle[i] & TABLE_MASK;
p->gap = len_needle - 1 - i; if (x == last) {
break; p->gap = len_needle - 1 - i;
} break;
} }
} }
// Fill up a compressed Boyer-Moore "Bad Character" table // Fill up a compressed Boyer-Moore "Bad Character" table
@ -383,6 +382,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
const STRINGLIB_CHAR *window; const STRINGLIB_CHAR *window;
LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack);
Py_ssize_t gap = p->gap;
Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap);
if (p->is_periodic) { if (p->is_periodic) {
LOG("Needle is periodic.\n"); LOG("Needle is periodic.\n");
Py_ssize_t memory = 0; Py_ssize_t memory = 0;
@ -408,8 +409,16 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
Py_ssize_t i = Py_MAX(cut, memory); Py_ssize_t i = Py_MAX(cut, memory);
for (; i < len_needle; i++) { for (; i < len_needle; i++) {
if (needle[i] != window[i]) { if (needle[i] != window[i]) {
LOG("Right half does not match.\n"); if (i < gap_jump_end) {
window_last += i - cut + 1; LOG("Early right half mismatch: jump by gap.\n");
assert(gap >= i - cut + 1);
window_last += gap;
}
else {
LOG("Late right half mismatch: jump by n (>gap)\n");
assert(i - cut + 1 > gap);
window_last += i - cut + 1;
}
memory = 0; memory = 0;
goto periodicwindowloop; goto periodicwindowloop;
} }
@ -442,10 +451,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
} }
} }
else { else {
Py_ssize_t gap = p->gap;
period = Py_MAX(gap, period); period = Py_MAX(gap, period);
LOG("Needle is not periodic.\n"); LOG("Needle is not periodic.\n");
Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap);
windowloop: windowloop:
while (window_last < haystack_end) { while (window_last < haystack_end) {
for (;;) { for (;;) {
@ -463,19 +470,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
window = window_last - len_needle + 1; window = window_last - len_needle + 1;
assert((window[len_needle - 1] & TABLE_MASK) == assert((window[len_needle - 1] & TABLE_MASK) ==
(needle[len_needle - 1] & TABLE_MASK)); (needle[len_needle - 1] & TABLE_MASK));
for (Py_ssize_t i = cut; i < gap_jump_end; i++) { Py_ssize_t i = cut;
for (; i < len_needle; i++) {
if (needle[i] != window[i]) { if (needle[i] != window[i]) {
LOG("Early right half mismatch: jump by gap.\n"); if (i < gap_jump_end) {
assert(gap >= i - cut + 1); LOG("Early right half mismatch: jump by gap.\n");
window_last += gap; assert(gap >= i - cut + 1);
goto windowloop; window_last += gap;
} }
} else {
for (Py_ssize_t i = gap_jump_end; i < len_needle; i++) { LOG("Late right half mismatch: jump by n (>gap)\n");
if (needle[i] != window[i]) { assert(i - cut + 1 > gap);
LOG("Late right half mismatch.\n"); window_last += i - cut + 1;
assert(i - cut + 1 > gap); }
window_last += i - cut + 1;
goto windowloop; goto windowloop;
} }
} }