Get rid of the superstitious "~" in dict hashing's "i = (~hash) & mask".
The comment following used to say:
/* We use ~hash instead of hash, as degenerate hash functions, such
as for ints <sigh>, can have lots of leading zeros. It's not
really a performance risk, but better safe than sorry.
12-Dec-00 tim: so ~hash produces lots of leading ones instead --
what's the gain? */
That is, there was never a good reason for doing it. And to the contrary,
as explained on Python-Dev last December, it tended to make the *sum*
(i + incr) & mask (which is the first table index examined in case of
collision) the same "too often" across distinct hashes.
Changing to the simpler "i = hash & mask" reduced the number of string-dict
collisions (== number of times we go around the lookup for-loop) from about
6 million to 5 million during a full run of the test suite (these are
approximate because the test suite does some random stuff from run to run).
The number of collisions in non-string dicts also decreased, but not as
dramatically.
Note that this may, for a given dict, change the order (wrt previous
releases) of entries exposed by .keys(), .values() and .items(). A number
of std tests suffered bogus failures as a result. For dicts keyed by
small ints, or (less so) by characters, the order is much more likely to be
in increasing order of key now; e.g.,
>>> d = {}
>>> for i in range(10):
... d[i] = i
...
>>> d
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}
>>>
Unfortunately, people may latch on to that in small examples and draw a
bogus conclusion.
test_support.py
Moved test_extcall's sortdict() into test_support, made it stronger,
and imported sortdict into other std tests that needed it.
test_unicode.py
Excluded cp875 from the "roundtrip over range(128)" test, because
cp875 doesn't have a well-defined inverse for unicode("?", "cp875").
See Python-Dev for excruciating details.
Cookie.py
Changed various output functions to sort dicts before building
strings from them.
test_extcall
Fiddled the expected-result file. This remains sensitive to native
dict ordering, because, e.g., if there are multiple errors in a
keyword-arg dict (and test_extcall sets up many cases like that), the
specific error Python complains about first depends on native dict
ordering.
2001-05-13 08:19:31 +08:00
|
|
|
from test_support import verbose, sortdict
|
2000-12-24 06:08:27 +08:00
|
|
|
import warnings
|
|
|
|
warnings.filterwarnings("ignore", "the regex module is deprecated",
|
2001-01-17 11:12:01 +08:00
|
|
|
DeprecationWarning, __name__)
|
1996-12-21 06:00:21 +08:00
|
|
|
import regex
|
|
|
|
from regex_syntax import *
|
|
|
|
|
|
|
|
re = 'a+b+c+'
|
|
|
|
print 'no match:', regex.match(re, 'hello aaaabcccc world')
|
|
|
|
print 'successful search:', regex.search(re, 'hello aaaabcccc world')
|
|
|
|
try:
|
|
|
|
cre = regex.compile('\(' + re)
|
|
|
|
except regex.error:
|
|
|
|
print 'caught expected exception'
|
|
|
|
else:
|
|
|
|
print 'expected regex.error not raised'
|
|
|
|
|
|
|
|
print 'failed awk syntax:', regex.search('(a+)|(b+)', 'cdb')
|
|
|
|
prev = regex.set_syntax(RE_SYNTAX_AWK)
|
|
|
|
print 'successful awk syntax:', regex.search('(a+)|(b+)', 'cdb')
|
|
|
|
regex.set_syntax(prev)
|
|
|
|
print 'failed awk syntax:', regex.search('(a+)|(b+)', 'cdb')
|
|
|
|
|
|
|
|
re = '\(<one>[0-9]+\) *\(<two>[0-9]+\)'
|
|
|
|
print 'matching with group names and compile()'
|
|
|
|
cre = regex.compile(re)
|
|
|
|
print cre.match('801 999')
|
|
|
|
try:
|
|
|
|
print cre.group('one')
|
|
|
|
except regex.error:
|
|
|
|
print 'caught expected exception'
|
|
|
|
else:
|
|
|
|
print 'expected regex.error not raised'
|
|
|
|
|
|
|
|
print 'matching with group names and symcomp()'
|
|
|
|
cre = regex.symcomp(re)
|
|
|
|
print cre.match('801 999')
|
|
|
|
print cre.group(0)
|
|
|
|
print cre.group('one')
|
|
|
|
print cre.group(1, 2)
|
|
|
|
print cre.group('one', 'two')
|
|
|
|
print 'realpat:', cre.realpat
|
Get rid of the superstitious "~" in dict hashing's "i = (~hash) & mask".
The comment following used to say:
/* We use ~hash instead of hash, as degenerate hash functions, such
as for ints <sigh>, can have lots of leading zeros. It's not
really a performance risk, but better safe than sorry.
12-Dec-00 tim: so ~hash produces lots of leading ones instead --
what's the gain? */
That is, there was never a good reason for doing it. And to the contrary,
as explained on Python-Dev last December, it tended to make the *sum*
(i + incr) & mask (which is the first table index examined in case of
collision) the same "too often" across distinct hashes.
Changing to the simpler "i = hash & mask" reduced the number of string-dict
collisions (== number of times we go around the lookup for-loop) from about
6 million to 5 million during a full run of the test suite (these are
approximate because the test suite does some random stuff from run to run).
The number of collisions in non-string dicts also decreased, but not as
dramatically.
Note that this may, for a given dict, change the order (wrt previous
releases) of entries exposed by .keys(), .values() and .items(). A number
of std tests suffered bogus failures as a result. For dicts keyed by
small ints, or (less so) by characters, the order is much more likely to be
in increasing order of key now; e.g.,
>>> d = {}
>>> for i in range(10):
... d[i] = i
...
>>> d
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}
>>>
Unfortunately, people may latch on to that in small examples and draw a
bogus conclusion.
test_support.py
Moved test_extcall's sortdict() into test_support, made it stronger,
and imported sortdict into other std tests that needed it.
test_unicode.py
Excluded cp875 from the "roundtrip over range(128)" test, because
cp875 doesn't have a well-defined inverse for unicode("?", "cp875").
See Python-Dev for excruciating details.
Cookie.py
Changed various output functions to sort dicts before building
strings from them.
test_extcall
Fiddled the expected-result file. This remains sensitive to native
dict ordering, because, e.g., if there are multiple errors in a
keyword-arg dict (and test_extcall sets up many cases like that), the
specific error Python complains about first depends on native dict
ordering.
2001-05-13 08:19:31 +08:00
|
|
|
print 'groupindex:', sortdict(cre.groupindex)
|
1996-12-21 06:00:21 +08:00
|
|
|
|
|
|
|
re = 'world'
|
|
|
|
cre = regex.compile(re)
|
|
|
|
print 'not case folded search:', cre.search('HELLO WORLD')
|
|
|
|
cre = regex.compile(re, regex.casefold)
|
|
|
|
print 'case folded search:', cre.search('HELLO WORLD')
|
|
|
|
|
|
|
|
print '__members__:', cre.__members__
|
|
|
|
print 'regs:', cre.regs
|
|
|
|
print 'last:', cre.last
|
1997-05-16 21:51:48 +08:00
|
|
|
print 'translate:', len(cre.translate)
|
1996-12-21 06:00:21 +08:00
|
|
|
print 'givenpat:', cre.givenpat
|
|
|
|
|
|
|
|
print 'match with pos:', cre.match('hello world', 7)
|
|
|
|
print 'search with pos:', cre.search('hello world there world', 7)
|
|
|
|
print 'bogus group:', cre.group(0, 1, 3)
|
|
|
|
try:
|
|
|
|
print 'no name:', cre.group('one')
|
|
|
|
except regex.error:
|
|
|
|
print 'caught expected exception'
|
|
|
|
else:
|
|
|
|
print 'expected regex.error not raised'
|
1997-06-04 02:07:49 +08:00
|
|
|
|
|
|
|
from regex_tests import *
|
|
|
|
if verbose: print 'Running regex_tests test suite'
|
|
|
|
|
|
|
|
for t in tests:
|
|
|
|
pattern=s=outcome=repl=expected=None
|
|
|
|
if len(t)==5:
|
1998-03-27 03:42:58 +08:00
|
|
|
pattern, s, outcome, repl, expected = t
|
1997-06-04 02:07:49 +08:00
|
|
|
elif len(t)==3:
|
2000-10-24 01:22:08 +08:00
|
|
|
pattern, s, outcome = t
|
1997-06-04 02:07:49 +08:00
|
|
|
else:
|
1998-03-27 03:42:58 +08:00
|
|
|
raise ValueError, ('Test tuples should have 3 or 5 fields',t)
|
1997-06-04 02:07:49 +08:00
|
|
|
|
|
|
|
try:
|
1998-03-27 03:42:58 +08:00
|
|
|
obj=regex.compile(pattern)
|
1997-06-04 02:07:49 +08:00
|
|
|
except regex.error:
|
1998-03-27 03:42:58 +08:00
|
|
|
if outcome==SYNTAX_ERROR: pass # Expected a syntax error
|
2000-10-24 01:22:08 +08:00
|
|
|
else:
|
|
|
|
# Regex syntax errors aren't yet reported, so for
|
1998-03-27 03:42:58 +08:00
|
|
|
# the official test suite they'll be quietly ignored.
|
|
|
|
pass
|
|
|
|
#print '=== Syntax error:', t
|
1997-06-04 02:07:49 +08:00
|
|
|
else:
|
1998-03-27 03:42:58 +08:00
|
|
|
try:
|
|
|
|
result=obj.search(s)
|
|
|
|
except regex.error, msg:
|
|
|
|
print '=== Unexpected exception', t, repr(msg)
|
|
|
|
if outcome==SYNTAX_ERROR:
|
|
|
|
# This should have been a syntax error; forget it.
|
|
|
|
pass
|
|
|
|
elif outcome==FAIL:
|
|
|
|
if result==-1: pass # No match, as expected
|
|
|
|
else: print '=== Succeeded incorrectly', t
|
|
|
|
elif outcome==SUCCEED:
|
|
|
|
if result!=-1:
|
|
|
|
# Matched, as expected, so now we compute the
|
|
|
|
# result string and compare it to our expected result.
|
|
|
|
start, end = obj.regs[0]
|
|
|
|
found=s[start:end]
|
|
|
|
groups=obj.group(1,2,3,4,5,6,7,8,9,10)
|
|
|
|
vardict=vars()
|
|
|
|
for i in range(len(groups)):
|
|
|
|
vardict['g'+str(i+1)]=str(groups[i])
|
|
|
|
repl=eval(repl)
|
|
|
|
if repl!=expected:
|
|
|
|
print '=== grouping error', t, repr(repl)+' should be '+repr(expected)
|
|
|
|
else:
|
|
|
|
print '=== Failed incorrectly', t
|