cpython/Tools/build/generate_re_casefix.py
Victor Stinner 1863302d61
gh-97669: Create Tools/build/ directory (#97963)
Create Tools/build/ directory. Move the following scripts from
Tools/scripts/ to Tools/build/:

* check_extension_modules.py
* deepfreeze.py
* freeze_modules.py
* generate_global_objects.py
* generate_levenshtein_examples.py
* generate_opcode_h.py
* generate_re_casefix.py
* generate_sre_constants.py
* generate_stdlib_module_names.py
* generate_token.py
* parse_html5_entities.py
* smelly.py
* stable_abi.py
* umarshal.py
* update_file.py
* verify_ensurepip_wheels.py

Update references to these scripts.
2022-10-17 12:01:00 +02:00

97 lines
2.7 KiB
Python
Executable File

#! /usr/bin/env python3
# This script generates Lib/re/_casefix.py.
import collections
import sys
import unicodedata
# Path of this script; it is interpolated into the "Auto-generated by"
# header line of the generated file template below.
SCRIPT_NAME = 'Tools/build/generate_re_casefix.py'
def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that.

    The file is read and written as UTF-8 text.  A file which cannot be
    opened, read or decoded is treated as differing and is (re)written.

    Return True when the file was written, False when it was left untouched.
    """
    try:
        with open(file, 'r', encoding='utf-8') as existing:
            unchanged = existing.read() == content
    except (OSError, ValueError):
        # Missing, unreadable or undecodable file: fall through and write it.
        unchanged = False
    if unchanged:
        return False
    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True
# Template for the generated file.  It is an f-string so that SCRIPT_NAME is
# filled in right here; because of that the braces of the dict literal must
# be doubled to come out as literal "{" and "}".  The "%s" placeholder is
# left in place and is filled later with the table entries via the
# % operator.
re_casefix_template = f"""\
# Auto-generated by {SCRIPT_NAME}.
# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {{
%s
}}
"""
def uname(i):
    """Return the Unicode name of code point *i*.

    When the character has no name, 'U+XXXX' (uppercase hex, at least four
    digits) is returned instead.
    """
    char = chr(i)
    placeholder = r'U+%04X' % i
    return unicodedata.name(char, placeholder)
class hexint(int):
    """An int whose repr() is hexadecimal: '0x' prefix, zero-padded to a
    total width of at least six characters (e.g. 0x0041)."""

    def __repr__(self):
        return format(int(self), '#06x')
def alpha(i):
    """Return a printable form of the character with code point *i*.

    Alphabetic characters are returned as is; anything else is returned as
    its ascii() representation without the surrounding quotes.
    """
    char = chr(i)
    if char.isalpha():
        return char
    # ascii() wraps its result in quotes; drop the first and last character.
    return ascii(char)[1:-1]
def main(outfile='Lib/re/_casefix.py'):
    """Generate the table of extra case mappings and store it in *outfile*.

    The table maps the code of a lowercased character to the codes of the
    other lowercased characters which have the same uppercase.  If any
    affected group contains a code above 0xffff, the offending characters
    are listed on stderr and the script exits with status 1 without
    touching *outfile*.
    """
    # Group all characters by their uppercase form.
    by_upper = collections.defaultdict(str)
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        by_upper[char.upper()] += char

    # For every group of several characters sharing an uppercase, collect
    # the distinct codes of their lowercase forms; keep the sorted codes of
    # the groups which still have more than one of them.
    lower_code_groups = []
    for chars in by_upper.values():
        if len(chars) <= 1:
            continue
        lower_codes = {ord(char.lower()) for char in chars}
        if len(lower_codes) > 1:
            lower_code_groups.append(sorted(lower_codes))

    # Collect the groups which contain a code above 0xffff, together with
    # the uppercase of the first such code when it is a single character.
    unsupported = []
    for group in lower_code_groups:
        wide = next((code for code in group if code > 0xffff), None)
        if wide is None:
            continue
        unsupported.extend(group)
        try:
            unsupported.append(ord(chr(wide).upper()))
        except (ValueError, TypeError):
            # ord() rejects an uppercase form that is not one character.
            pass
    if unsupported:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(unsupported):
            details = (alpha(code), code, uname(code))
            print(" '%s' (U+%04x, %s)" % details, file=sys.stderr)
        sys.exit(1)

    # Map every code to the other codes of its group.
    extra_cases = {}
    for group in lower_code_groups:
        for code in group:
            extra_cases[code] = tuple(
                other for other in group if other != code)

    # Two output lines per entry: a comment with the character names, then
    # the dict item itself followed by the characters in printable form.
    lines = []
    for code, others in sorted(extra_cases.items()):
        other_names = ', '.join(uname(other) for other in others)
        lines.append(' # %s: %s' % (uname(code), other_names))
        hex_others = tuple(hexint(other) for other in others)
        other_chars = ''.join(alpha(other) for other in others)
        lines.append(" %r: %r, # '%s': '%s'" % (
            hexint(code), hex_others, alpha(code), other_chars))

    update_file(outfile, re_casefix_template % '\n'.join(lines))
if __name__ == '__main__':
    # Command-line arguments, if any, are passed to main() positionally,
    # so the first one overrides the default output path.
    cli_args = sys.argv[1:]
    main(*cli_args)