cpython/Tools/scripts/generate_token.py
Victor Stinner da5727a120
gh-92651: Remove the Include/token.h header file (#92652)
Remove the token.h header file. There was never any public tokenizer
C API. The token.h header file was only designed to be used by Python
internals.

Move Include/token.h to Include/internal/pycore_token.h. Including
this header file now requires that the Py_BUILD_CORE macro is
defined. It no longer checks for the Py_LIMITED_API macro.

Rename functions:

* PyToken_OneChar() => _PyToken_OneChar()
* PyToken_TwoChars() => _PyToken_TwoChars()
* PyToken_ThreeChars() => _PyToken_ThreeChars()
2022-05-11 23:22:50 +02:00

#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
#   Doc/library/token-list.inc
#   Include/internal/pycore_token.h
#   Parser/token.c
#   Lib/token.py
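
# Typical invocation (a sketch based on main() at the bottom of this file; an
# extra positional argument overrides the default output path of each make_*()
# function):
#
#     python3 Tools/scripts/generate_token.py h   Grammar/Tokens
#     python3 Tools/scripts/generate_token.py c   Grammar/Tokens
#     python3 Tools/scripts/generate_token.py rst Grammar/Tokens
#     python3 Tools/scripts/generate_token.py py  Grammar/Tokens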

NT_OFFSET = 256


def load_tokens(path):
    tok_names = []
    string_to_tok = {}
    ERRORTOKEN = None
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            # strip comments
            i = line.find('#')
            if i >= 0:
                line = line[:i].strip()
            if not line:
                continue
            fields = line.split()
            name = fields[0]
            value = len(tok_names)
            if name == 'ERRORTOKEN':
                ERRORTOKEN = value
            string = fields[1] if len(fields) > 1 else None
            if string:
                string = eval(string)
                string_to_tok[string] = value
            tok_names.append(name)
    return tok_names, ERRORTOKEN, string_to_tok
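
# Illustrative Grammar/Tokens content (assumed here only to explain
# load_tokens() above): a line such as
#     LPAR '('
# appends 'LPAR' to tok_names and maps the literal string '(' to LPAR's index
# in string_to_tok, while a bare name such as NAME only appends to tok_names.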

def update_file(file, content):
    try:
        with open(file, 'r') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        pass
    with open(file, 'w') as fobj:
        fobj.write(content)
    return True
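
# Design note (inferred, not stated in the source): update_file() compares the
# new content with what is already on disk and rewrites the file only when it
# changed, so repeated regeneration leaves up-to-date outputs untouched.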
token_h_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_INTERNAL_TOKEN_H
#define Py_INTERNAL_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif
#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
%s\
#define N_TOKENS %d
#define NT_OFFSET %d
/* Special definitions for cooperation with parser */
#define ISTERMINAL(x)           ((x) < NT_OFFSET)
#define ISNONTERMINAL(x)        ((x) >= NT_OFFSET)
#define ISEOF(x)                ((x) == ENDMARKER)
#define ISWHITESPACE(x)         ((x) == ENDMARKER || \\
                                 (x) == NEWLINE   || \\
                                 (x) == INDENT    || \\
                                 (x) == DEDENT)
// Symbols exported for test_peg_generator
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) _PyToken_OneChar(int);
PyAPI_FUNC(int) _PyToken_TwoChars(int, int);
PyAPI_FUNC(int) _PyToken_ThreeChars(int, int, int);
#ifdef __cplusplus
}
#endif
#endif // !Py_INTERNAL_TOKEN_H
"""

def make_h(infile, outfile='Include/internal/pycore_token.h'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    defines = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        defines.append("#define %-15s %d\n" % (name, value))

    if update_file(outfile, token_h_template % (
            ''.join(defines),
            len(tok_names),
            NT_OFFSET
        )):
        print("%s regenerated from %s" % (outfile, infile))
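
# Illustrative output of make_h() above (exact values depend on the order of
# names in Grammar/Tokens): the generated header begins with lines such as
#     #define ENDMARKER       0
#     #define NAME            1
# and stops at ERRORTOKEN; names listed after it (the ones only needed by the
# pure-Python tokenize module) are excluded by the tok_names[:ERRORTOKEN + 1]
# slice.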
token_c_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "pycore_token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
%s\
};
/* Return the token corresponding to a single character */
int
_PyToken_OneChar(int c1)
{
%s\
    return OP;
}
int
_PyToken_TwoChars(int c1, int c2)
{
%s\
    return OP;
}
int
_PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
    return OP;
}
"""

def generate_chars_to_token(mapping, n=1):
    result = []
    write = result.append
    indent = '    ' * n
    write(indent)
    write('switch (c%d) {\n' % (n,))
    for c in sorted(mapping):
        write(indent)
        value = mapping[c]
        if isinstance(value, dict):
            write("case '%s':\n" % (c,))
            write(generate_chars_to_token(value, n + 1))
            write(indent)
            write('    break;\n')
        else:
            write("case '%s': return %s;\n" % (c, value))
    write(indent)
    write('}\n')
    return ''.join(result)
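
# Illustrative output of generate_chars_to_token() above for a two-character
# mapping such as {'=': {'=': 'EQEQUAL'}} (the real mappings are built from
# Grammar/Tokens in make_c() below):
#
#     switch (c1) {
#     case '=':
#         switch (c2) {
#         case '=': return EQEQUAL;
#         }
#         break;
#     }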

def make_c(infile, outfile='Parser/token.c'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
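    # Presumably kept so the generated switch also accepts the legacy '<>'
    # spelling as NOTEQUAL (the PEP 401 "barry_as_FLUFL" __future__ import):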
    string_to_tok['<>'] = string_to_tok['!=']
    chars_to_token = {}
    for string, value in string_to_tok.items():
        assert 1 <= len(string) <= 3
        name = tok_names[value]
        m = chars_to_token.setdefault(len(string), {})
        for c in string[:-1]:
            m = m.setdefault(c, {})
        m[string[-1]] = name

    names = []
    for value, name in enumerate(tok_names):
        if value >= ERRORTOKEN:
            name = '<%s>' % name
        names.append('    "%s",\n' % name)
    names.append('    "<N_TOKENS>",\n')

    if update_file(outfile, token_c_template % (
            ''.join(names),
            generate_chars_to_token(chars_to_token[1]),
            generate_chars_to_token(chars_to_token[2]),
            generate_chars_to_token(chars_to_token[3])
        )):
        print("%s regenerated from %s" % (outfile, infile))
token_inc_template = """\
.. Auto-generated by Tools/scripts/generate_token.py
%s
.. data:: N_TOKENS
.. data:: NT_OFFSET
"""

def make_rst(infile, outfile='Doc/library/token-list.inc'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    tok_to_string = {value: s for s, value in string_to_tok.items()}

    names = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        names.append('.. data:: %s' % (name,))
        if value in tok_to_string:
            names.append('')
            names.append('   Token value for ``"%s"``.' % tok_to_string[value])
        names.append('')

    if update_file(outfile, token_inc_template % '\n'.join(names)):
        print("%s regenerated from %s" % (outfile, infile))
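
# Illustrative output of make_rst() above for a token that has an exact string
# (assuming Grammar/Tokens maps LPAR to '('):
#
#     .. data:: LPAR
#
#        Token value for ``"("``.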

token_py_template = '''\
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py

__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']

%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d

tok_name = {value: name
            for name, value in globals().items()
            if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())

EXACT_TOKEN_TYPES = {
%s
}

def ISTERMINAL(x):
    return x < NT_OFFSET

def ISNONTERMINAL(x):
    return x >= NT_OFFSET

def ISEOF(x):
    return x == ENDMARKER
'''

def make_py(infile, outfile='Lib/token.py'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    constants = []
    for value, name in enumerate(tok_names):
        constants.append('%s = %d' % (name, value))
    constants.insert(ERRORTOKEN,
        "# These aren't used by the C tokenizer but are needed for tokenize.py")

    token_types = []
    for s, value in sorted(string_to_tok.items()):
        token_types.append('    %r: %s,' % (s, tok_names[value]))

    if update_file(outfile, token_py_template % (
            '\n'.join(constants),
            len(tok_names),
            NT_OFFSET,
            '\n'.join(token_types),
        )):
        print("%s regenerated from %s" % (outfile, infile))
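
# Illustrative entry in the generated Lib/token.py (again assuming Grammar/Tokens
# maps LPAR to '('): EXACT_TOKEN_TYPES will contain a line like
#     '(': LPAR,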

def main(op, infile='Grammar/Tokens', *args):
    make = globals()['make_' + op]
    make(infile, *args)


if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])