mirror of
https://github.com/python/cpython.git
synced 2025-01-09 18:15:02 +08:00
4fe72f9b03
This adds support for bytes literals (b'...') to tokenize.py, and removes support for unicode literals (u'...').
259 lines
8.1 KiB
Python
259 lines
8.1 KiB
Python
"""Tests for the tokenize module.
|
|
|
|
The tests were originally written in the old Python style, where the
|
|
test output was compared to a golden file. This docstring represents
|
|
the first steps towards rewriting the entire test as a doctest.
|
|
|
|
The tests can be really simple. Given a small fragment of source
|
|
code, print out a table with the tokens. The ENDMARKER is omitted for
|
|
brevity.
|
|
|
|
>>> dump_tokens("1 + 1")
|
|
NUMBER '1' (1, 0) (1, 1)
|
|
OP '+' (1, 2) (1, 3)
|
|
NUMBER '1' (1, 4) (1, 5)
|
|
|
|
A comment generates a token here, unlike in the parser module. The
|
|
comment token is followed by an NL or a NEWLINE token, depending on
|
|
whether the line contains the completion of a statement.
|
|
|
|
>>> dump_tokens("if False:\\n"
|
|
... " # NL\\n"
|
|
... " a = False # NEWLINE\\n")
|
|
NAME 'if' (1, 0) (1, 2)
|
|
NAME 'False' (1, 3) (1, 8)
|
|
OP ':' (1, 8) (1, 9)
|
|
NEWLINE '\\n' (1, 9) (1, 10)
|
|
COMMENT '# NL' (2, 4) (2, 8)
|
|
NL '\\n' (2, 8) (2, 9)
|
|
INDENT ' ' (3, 0) (3, 4)
|
|
NAME 'a' (3, 4) (3, 5)
|
|
OP '=' (3, 9) (3, 10)
|
|
NAME 'False' (3, 11) (3, 16)
|
|
COMMENT '# NEWLINE' (3, 17) (3, 26)
|
|
NEWLINE '\\n' (3, 26) (3, 27)
|
|
DEDENT '' (4, 0) (4, 0)
|
|
|
|
' # Emacs hint
|
|
|
|
There will be a bunch more tests of specific source patterns.
|
|
|
|
The tokenize module also defines an untokenize function that should
|
|
regenerate the original program text from the tokens.
|
|
|
|
There are some standard formatting practices that are easy to get right.
|
|
|
|
>>> roundtrip("if x == 1:\\n"
|
|
... " print(x)\\n")
|
|
if x == 1:
|
|
print(x)
|
|
|
|
Some people use different formatting conventions, which makes
|
|
untokenize a little trickier. Note that this test involves trailing
|
|
whitespace after the colon. Note that we use hex escapes to make the
|
|
two trailing blanks apparent in the expected output.
|
|
|
|
>>> roundtrip("if x == 1 : \\n"
|
|
... " print(x)\\n")
|
|
if x == 1 :\x20\x20
|
|
print(x)
|
|
|
|
Comments need to go in the right place.
|
|
|
|
>>> roundtrip("if x == 1:\\n"
|
|
... " # A comment by itself.\\n"
|
|
... " print(x) # Comment here, too.\\n"
|
|
... " # Another comment.\\n"
|
|
... "after_if = True\\n")
|
|
if x == 1:
|
|
# A comment by itself.
|
|
print(x) # Comment here, too.
|
|
# Another comment.
|
|
after_if = True
|
|
|
|
>>> roundtrip("if (x # The comments need to go in the right place\\n"
|
|
... " == 1):\\n"
|
|
... " print('x == 1')\\n")
|
|
if (x # The comments need to go in the right place
|
|
== 1):
|
|
print('x == 1')
|
|
|
|
"""
|
|
|
|
# ' Emacs hint
|
|
|
|
import os, glob, random, time, sys
|
|
import re
|
|
from io import StringIO
|
|
from test.test_support import (verbose, findfile, is_resource_enabled,
|
|
TestFailed)
|
|
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
|
|
ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
|
|
|
|
# How much time in seconds can pass before we print a 'Still working' message.
# test_main() adds this to time.time() to schedule the next progress message
# while it loops over the round-trip files.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60
|
|
|
|
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
|
|
# is tokenized, converted back to source code via tokenize.untokenize(),
|
|
# and tokenized again from the latter. The test fails if the second
|
|
# tokenization doesn't match the first.
|
|
def test_roundtrip(f):
    """Check that untokenize() round-trips the source file at path `f`.

    The file is tokenized, the resulting (type, string) pairs are converted
    back to source text with untokenize(), and that text is tokenized again.

    Raises TestFailed if the second token stream differs from the first.
    """
    # Look for a coding cookie in the first two lines.  The peek is done as
    # latin-1 (presumably because that codec accepts any byte sequence --
    # TODO confirm) since the real encoding is not known yet.
    with open(f, encoding="latin-1") as fobj:
        first2lines = fobj.readline() + fobj.readline()
    m = re.search(r"coding:\s*(\S+)", first2lines)
    encoding = m.group(1) if m else "utf-8"

    # `with` guarantees the file is closed even if tokenizing raises.
    with open(f, encoding=encoding) as fobj:
        fulltok = list(generate_tokens(fobj.readline))

    # Only the (type, string) prefix of each token is compared.
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    # Feed the regenerated text back line by line, keeping line endings.
    readline = iter(newtext.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)
|
|
|
|
def dump_tokens(s):
    """Print one table row per token found in the source string `s`.

    Each row holds the token's type name, the repr of its text, and its
    start and end positions.  Output stops at the ENDMARKER token, which
    is not printed.
    """
    readline = StringIO(s).readline
    for tok_type, tok_text, tok_start, tok_end, _line in generate_tokens(readline):
        if tok_type == ENDMARKER:
            return
        row = {
            "type": tok_name[tok_type],
            "token": tok_text,
            "start": tok_start,
            "end": tok_end,
        }
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % row)
|
|
|
|
def roundtrip(s):
    """Tokenize the source string `s`, untokenize it, and print the result.

    The regenerated text is printed without an extra trailing newline.
    """
    tokens = generate_tokens(StringIO(s).readline)
    print(untokenize(tokens), end="")
|
|
|
|
# This is an example from the docs, set up as a doctest.
|
|
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    Every NUMBER token whose text contains a '.' is replaced by the token
    sequence for Decimal('<text>'); all other tokens pass through as they
    are.  The rewritten token list is handed to untokenize() and the
    resulting source text is returned.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    rewritten = []
    for tok in generate_tokens(StringIO(s).readline):
        kind, text = tok[0], tok[1]
        if kind != NUMBER or '.' not in text:
            # Not a float literal: keep the token unchanged.
            rewritten.append((kind, text))
            continue
        # Float literal: emit the tokens for Decimal('<literal>') instead.
        rewritten += [
            (NAME, 'Decimal'),
            (OP, '('),
            (STRING, repr(text)),
            (OP, ')'),
        ]
    return untokenize(rewritten)
|
|
|
|
def test_main():
    """Run every check in this test module.

    In order, this:
      * compiles tokenize_tests.txt so that syntax errors in it surface;
      * prints the tokenization of tokenize_tests.txt to stdout;
      * runs test_roundtrip() on test_tokenize.py and on the test*.py
        files next to it (all of them when the "compiler" resource is
        enabled, otherwise a random sample of at most 10);
      * checks that badly dedented source raises IndentationError;
      * runs the doctests of this module.

    Raises TestFailed if the IndentationError is not detected; failures
    from test_roundtrip() propagate unchanged.
    """
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # Validate the tokenize_tests.txt file.
    # This makes sure it compiles, and displays any errors in it.
    with open(findfile('tokenize_tests.txt')) as f:
        sf = f.read()
    compile(sf, 'tokenize_tests.txt', 'exec')

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    with open(findfile('tokenize_tests.txt')) as f:
        tokenize(f.readline)

    # Now run test_roundtrip() over test_tokenize.py too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    path = findfile('test_tokenize.py')
    if verbose:
        print(' round trip: ', path, file=sys.__stdout__)
    test_roundtrip(path)

    testdir = os.path.dirname(path) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        # random.sample() raises ValueError when asked for more items than
        # the population holds, so cap the sample size at what was found.
        testfiles = random.sample(testfiles, min(10, len(testfiles)))

    for path in testfiles:
        # Print still working message since this test can be really slow
        if verbose:
            print(' round trip: ', path, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print(' test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(path)

    # Test detection of IndentationError.
    # "baz" dedents to a column that matches no enclosing indentation level.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError:")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')
|
|
|
|
def test_rarrow():
    """
    This function exists solely to test the tokenization of the RARROW
    operator.  It has no body of its own: the example below is a doctest,
    checked when test_main() runs the doctests of this module.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """
|
|
|
|
# Run the full test sequence when this file is executed as a script.
if __name__ == "__main__":
    test_main()
|