gh-102856: Tokenize performance improvement (#104731)
parent 4b107d86f3
commit 8817886ae5
Lib/tokenize.py
@@ -449,16 +449,6 @@ def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
     token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        # TODO: Marta -> clean this up
-        if 6 < token.type <= 54:
-            token = token._replace(type=OP)
-        if token.type in {ASYNC, AWAIT}:
-            token = token._replace(type=NAME)
-        if token.type == NEWLINE:
-            l_start, c_start = token.start
-            l_end, c_end = token.end
-            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
-
         yield token
     if token is not None:
         last_line, _ = token.start
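The block removed above did per-token clean-up in pure Python: exact operator types were collapsed to the generic OP, ASYNC and AWAIT were demoted to NAME, and NEWLINE tokens were rewritten to a literal '\n' with the end offset bumped by one. Those adjustments now happen inside the C tokenizer (see the C hunk below). A minimal sketch of the observable behaviour being preserved, using only the public tokenize API:

    import io
    import token
    import tokenize

    # Punctuation is reported with the generic OP type; the precise
    # operator type is still available via TokenInfo.exact_type.
    for tok in tokenize.generate_tokens(io.StringIO("x = 1 + 2\n").readline):
        print(token.tok_name[tok.type], token.tok_name[tok.exact_type], repr(tok.string))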
@@ -550,8 +540,7 @@ def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     import _tokenize as c_tokenizer
     for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
-        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
-        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+        yield TokenInfo._make(info)


 if __name__ == "__main__":
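The wrapper no longer unpacks the C iterator's 7-tuple and reassembles the start/end pairs by hand. The C side now builds each token with Py_BuildValue("(iN(nn)(nn)N)"), i.e. a 5-tuple already in TokenInfo field order (type, string, (lineno, col), (end_lineno, end_col), line), so namedtuple's _make can consume it directly. A small sketch of the equivalence:

    from token import NAME
    from tokenize import TokenInfo

    # A tuple laid out the way the C tokenizer now yields tokens.
    info = (NAME, "spam", (1, 0), (1, 4), "spam = 1\n")
    # TokenInfo._make builds the namedtuple without per-field unpacking.
    assert TokenInfo._make(info) == TokenInfo(NAME, "spam", (1, 0), (1, 4), "spam = 1\n")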
Python/Python-tokenize.c
@@ -207,7 +207,22 @@ tokenizeriter_next(tokenizeriterobject *it)
         end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
     }

-    result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+    if (it->tok->tok_extra_tokens) {
+        // Necessary adjustments to match the original Python tokenize
+        // implementation
+        if (type > DEDENT && type < OP) {
+            type = OP;
+        }
+        else if (type == ASYNC || type == AWAIT) {
+            type = NAME;
+        }
+        else if (type == NEWLINE) {
+            str = PyUnicode_FromString("\n");
+            end_col_offset++;
+        }
+    }
+
+    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
     return result;
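The commit title claims a performance improvement; a hypothetical micro-benchmark (not part of this commit, with an arbitrary source string and repetition count) can gauge the effect of dropping the Python-level per-token rewriting:

    import io
    import timeit
    import tokenize

    SOURCE = "x = 1 + 2\n" * 10_000

    def drain():
        # Consume the whole token stream; per-token overhead dominates.
        for _ in tokenize.generate_tokens(io.StringIO(SOURCE).readline):
            pass

    # Compare timings on interpreter builds with and without this commit.
    print(timeit.timeit(drain, number=10))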