gh-102856: Tokenize performance improvement (#104731)
parent 4b107d86f3
commit 8817886ae5
Lib/tokenize.py
@@ -449,16 +449,6 @@ def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
     token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        # TODO: Marta -> clean this up
-        if 6 < token.type <= 54:
-            token = token._replace(type=OP)
-        if token.type in {ASYNC, AWAIT}:
-            token = token._replace(type=NAME)
-        if token.type == NEWLINE:
-            l_start, c_start = token.start
-            l_end, c_end = token.end
-            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
-
         yield token
     if token is not None:
         last_line, _ = token.start
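The block removed above did per-token clean-up in pure Python: exact operator types were collapsed to the generic OP, ASYNC and AWAIT were demoted to NAME, and NEWLINE tokens were rewritten to a literal '\n' with the end offset bumped by one. Those adjustments now happen inside the C tokenizer (see the C hunk below). A minimal sketch of the observable behaviour being preserved, using only the public tokenize API:

    import io
    import token
    import tokenize

    # Punctuation is reported with the generic OP type; the precise
    # operator type is still available via TokenInfo.exact_type.
    for tok in tokenize.generate_tokens(io.StringIO("x = 1 + 2\n").readline):
        print(token.tok_name[tok.type], token.tok_name[tok.exact_type], repr(tok.string))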
@@ -550,8 +540,7 @@ def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     import _tokenize as c_tokenizer
     for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
-        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
-        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+        yield TokenInfo._make(info)


 if __name__ == "__main__":
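The wrapper no longer unpacks the C iterator's 7-tuple and reassembles the start/end pairs by hand. The C side now builds each token with Py_BuildValue("(iN(nn)(nn)N)"), i.e. a 5-tuple already in TokenInfo field order (type, string, (lineno, col), (end_lineno, end_col), line), so namedtuple's _make can consume it directly. A small sketch of the equivalence:

    from token import NAME
    from tokenize import TokenInfo

    # A tuple laid out the way the C tokenizer now yields tokens.
    info = (NAME, "spam", (1, 0), (1, 4), "spam = 1\n")
    # TokenInfo._make builds the namedtuple without per-field unpacking.
    assert TokenInfo._make(info) == TokenInfo(NAME, "spam", (1, 0), (1, 4), "spam = 1\n")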
Python/Python-tokenize.c
@@ -207,7 +207,22 @@ tokenizeriter_next(tokenizeriterobject *it)
         end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
     }

-    result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+    if (it->tok->tok_extra_tokens) {
+        // Necessary adjustments to match the original Python tokenize
+        // implementation
+        if (type > DEDENT && type < OP) {
+            type = OP;
+        }
+        else if (type == ASYNC || type == AWAIT) {
+            type = NAME;
+        }
+        else if (type == NEWLINE) {
+            str = PyUnicode_FromString("\n");
+            end_col_offset++;
+        }
+    }
+
+    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
     return result;
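The commit title claims a performance improvement; a hypothetical micro-benchmark (not part of this commit, with an arbitrary source string and repetition count) can gauge the effect of dropping the Python-level per-token rewriting:

    import io
    import timeit
    import tokenize

    SOURCE = "x = 1 + 2\n" * 10_000

    def drain():
        # Consume the whole token stream; per-token overhead dominates.
        for _ in tokenize.generate_tokens(io.StringIO(SOURCE).readline):
            pass

    # Compare timings on interpreter builds with and without this commit.
    print(timeit.timeit(drain, number=10))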