gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (#105070)

2024-12-01 05:45:40 +08:00 · 2023-05-30 22:43:34 +01:00 · 2023-05-30 22:43:34 +01:00 · 9216e69a87
commit 9216e69a87
parent 2ea34cfb3a
7 changed files with 276 additions and 98 deletions
--- a/Lib/inspect.py
+++ b/Lib/inspect.py
@ -2203,7 +2203,7 @@ def _signature_strip_non_python_syntax(signature):
        add(string)
        if (string == ','):
            add(' ')
-    clean_signature = ''.join(text).strip()
+    clean_signature = ''.join(text).strip().replace("\n", "")
    return clean_signature, self_parameter


--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@ -1,6 +1,6 @@
 from test import support
 from test.support import os_helper
-from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
+from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                     open as tokenize_open, Untokenizer, generate_tokens,
                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
@ -51,6 +51,25 @@ class TokenizeTest(TestCase):
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

+    def test_invalid_readline(self):  # readline callables that yield the wrong type or raise
+        def gen():
+            yield "sdfosdg"
+            yield "sdfosdg"
+        with self.assertRaises(TypeError):  # tokenize() is handed str lines here
+            list(tokenize(gen().__next__))
+
+        def gen():
+            yield b"sdfosdg"
+            yield b"sdfosdg"
+        with self.assertRaises(TypeError):  # generate_tokens() is handed bytes lines here
+            list(generate_tokens(gen().__next__))
+
+        def gen():
+            yield "sdfosdg"
+            1/0  # raises from inside the readline callable on its second call
+        with self.assertRaises(ZeroDivisionError):  # the callable's own exception must surface
+            list(generate_tokens(gen().__next__))
+
    def test_implicit_newline(self):
        # Make sure that the tokenizer puts in an implicit NEWLINE
        # when the input lacks a trailing new line.
@ -1154,7 +1173,8 @@ class TestTokenizerAdheresToPep0263(TestCase):

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
-        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+        with open(path, 'rb') as f:  # the file is closed once the round-trip check returns
+            TestRoundtrip.check_roundtrip(self, f)

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@ -1199,7 +1219,8 @@ class Test_Tokenize(TestCase):
                yield b''

        # skip the initial encoding token and the end tokens
-        tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+        tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
+                      extra_tokens=True))[:-2]
        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")
@ -1468,13 +1489,13 @@ class TestTokenize(TestCase):
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

-        def mock__tokenize(readline, encoding):
+        def mock__tokenize(readline, encoding, **kwargs):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                try:
-                    next_line = next(readline)
+                    next_line = readline()
                except StopIteration:
                    return out
                if next_line:
@ -1491,16 +1512,16 @@ class TestTokenize(TestCase):
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
-        orig__tokenize = tokenize_module._tokenize
+        orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer
        tokenize_module.detect_encoding = mock_detect_encoding
-        tokenize_module._tokenize = mock__tokenize
+        tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results)[1:],
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
-            tokenize_module._tokenize = orig__tokenize
+            tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token

        self.assertEqual(encoding_used, encoding)

@ -1827,12 +1848,33 @@ class CTokenizeTest(TestCase):
    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)  # supplies a readline callable over s for the C tokenizer
        with self.subTest(source=s):
            result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
            )
            self.assertEqual(result, expected.rstrip().splitlines())

+    def test_encoding(self):  # bytes input plus an explicit encoding; same tokens for each codec
+        def readline(encoding):
+            yield "1+1".encode(encoding)  # one line only, without a trailing newline
+
+        expected = [  # a NEWLINE token and '\n' in `line` are expected although the input has none
+            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
+            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
+            TokenInfo(type=NEWLINE, string='\n', start=(1, 3), end=(1, 4), line='1+1\n'),
+            TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+        for encoding in ["utf-8", "latin-1", "utf-16"]:
+            with self.subTest(encoding=encoding):
+                tokens = list(_generate_tokens_from_c_tokenizer(
+                    readline(encoding).__next__,  # a fresh generator for every encoding
+                    extra_tokens=True,
+                    encoding=encoding,
+                ))
+                self.assertEqual(tokens, expected)
+
    def test_int(self):

        self.check_tokenize('0xff <= 255', """\
@ -2668,43 +2710,44 @@ async def f():

    def test_invalid_syntax(self):
        def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
+            the_string = StringIO(string)
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))

-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",

-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)

    def test_max_indent(self):
        MAXINDENT = 100
@ -2715,20 +2758,24 @@ async def f():
            return source

        valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = StringIO(valid)
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
        self.assertEqual(tokens[-2].type, DEDENT)
        self.assertEqual(tokens[-1].type, ENDMARKER)
        compile(valid, "<string>", "exec")

        invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = StringIO(invalid)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
        self.assertRaises(
            IndentationError, compile, invalid, "<string>", "exec"
        )

    def test_continuation_lines_indentation(self):
        def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]

        code = dedent("""
            def fib(n):
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@ -34,6 +34,7 @@ import re
 import sys
 from token import *
 from token import EXACT_TOKEN_TYPES
+import _tokenize

 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@ -443,12 +444,7 @@ def tokenize(readline):
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize(rl_gen, encoding)
-
-def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        yield token
+    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)

 def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.
@ -456,16 +452,7 @@ def generate_tokens(readline):
    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
-    def _gen():
-        while True:
-            try:
-                line = readline()
-            except StopIteration:
-                return
-            if not line:
-                return
-            yield line.encode()
-    return _tokenize(_gen(), 'utf-8')
+    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)

 def main():
    import argparse
@ -502,9 +489,9 @@ def main():
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
-            tokens = _tokenize(
+            tokens = _generate_tokens_from_c_tokenizer(
                (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
-            ), "utf-8")
+            ), "utf-8", extra_tokens=True)


        # Output the tokenization
@ -531,10 +518,13 @@ def main():
        perror("unexpected error: %s" % err)
        raise

-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):  # source: readline-style callable
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
-    import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+    if encoding is None:  # omit the keyword entirely instead of passing None through
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+    else:  # with an encoding the C side requires bytes lines from the callable
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+    for info in it:
        yield TokenInfo._make(info)


--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -103,6 +103,7 @@ tok_new(void)
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
+    tok->readline = NULL;
    tok->type_comments = 0;
    tok->async_hacks = 0;
    tok->async_def = 0;
@ -139,8 +140,9 @@ static char *
 error_ret(struct tok_state *tok) /* XXX */
 {
    tok->decoding_erred = 1;
-    if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
+    if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
        PyMem_Free(tok->buf);
+    }
    tok->buf = tok->cur = tok->inp = NULL;
    tok->start = NULL;
    tok->end = NULL;
@ -900,6 +902,33 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
    return tok;
 }

+struct tok_state *
+_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
+                          int exec_input, int preserve_crlf)  /* NOTE(review): neither flag is used in this body */
+{   /* Build a tokenizer state that obtains its input by calling `readline`. */
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {  /* initial line buffer of BUFSIZ bytes */
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;  /* buffer starts out empty */
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;  /* no FILE* source in this mode */
+    if (enc != NULL) {  /* when enc is NULL, tok->encoding is not assigned here */
+        tok->encoding = new_string(enc, strlen(enc), tok);
+        if (!tok->encoding) {
+            _PyTokenizer_Free(tok);
+            return NULL;
+        }
+    }
+    tok->decoding_state = STATE_NORMAL;
+    Py_INCREF(readline);  /* the state holds its own reference; _PyTokenizer_Free() releases it */
+    tok->readline = readline;
+    return tok;
+}
+
 /* Set up tokenizer for UTF-8 string */

 struct tok_state *
@ -969,8 +998,9 @@ _PyTokenizer_Free(struct tok_state *tok)
    }
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->readline);
    Py_XDECREF(tok->filename);
-    if (tok->fp != NULL && tok->buf != NULL) {
+    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }
    if (tok->input) {
@ -1021,6 +1051,71 @@ tok_readline_raw(struct tok_state *tok)
    return 1;
 }

+static int
+tok_readline_string(struct tok_state* tok) {  /* Call tok->readline once and append the line at tok->inp; 1 on success, 0 on error */
+    PyObject* line = NULL;
+    PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
+    if (raw_line == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {  /* StopIteration: cleared, success with nothing appended */
+            PyErr_Clear();
+            return 1;
+        }
+        error_ret(tok);  /* any other exception is left set for the caller */
+        goto error;
+    }
+    if(tok->encoding != NULL) {  /* an encoding was supplied: the callable must return bytes */
+        if (!PyBytes_Check(raw_line)) {
+            PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
+            error_ret(tok);
+            goto error;
+        }
+        line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
+                                tok->encoding, "replace");  /* decoded with the "replace" error handler */
+        Py_CLEAR(raw_line);  /* bytes object no longer needed once decoding was attempted */
+        if (line == NULL) {
+            error_ret(tok);
+            goto error;
+        }
+    } else {  /* no encoding: the callable must return str */
+        if(!PyUnicode_Check(raw_line)) {
+            PyErr_Format(PyExc_TypeError, "readline() returned a non-string object");
+            error_ret(tok);
+            goto error;
+        }
+        line = raw_line;  /* ownership moves from raw_line to line */
+        raw_line = NULL;
+    }
+    Py_ssize_t buflen;
+    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);  /* buf is owned by `line`; do not free it */
+    if (buf == NULL) {
+        error_ret(tok);
+        goto error;
+    }
+
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
+        goto error;
+    }
+    memcpy(tok->inp, buf, buflen);  /* append after any data already in the buffer */
+    tok->inp += buflen;
+    *tok->inp = '\0';
+
+    if (tok->start == NULL) {
+        tok->buf = tok->cur;  /* NOTE(review): tok->buf is also what _PyTokenizer_Free() frees; confirm cur == buf here */
+    }
+    tok->line_start = tok->cur;
+
+    Py_DECREF(line);  /* buf must not be used past this point */
+    return 1;
+error:
+    Py_XDECREF(raw_line);  /* either pointer may be NULL on this path */
+    Py_XDECREF(line);
+    return 0;
+}
+
 static int
 tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
@ -1195,6 +1290,38 @@ tok_underflow_file(struct tok_state *tok) {
    return tok->done == E_OK;
 }

+static int
+tok_underflow_readline(struct tok_state* tok) {  /* Refill the buffer with the next line from tok->readline; nonzero on success */
+    assert(tok->decoding_state == STATE_NORMAL);
+    assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL);
+    if (tok->start == NULL && !INSIDE_FSTRING(tok)) {  /* rewind cur/inp to the start of the buffer */
+        tok->cur = tok->inp = tok->buf;
+    }
+    if (!tok_readline_string(tok)) {
+        return 0;
+    }
+    if (tok->inp == tok->cur) {  /* nothing was appended: report end of input */
+        tok->done = E_EOF;
+        return 0;
+    }
+    if (tok->inp[-1] != '\n') {
+        assert(tok->inp + 1 < tok->end);  /* presumably covered by the buflen + 2 reservation in tok_readline_string */
+        /* Last line does not end in \n, fake one */
+        *tok->inp++ = '\n';
+        *tok->inp = '\0';
+    }
+
+    ADVANCE_LINENO();
+    /* The default encoding is UTF-8, so make sure we don't have any
+       non-UTF-8 sequences in it. */
+    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {  /* check is skipped when an encoding was supplied */
+        error_ret(tok);
+        return 0;
+    }
+    assert(tok->done == E_OK);
+    return tok->done == E_OK;
+}
+
 #if defined(Py_DEBUG)
 static void
 print_escape(FILE *f, const char *s, Py_ssize_t size)
@ -1238,7 +1365,10 @@ tok_nextc(struct tok_state *tok)
        if (tok->done != E_OK) {
            return EOF;
        }
-        if (tok->fp == NULL) {
+        if (tok->readline) {
+            rc = tok_underflow_readline(tok);
+        }
+        else if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@ -109,6 +109,7 @@ struct tok_state {
                                     expression (cf. issue 16806) */
    PyObject *decoding_readline; /* open(...).readline */
    PyObject *decoding_buffer;
+    PyObject *readline;     /* readline() function */
    const char* enc;        /* Encoding for the current str. */
    char* str;          /* Source string being tokenized (if tokenizing from a string)*/
    char* input;       /* Tokenizer's newline translated copy of the string. */
@ -137,6 +138,7 @@ struct tok_state {

 extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                              const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@ -37,15 +37,17 @@ typedef struct
@classmethod
 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new

-    source: str
+    readline: object
+    /
    *
    extra_tokens: bool
+    encoding: str(c_default="NULL") = 'utf-8'
 [clinic start generated code]*/

 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
-                       int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+                       int extra_tokens, const char *encoding)
+/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
 {
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
    if (self == NULL) {
@ -55,7 +57,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
    if (filename == NULL) {
        return NULL;
    }
-    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
+    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
    if (self->tok == NULL) {
        Py_DECREF(filename);
        return NULL;
--- a/Python/clinic/Python-tokenize.c.h
+++ b/Python/clinic/Python-tokenize.c.h
@ -9,8 +9,8 @@ preserve


 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
-                       int extra_tokens);
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+                       int extra_tokens, const char *encoding);

 static PyObject *
 tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
@ -25,7 +25,7 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
        PyObject *ob_item[NUM_KEYWORDS];
    } _kwtuple = {
        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
-        .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), },
+        .ob_item = { &_Py_ID(extra_tokens), &_Py_ID(encoding), },
    };
    #undef NUM_KEYWORDS
    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@ -34,43 +34,50 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
    #  define KWTUPLE NULL
    #endif  // !Py_BUILD_CORE

-    static const char * const _keywords[] = {"source", "extra_tokens", NULL};
+    static const char * const _keywords[] = {"", "extra_tokens", "encoding", NULL};
    static _PyArg_Parser _parser = {
        .keywords = _keywords,
        .fname = "tokenizeriter",
        .kwtuple = KWTUPLE,
    };
    #undef KWTUPLE
-    PyObject *argsbuf[2];
+    PyObject *argsbuf[3];
    PyObject * const *fastargs;
    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
-    const char *source;
+    Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 2;
+    PyObject *readline;
    int extra_tokens;
+    const char *encoding = NULL;

    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf);
    if (!fastargs) {
        goto exit;
    }
-    if (!PyUnicode_Check(fastargs[0])) {
-        _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
-        goto exit;
-    }
-    Py_ssize_t source_length;
-    source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
-    if (source == NULL) {
-        goto exit;
-    }
-    if (strlen(source) != (size_t)source_length) {
-        PyErr_SetString(PyExc_ValueError, "embedded null character");
-        goto exit;
-    }
+    readline = fastargs[0];
    extra_tokens = PyObject_IsTrue(fastargs[1]);
    if (extra_tokens < 0) {
        goto exit;
    }
-    return_value = tokenizeriter_new_impl(type, source, extra_tokens);
+    if (!noptargs) {
+        goto skip_optional_kwonly;
+    }
+    if (!PyUnicode_Check(fastargs[2])) {
+        _PyArg_BadArgument("tokenizeriter", "argument 'encoding'", "str", fastargs[2]);
+        goto exit;
+    }
+    Py_ssize_t encoding_length;
+    encoding = PyUnicode_AsUTF8AndSize(fastargs[2], &encoding_length);
+    if (encoding == NULL) {
+        goto exit;
+    }
+    if (strlen(encoding) != (size_t)encoding_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+skip_optional_kwonly:
+    return_value = tokenizeriter_new_impl(type, readline, extra_tokens, encoding);

 exit:
    return return_value;
 }
-/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=48be65a2808bdfa6 input=a9049054013a1b77]*/