PyUnicode_Ready() now sets ascii=1 if maxchar < 128

ascii=1 is no longer reserved for PyASCIIObject. Use PyUnicode_IS_COMPACT_ASCII(obj) to check whether obj is a PyASCIIObject (as before).
2024-11-27 20:04:41 +08:00 · 2011-10-03 13:53:37 +02:00 · 2011-10-03 13:53:37 +02:00 · a3b334da6d
commit a3b334da6d
parent 1b4f9ceca7
3 changed files with 42 additions and 33 deletions
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -224,7 +224,7 @@ typedef struct {
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ready = 1
-         * (ascii = 0)
+         * ascii = 0

       - string created by the legacy API (not ready):

@ -236,7 +236,7 @@ typedef struct {
         * data.any is NULL
         * utf8 is NULL
         * interned = SSTATE_NOT_INTERNED
-         * (ascii = 0)
+         * ascii = 0

       - string created by the legacy API, ready:

@ -246,7 +246,6 @@ typedef struct {
         * compact = 0
         * ready = 1
         * data.any is not NULL
-         * (ascii = 0)

       String created by the legacy API becomes ready when calling
       PyUnicode_READY().
@ -278,8 +277,9 @@ typedef struct {
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
-        /* Compact objects which are ASCII-only also have the state.compact
-           flag set, and use the PyASCIIObject struct. */
+        /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
+           characters. If ascii is 1 and compact is 1, use the PyASCIIObject
+           structure. */
        unsigned int ascii:1;
        /* The ready flag indicates whether the object layout is initialized
           completely. This means that this is either a compact object, or
@ -304,7 +304,7 @@ typedef struct {

 /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
   PyUnicodeObject structure. The actual string data is initially in the wstr
-   block, and copied into the data block using PyUnicode_Ready. */
+   block, and copied into the data block using _PyUnicode_Ready. */
 typedef struct {
    PyCompactUnicodeObject _base;
    union {
@ -327,7 +327,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
 #ifndef Py_LIMITED_API

 #define PyUnicode_WSTR_LENGTH(op) \
-    (((PyASCIIObject*)op)->state.ascii ?    \
+    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)op)->length :                    \
     ((PyCompactUnicodeObject*)op)->wstr_length)

@ -369,10 +369,24 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
 #define SSTATE_INTERNED_MORTAL 1
 #define SSTATE_INTERNED_IMMORTAL 2

-#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)op)->state.ascii)
+/* Return true if the string contains only ASCII characters, or 0 if not. The
+   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
+   or Ready calls are performed. */
+#define PyUnicode_IS_ASCII(op)                 \
+    (((PyASCIIObject*)op)->state.ascii)
+
+/* Return true if the string is compact or 0 if not.
+   No type checks or Ready calls are performed. */
+#define PyUnicode_IS_COMPACT(op) \
+    (((PyASCIIObject*)(op))->state.compact)
+
+/* Return true if the string is a compact ASCII string (use PyASCIIObject
+   structure), or 0 if not.  No type checks or Ready calls are performed. */
+#define PyUnicode_IS_COMPACT_ASCII(op)                 \
+    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))

 /* String contains only wstr byte characters.  This is only possible
-   when the string was created with a legacy API and PyUnicode_Ready()
+   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
 #define PyUnicode_WCHAR_KIND 0

@ -399,11 +413,6 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
 #define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
 #define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))

-/* Return true if the string is compact or 0 if not.
-   No type checks or Ready calls are performed. */
-#define PyUnicode_IS_COMPACT(op) \
-    (((PyASCIIObject*)(op))->state.compact)
-
 /* Return one of the PyUnicode_*_KIND values defined above. */
 #define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
@ -500,9 +509,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;

 #define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)

-/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
+/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
-   PyUnicode_Ready().
+   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
 #define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -288,16 +288,14 @@ _PyUnicode_CheckConsistency(void *op)
    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

-    if (ascii->state.ascii == 1) {
+    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
-        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
-        assert(ascii->state.compact == 1);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    } else {
@ -305,9 +303,9 @@ _PyUnicode_CheckConsistency(void *op)
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
-            assert(!ascii->state.compact == 1);
+            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
-            assert(!ascii->state.ready == 1);
+            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
@ -317,10 +315,9 @@ _PyUnicode_CheckConsistency(void *op)
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
-            assert(!ascii->state.compact == 1);
+            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
-            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
@ -638,7 +635,7 @@ unicode_kind_name(PyObject *unicode)
        switch(PyUnicode_KIND(unicode))
        {
        case PyUnicode_1BYTE_KIND:
-            if (PyUnicode_IS_COMPACT_ASCII(unicode))
+            if (PyUnicode_IS_ASCII(unicode))
                return "legacy ascii";
            else
                return "legacy latin1";
@ -654,14 +651,14 @@ unicode_kind_name(PyObject *unicode)
    switch(PyUnicode_KIND(unicode))
    {
    case PyUnicode_1BYTE_KIND:
-        if (PyUnicode_IS_COMPACT_ASCII(unicode))
+        if (PyUnicode_IS_ASCII(unicode))
            return "ascii";
        else
-            return "compact latin1";
+            return "latin1";
    case PyUnicode_2BYTE_KIND:
-        return "compact UCS2";
+        return "UCS2";
    case PyUnicode_4BYTE_KIND:
-        return "compact UCS4";
+        return "UCS4";
    default:
        return "<invalid compact kind>";
    }
@ -703,7 +700,7 @@ _PyUnicode_Dump(PyObject *op)
    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", ascii->wstr);
-    if (!ascii->state.ascii) {
+    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zu), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
@ -954,9 +951,9 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
        /* check if max_char(from substring) <= max_char(to) */
        if (from_kind > to_kind
                /* latin1 => ascii */
-            || (PyUnicode_IS_COMPACT_ASCII(to)
+            || (PyUnicode_IS_ASCII(to)
                && to_kind == PyUnicode_1BYTE_KIND
-                && !PyUnicode_IS_COMPACT_ASCII(from)))
+                && !PyUnicode_IS_ASCII(from)))
        {
            /* slow path to check for character overflow */
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
@ -1115,10 +1112,12 @@ unicode_ready(PyObject **p_obj, int replace)
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
+            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
+            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
--- a/Tools/gdb/libpython.py
+++ b/Tools/gdb/libpython.py
@ -1132,15 +1132,16 @@ class PyUnicodeObjectPtr(PyObjectPtr):
            compact = self.field('_base')
            ascii = compact['_base']
            state = ascii['state']
+            is_compact_ascii = (int(state['ascii']) and int(state['compact']))
            field_length = long(ascii['length'])
            if not int(state['ready']):
                # string is not ready
                may_have_surrogates = True
                field_str = ascii['wstr']
-                if not int(state['ascii']):
+                if not is_compact_ascii:
                    field_length = compact('wstr_length')
            else:
-                if int(state['ascii']):
+                if is_compact_ascii:
                    field_str = ascii.address + 1
                elif int(state['compact']):
                    field_str = compact.address + 1