gh-126727: Fix locale.nl_langinfo(locale.ERA) (GH-126730)

It now returns multiple era description segments separated by semicolons. Previously it only returned the first segment on platforms with Glibc.
2024-11-23 01:45:25 +08:00 · 2024-11-21 13:16:08 +02:00 · 2024-11-21 13:16:08 +02:00 · 4803cd0244
commit 4803cd0244
parent eaf2171082
4 changed files with 95 additions and 28 deletions
--- a/Doc/library/locale.rst
+++ b/Doc/library/locale.rst
@ -281,7 +281,8 @@ The :mod:`locale` module defines the following exception and functions:

   .. data:: ERA

-      Get a string that represents the era used in the current locale.
+      Get a string which describes how years are counted and displayed for
+      each era in a locale.

      Most locales do not define this value.  An example of a locale which does
      define this value is the Japanese one.  In Japan, the traditional
@ -290,9 +291,10 @@ The :mod:`locale` module defines the following exception and functions:

      Normally it should not be necessary to use this value directly. Specifying
      the ``E`` modifier in their format strings causes the :func:`time.strftime`
-      function to use this information.  The format of the returned string is not
-      specified, and therefore you should not assume knowledge of it on different
-      systems.
+      function to use this information.
+      The format of the returned string is specified in *The Open Group Base
+      Specifications Issue 8*, section `7.3.5.2 LC_TIME C-Language Access
+      <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap07.html#tag_07_03_05_02>`_.

   .. data:: ERA_D_T_FMT

--- a/Lib/test/test__locale.py
+++ b/Lib/test/test__locale.py
@ -90,6 +90,14 @@ known_alt_digits = {
    'bn_IN': (100, {0: '\u09e6', 10: '\u09e7\u09e6', 99: '\u09ef\u09ef'}),
 }

+# Expected nl_langinfo(ERA) data, keyed by locale name without the encoding:
+# (minimum number of era segments, one segment that must be in the result).
+# A count of 0 means the locale is expected to define no era at all.
+known_era = {
+    'C': (0, ''),
+    'en_US': (0, ''),
+    'ja_JP': (11, '+:1:2019/05/01:2019/12/31:令和:%EC元年'),
+    'zh_TW': (3, '+:1:1912/01/01:1912/12/31:民國:%EC元年'),
+    'th_TH': (1, '+:1:-543/01/01:+*:พ.ศ.:%EC %Ey'),
+}
+
 if sys.platform == 'win32':
    # ps_AF doesn't work on Windows: see bpo-38324 (msg361830)
    del known_numerics['ps_AF']
@ -230,6 +238,43 @@ class _LocaleTests(unittest.TestCase):
        if not tested:
            self.skipTest('no suitable locales')

+    @unittest.skipUnless(nl_langinfo, "nl_langinfo is not available")
+    @unittest.skipUnless(hasattr(locale, 'ERA'), "requires locale.ERA")
+    @unittest.skipIf(
+        support.is_emscripten or support.is_wasi,
+        "musl libc issue on Emscripten, bpo-46390"
+    )
+    def test_era_nl_langinfo(self):
+        # Test nl_langinfo(ERA): the result must be a str and, when it is
+        # not empty, every semicolon-separated era segment must contain
+        # exactly five colons.
+        tested = False
+        for loc in candidate_locales:
+            with self.subTest(locale=loc):
+                try:
+                    setlocale(LC_TIME, loc)
+                except Error:
+                    # skipTest() raises, so nothing below runs for this
+                    # locale and the loop moves on to the next one.
+                    self.skipTest(f'no locale {loc!r}')
+
+                era = nl_langinfo(locale.ERA)
+                self.assertIsInstance(era, str)
+                if era:
+                    self.assertEqual(era.count(':'), (era.count(';') + 1) * 5, era)
+
+                # known_era is keyed by the locale name without the encoding.
+                loc1 = loc.split('.', 1)[0]
+                if loc1 in known_era:
+                    count, sample = known_era[loc1]
+                    if count:
+                        if not era:
+                            self.skipTest(f'ERA is not set for locale {loc!r} on this platform')
+                        self.assertGreaterEqual(era.count(';') + 1, count)
+                        self.assertIn(sample, era)
+                    else:
+                        self.assertEqual(era, '')
+                # Only reached when this locale was neither skipped nor failed.
+                tested = True
+        if not tested:
+            self.skipTest('no suitable locales')
+
    def test_float_parsing(self):
        # Bug #1391872: Test whether float parsing is okay on European
        # locales.
--- a/Misc/NEWS.d/next/Library/2024-11-12-13-14-47.gh-issue-126727.5Eqfqd.rst
+++ b/Misc/NEWS.d/next/Library/2024-11-12-13-14-47.gh-issue-126727.5Eqfqd.rst
@ -0,0 +1,3 @@
+``locale.nl_langinfo(locale.ERA)`` now returns multiple era description
+segments separated by semicolons. Previously it only returned the first
+segment on platforms with Glibc.
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@ -636,6 +636,37 @@ restore_locale(char *oldloc)
    }
 }

+#ifdef __GLIBC__
+#if defined(ALT_DIGITS) || defined(ERA)
+/* Convert a sequence of NUL-separated C strings to a Python string
+ * containing semicolon-separated items.
+ *
+ * result points to the first item.  The sequence ends at the first empty
+ * item or after max_count items, whichever comes first.
+ *
+ * Return a new str object decoded with PyUnicode_DecodeLocale(), or NULL
+ * with an exception set if allocation or decoding fails. */
+static PyObject *
+decode_strings(const char *result, size_t max_count)
+{
+    /* Measure the sequence: afterwards, size is the number of bytes spanned
+     * by the counted items, each one including its terminating NUL. */
+    size_t size = 0;
+    size_t count = 0;
+    while (count < max_count && result[size]) {
+        size += strlen(result + size) + 1;
+        count++;
+    }
+    if (count == 0) {
+        /* Nothing to join.  Returning early also keeps the unsigned item
+         * counter from wrapping around in the loop below. */
+        return PyUnicode_FromString("");
+    }
+    char *buf = PyMem_Malloc(size);
+    if (buf == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    memcpy(buf, result, size);
+    /* Turn the NUL after every item except the last into a semicolon;
+     * the last NUL is kept and terminates the joined string. */
+    size_t pos = 0;
+    for (size_t n = 1; n < count; n++) {
+        pos += strlen(buf + pos);
+        buf[pos++] = ';';
+    }
+    PyObject *pyresult = PyUnicode_DecodeLocale(buf, NULL);
+    PyMem_Free(buf);
+    return pyresult;
+}
+#endif
+#endif
+
 /*[clinic input]
 _locale.nl_langinfo

@ -668,32 +699,18 @@ _locale_nl_langinfo_impl(PyObject *module, int item)
            }
            PyObject *pyresult;
 #ifdef __GLIBC__
+            /* According to the POSIX specification the result must be
+             * a sequence of semicolon-separated strings.
+             * But in Glibc they are NUL-separated. */
 #ifdef ALT_DIGITS
            if (item == ALT_DIGITS && *result) {
-                /* According to the POSIX specification the result must be
-                 * a sequence of up to 100 semicolon-separated strings.
-                 * But in Glibc they are NUL-separated. */
-                Py_ssize_t i = 0;
-                int count = 0;
-                for (; count < 100 && result[i]; count++) {
-                    i += strlen(result + i) + 1;
-                }
-                char *buf = PyMem_Malloc(i);
-                if (buf == NULL) {
-                    PyErr_NoMemory();
-                    pyresult = NULL;
-                }
-                else {
-                    memcpy(buf, result, i);
-                    /* Replace all NULs with semicolons. */
-                    i = 0;
-                    while (--count) {
-                        i += strlen(buf + i);
-                        buf[i++] = ';';
-                    }
-                    pyresult = PyUnicode_DecodeLocale(buf, NULL);
-                    PyMem_Free(buf);
-                }
+                pyresult = decode_strings(result, 100);
+            }
+            else
+#endif
+#ifdef ERA
+            if (item == ERA && *result) {
+                pyresult = decode_strings(result, SIZE_MAX);
            }
            else
 #endif