From 0866b5579ad0cf7f4c33869d63a4d15494e9e07b Mon Sep 17 00:00:00 2001 From: Enrique Saurez Date: Wed, 3 Jun 2026 12:02:58 -0700 Subject: [PATCH] =?UTF-8?q?[nanvix]=20E:=20Phase=201C=20=E2=80=94=20build?= =?UTF-8?q?=208=20Tier-1=20text-codec=20modules=20as=20.so?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1C of the .a -> .so migration (see nanvix-todo/cpython-static-to-shared-migration.md section 5). Builds on Phase 1B (#6, #7) by promoting the remaining 8 Tier-1 "text codec" stdlib extension modules from being statically linked into python.elf to dlopen-loaded shared objects under lib/python3.12/lib-dynload/. Modules moved to *shared* in Modules/Setup.local generation (.nanvix/docker.py): - unicodedata: Unicode database lookups (the big one — 1.2 MB of Unicode data tables). - _multibytecodec: shared CJK codec infrastructure. - _codecs_cn / _codecs_hk / _codecs_iso2022 / _codecs_jp / _codecs_kr / _codecs_tw: per-region CJK codec tables. None of the eight reference external libraries; they are pure C with embedded data tables. They link against the same -lc / runtime symbols that the rest of the Phase 1 modules use. Test coverage (.nanvix/test.py): - New phase1c_snippet imports each module, asserts it is NOT in sys.builtin_module_names, runs one trivial sanity check to confirm dlopen + PyInit_* succeeded (calls unicodedata.lookup; checks that _multibytecodec.__create_codec and _codecs_*.getcodec are present), and prints the resolved __file__ path. Phase 1A/1B probes retained. Validation on local toolchain (phase0-llfix): - All 8 new .so files produced and installed under lib-dynload/ (unicodedata 1193K, _codecs_jp 262K, _codecs_hk 168K, _codecs_cn 155K, _codecs_kr 145K, _multibytecodec 147K, _codecs_tw 115K, _codecs_iso2022 76K — total ~2.2 MB across the eight files). - nm python.elf no longer shows PyInit_* for any of the 8. - python.elf size: 19.18 MB (Phase 1B) -> 17.48 MB (Phase 1C), -1.70 MB. 
Biggest single-phase reduction so far because the CJK codec tables and the Unicode database are large. - Hello + Phase 1A + Phase 1B + Phase 1C import probes + lxml + HTTP smoke + full regrtest 160/160 PASS in standalone mode. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .nanvix/docker.py | 9 +++++++++ .nanvix/test.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/.nanvix/docker.py b/.nanvix/docker.py index d7dbc86747..92d8f54ddb 100644 --- a/.nanvix/docker.py +++ b/.nanvix/docker.py @@ -320,6 +320,15 @@ def _generate_setup_local_cmd() -> str: f"'_statistics _statisticsmodule.c' " f"'mmap mmapmodule.c' " f"'_contextvars _contextvarsmodule.c' " + f"'# Phase 1C: Tier-1 text codecs (pure C, no external deps).' " + f"'unicodedata unicodedata.c' " + f"'_multibytecodec cjkcodecs/multibytecodec.c' " + f"'_codecs_cn cjkcodecs/_codecs_cn.c' " + f"'_codecs_hk cjkcodecs/_codecs_hk.c' " + f"'_codecs_iso2022 cjkcodecs/_codecs_iso2022.c' " + f"'_codecs_jp cjkcodecs/_codecs_jp.c' " + f"'_codecs_kr cjkcodecs/_codecs_kr.c' " + f"'_codecs_tw cjkcodecs/_codecs_tw.c' " f"> {ws}/Modules/Setup.local" ) diff --git a/.nanvix/test.py b/.nanvix/test.py index 836a947b29..367687d0a9 100644 --- a/.nanvix/test.py +++ b/.nanvix/test.py @@ -488,6 +488,27 @@ def stage( " print(f'CPYTHON_TEST_PHASE1A: {_name} loaded via dlopen from " "{_mod.__file__}')\n" ) + # Phase 1C: Tier-1 text-codec modules (CJK + unicodedata). No + # external deps; resolve against python.elf's .dynsym via dlopen. 
+ phase1c_snippet = ( + "_phase1c = [\n" + " ('unicodedata', lambda m: m.lookup('LATIN SMALL LETTER A') == 'a'),\n" + " ('_multibytecodec', lambda m: hasattr(m, '__create_codec')),\n" + " ('_codecs_cn', lambda m: hasattr(m, 'getcodec')),\n" + " ('_codecs_hk', lambda m: hasattr(m, 'getcodec')),\n" + " ('_codecs_iso2022', lambda m: hasattr(m, 'getcodec')),\n" + " ('_codecs_jp', lambda m: hasattr(m, 'getcodec')),\n" + " ('_codecs_kr', lambda m: hasattr(m, 'getcodec')),\n" + " ('_codecs_tw', lambda m: hasattr(m, 'getcodec')),\n" + "]\n" + "for _name, _check in _phase1c:\n" + " _mod = __import__(_name)\n" + " assert _name not in sys.builtin_module_names, " + "f'{_name} still built-in!'\n" + " assert _check(_mod), f'{_name} sanity check failed'\n" + " print(f'CPYTHON_TEST_PHASE1C: {_name} loaded via dlopen from " + "{_mod.__file__}')\n" + ) # Phase 1B: Tier-1 math + memory modules. Same dlopen flow; libm # symbols are pulled from python.elf via --whole-archive. phase1b_snippet = ( @@ -526,6 +547,7 @@ def stage( + array_snippet + phase1a_snippet + phase1b_snippet + + phase1c_snippet + (lxml_snippet if standalone else ""), )