diff --git a/.nanvix/config.py b/.nanvix/config.py index 36b62c5df199511..fb4d1eaf02ed41e 100644 --- a/.nanvix/config.py +++ b/.nanvix/config.py @@ -72,11 +72,20 @@ def toolchain_paths( "libssl": sr / "lib" / "libssl.a", "libcrypto": sr / "lib" / "libcrypto.a", "liblzma": sr / "lib" / "liblzma.a", + "libnvx_crt0": sr / "lib" / "libnvx_crt0.a", } def configure_env(toolchain: str | Path, sysroot: str | Path) -> dict[str, str]: - """Return the environment dict for ./configure.""" + """Return the environment dict for ./configure. + + NOTE: This helper is currently unused; the actual cpython build invokes + ``make -f Makefile.nanvix`` which has its own inline CONFIGURE_ENV. This + function is kept in sync so a future caller does not pick up stale link + flags. See ``Makefile.nanvix`` for the authoritative comment block + explaining the ``--whole-archive`` / ``--export-dynamic`` / + ``--allow-multiple-definition`` rationale. + """ tp = toolchain_paths(toolchain, sysroot) sr = Path(sysroot) return { @@ -96,7 +105,10 @@ def configure_env(toolchain: str | Path, sysroot: str | Path) -> dict[str, str]: f"-Wl,--export-dynamic -Wl,--no-dynamic-linker" ), "LIBS": ( - f"-Wl,--start-group {tp['libposix']} {tp['libc']} {tp['libm']} " + f"-Wl,--whole-archive {tp['libnvx_crt0']} {tp['libposix']} " + f"{tp['libc']} {tp['libm']} " + f"-lstdc++ -lgcc -Wl,--no-whole-archive " + f"-Wl,--start-group " f"-lsqlite3 -lssl -lcrypto -lz -lbz2 -llzma -lffi -Wl,--end-group" ), "LIBSQLITE3_LIBS": f"-L{sr}/lib -lsqlite3", diff --git a/.nanvix/docker.py b/.nanvix/docker.py index 16fa8e201a281b3..870580c99a72315 100644 --- a/.nanvix/docker.py +++ b/.nanvix/docker.py @@ -288,13 +288,19 @@ def _generate_setup_local_cmd() -> str: return ( f"printf '%s\\n' " f"'# Auto-generated by .nanvix/docker.py -- do not edit manually.' " + f"'' " f"'# Statically-linked extension modules for Nanvix builds.' " - f"'#' " + f"'*static*' " f"'# Nanvix OS interface module (snapshot, host-mount).' 
" f"'_nanvix _nanvixmodule.c' " f"'# lxml C extension modules (statically linked via pre-built archives).' " f"'_lxml_etree lxml_etree_builtin.c -L{sysroot}/lib -llxml_etree -lxslt -lexslt -lxml2 -lz' " f"'_lxml_elementpath lxml_elementpath_builtin.c -L{sysroot}/lib -llxml_elementpath -lxml2 -lz' " + f"'' " + f"'# Phase 0 of the .a -> .so migration: array as proof-of-concept shared module.' " + f"'# See nanvix-todo/cpython-static-to-shared-migration.md section 4.' " + f"'*shared*' " + f"'array arraymodule.c' " f"> {ws}/Modules/Setup.local" ) diff --git a/.nanvix/docker/Dockerfile b/.nanvix/docker/Dockerfile index 8edab3fd54399fa..9b605dc1d090b55 100644 --- a/.nanvix/docker/Dockerfile +++ b/.nanvix/docker/Dockerfile @@ -6,9 +6,57 @@ FROM ghcr.io/nanvix/toolchain-gcc:sha-34a3641 +# Install the host Python plus the build helpers required by extension +# modules that ship a meson or Cython build step (numpy, scipy, pandas, +# ...): +# +# - ninja — meson's default backend; missing it makes every meson-based +# extension build fail immediately. +# - Cython — required by numpy 1.26.4's `numpy/_build_utils/tempita.py` +# .pyx.in code generation. Pinned `<3` for numpy 1.26.x +# compatibility; lift the pin when bumping numpy. +# +# We deliberately purge `/usr/include/python3.12` after the install. The +# `python3-pip` / `ninja-build` apt packages transitively pull in +# `libpython3.12-dev`, whose headers under `/usr/include/python3.12` would +# otherwise be picked up by meson's regen step ahead of the Nanvix cross +# sysroot headers and silently corrupt the cross-build. RUN apt-get update \ && apt-get install -y --no-install-recommends \ python3 \ python3-dev \ + python3-pip \ + ninja-build \ + && pip3 install --break-system-packages --no-cache-dir 'Cython<3' \ + && rm -rf /usr/include/python3.12 \ && rm -rf /var/lib/apt/lists/* \ && ln -sf /usr/bin/python3 /opt/nanvix/bin/python3 + +# Install the cc-wrapper. 
This wrapper sits in front of the real +# `i686-nanvix-gcc` / `i686-nanvix-g++` driver binaries and detects +# whether the invocation is producing an executable or a shared library. +# For shared-library links (-shared) it strips exe-only LDFLAGS that +# would otherwise be inherited from cpython's single `LDFLAGS` env var +# and cause `.so` builds to fail when the linker treats the output as +# an executable. See cc-wrapper.sh's header for the full rationale. +# +# Install pattern: rename the real driver to `<tool>.real`, install the +# wrapper script at `<tool>`, and symlink for both gcc and g++. +COPY cc-wrapper.sh /opt/nanvix/bin/i686-nanvix-cc-wrapper.sh +RUN sed -i 's/\r$//' /opt/nanvix/bin/i686-nanvix-cc-wrapper.sh \ + && chmod +x /opt/nanvix/bin/i686-nanvix-cc-wrapper.sh \ + && for tool in i686-nanvix-gcc i686-nanvix-g++; do \ + if [ -L "/opt/nanvix/bin/$tool" ]; then \ + # Pre-existing wrapper symlink: require the matching .real to + # already exist (set up by a prior wrapper install) before we + # replace the symlink, so we never strand the toolchain. + if [ ! -f "/opt/nanvix/bin/$tool.real" ]; then \ + echo "cc-wrapper install: $tool is a symlink but $tool.real is missing; aborting" >&2; \ + exit 1; \ + fi; \ + rm /opt/nanvix/bin/$tool; \ + elif [ ! -f "/opt/nanvix/bin/$tool.real" ]; then \ + mv /opt/nanvix/bin/$tool /opt/nanvix/bin/$tool.real; \ + fi; \ + ln -sf i686-nanvix-cc-wrapper.sh /opt/nanvix/bin/$tool; \ + done diff --git a/.nanvix/docker/cc-wrapper.sh b/.nanvix/docker/cc-wrapper.sh new file mode 100644 index 000000000000000..a285186d7c415a2 --- /dev/null +++ b/.nanvix/docker/cc-wrapper.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# i686-nanvix-gcc / i686-nanvix-g++ cc-wrapper. +# +# Detects whether the invocation is producing an executable or a shared +# library, and routes to the real compiler driver with the correct linker +# flags for each case. 
+# +# Why this exists: +# +# The Nanvix build of CPython sets a single `LDFLAGS` env var on +# `./configure` that contains executable-specific flags (the linker +# script `user.ld`, `-no-pie`, `-Wl,--no-dynamic-linker`, +# `-Wl,--export-dynamic`). cpython's build system propagates that same +# `LDFLAGS` to BOTH the main `python.elf` link and to every extension +# module `.so` link. For `.so` outputs those exe-only flags are wrong: +# +# - `-T user.ld` tells `ld` to use an executable layout. +# When applied to a `-shared` link, `ld` +# treats the output as an exe and rejects +# any undefined symbol -- even those that +# should resolve at dlopen() time against +# the main exe's `.dynsym` (the C API +# symbols every Python extension references). +# - `-no-pie` PIE-disable. Shared libraries must be PIC. +# - `-Wl,--no-dynamic-linker` meaningless for `.so`. +# - `-Wl,--export-dynamic` exe-only. +# +# This wrapper makes the build system's single `LDFLAGS` value +# "do the right thing" for both modes, without forcing each Makefile +# that consumes the toolchain to know the difference. +# +# Behaviour: +# +# - If the invocation is compile-only (any of `-c` / `-S` / `-E`): +# forward unchanged to the real compiler. +# - If the invocation does NOT contain `-shared`: +# executable link (or pure compile in the rare case +# of no `-c`/`-S`/`-E` and no `-shared`). Forward unchanged. +# - If the invocation contains `-shared`: +# shared-library link. Strip the exe-only flags listed above +# and ensure `-fPIC` is present. Forward. +# +# The wrapper is invoked by symlink: i686-nanvix-gcc -> cc-wrapper.sh +# and i686-nanvix-g++ -> cc-wrapper.sh. The wrapper picks the right +# real binary based on its own argv[0] (i.e. how it was invoked). +# +# Each real binary is preserved as `<tool>.real` alongside the wrapper: +# i686-nanvix-gcc.real, i686-nanvix-g++.real. +# +# See nanvix-todo/c-extension-compiler-wrapper.md for the design note. 
+ +set -e + +# Find the real binary by appending `.real` to argv[0]'s basename. +self_dir="$(dirname "$0")" +self_name="$(basename "$0")" +real_bin="${self_dir}/${self_name}.real" + +if [ ! -x "$real_bin" ]; then + echo "cc-wrapper: real binary not found at $real_bin" >&2 + exit 1 +fi + +# Detect mode: compile-only, exe link, or shared link. +shared=0 +compile_only=0 +for arg in "$@"; do + case "$arg" in + -shared) shared=1 ;; + -c|-S|-E) compile_only=1 ;; + esac +done + +if [ "$compile_only" = "1" ] || [ "$shared" = "0" ]; then + # Compile-only or exe link: forward unchanged. + exec "$real_bin" "$@" +fi + +# Shared-library link: strip exe-only flags and ensure -fPIC. +filtered=() +skip_next=0 +have_fpic=0 +for arg in "$@"; do + if [ "$skip_next" = "1" ]; then + skip_next=0 + continue + fi + case "$arg" in + -T) skip_next=1 ;; + -T*) ;; + -no-pie) ;; + -Wl,--no-dynamic-linker) ;; + -Wl,--export-dynamic) ;; + -Wl,-T,*) ;; + *.ld) ;; + -fPIC) + have_fpic=1 + filtered+=("$arg") + ;; + *) filtered+=("$arg") ;; + esac +done + +if [ "$have_fpic" = "0" ]; then + filtered=(-fPIC "${filtered[@]}") +fi + +exec "$real_bin" "${filtered[@]}" diff --git a/.nanvix/lxml.py b/.nanvix/lxml.py index 58c7565425575c8..5211ad67aaa80f5 100644 --- a/.nanvix/lxml.py +++ b/.nanvix/lxml.py @@ -14,13 +14,21 @@ _SETUP_LOCAL_TEMPLATE = """\ # Auto-generated by .nanvix/lxml.py -- do not edit manually. + # Statically-linked extension modules for Nanvix builds. -# +*static* # Nanvix OS interface module (snapshot, host-mount). _nanvix _nanvixmodule.c # lxml C extension modules (statically linked via pre-built archives). _lxml_etree lxml_etree_builtin.c -L{sysroot}/lib -llxml_etree -lxslt -lexslt -lxml2 -lz _lxml_elementpath lxml_elementpath_builtin.c -L{sysroot}/lib -llxml_elementpath -lxml2 -lz + +# Phase 0 of the .a -> .so migration: array as proof-of-concept shared module. +# See nanvix-todo/cpython-static-to-shared-migration.md section 4. 
+# Listed BEFORE Setup.stdlib's static declaration so makesetup's +# "first rule wins" semantics make this shared variant take precedence. +*shared* +array arraymodule.c """ diff --git a/.nanvix/test.py b/.nanvix/test.py index 8a132eac88c698d..ddc8efa3d26c65e 100644 --- a/.nanvix/test.py +++ b/.nanvix/test.py @@ -448,6 +448,21 @@ def stage( # filesystem I/O goes through nanvixd's virtualized host-FS layer. hello_script = sysroot_dir / "test_hello.py" standalone = process_mode == "standalone" + # Phase 0 of the .a -> .so migration: `array` is now a shared + # extension at lib/python3.12/lib-dynload/array.cpython-312.so + # (built from `*shared* array arraymodule.c` in Setup.local). + # Asserting it is NOT in `sys.builtin_module_names` proves the + # dlopen path is exercised end-to-end; if the .so failed to load, + # the import would raise. + array_snippet = ( + "import array\n" + "assert 'array' not in sys.builtin_module_names, " + "'array still built-in!'\n" + "_a = array.array('i', [1, 2, 3])\n" + "assert _a.tolist() == [1, 2, 3], f'array contents wrong: {_a.tolist()}'\n" + "print(f'CPYTHON_TEST_ARRAY_SO: array loaded via dlopen from " + "{array.__file__}')\n" + ) lxml_snippet = ( "try:\n" " import lxml.etree\n" @@ -465,6 +480,7 @@ def stage( "import sys\n" "print('CPYTHON_TEST_HELLO: Hello from Python', sys.version_info[:2])\n" "print('CPYTHON_TEST_PLATFORM:', sys.platform)\n" + + array_snippet + (lxml_snippet if standalone else ""), ) diff --git a/Makefile.nanvix b/Makefile.nanvix index 4a11e1a19b3b028..55636235b10d825 100644 --- a/Makefile.nanvix +++ b/Makefile.nanvix @@ -91,6 +91,10 @@ ifdef CONFIG_NANVIX LIBSQLITE3 := $(DOCKER_SYSROOT_PATH)/lib/libsqlite3.a LIBSSL := $(DOCKER_SYSROOT_PATH)/lib/libssl.a LIBCRYPTO := $(DOCKER_SYSROOT_PATH)/lib/libcrypto.a + # libnvx_crt0 ships the executable startup symbols (`_do_start`, `_start`, + # `c_trampoline`). 
It must be present in the Nanvix sysroot ahead of this + # cpython build; the existence check below fails loudly when it is not. + LIBNVX_CRT0 := $(DOCKER_SYSROOT_PATH)/lib/libnvx_crt0.a BUILD_PYTHON := $(DOCKER_TOOLCHAIN_PATH)/bin/python3 else TOOLCHAIN_PREFIX := $(NANVIX_TOOLCHAIN) @@ -102,8 +106,18 @@ ifdef CONFIG_NANVIX LIBSQLITE3 := $(abspath $(NANVIX_HOME))/lib/libsqlite3.a LIBSSL := $(abspath $(NANVIX_HOME))/lib/libssl.a LIBCRYPTO := $(abspath $(NANVIX_HOME))/lib/libcrypto.a + LIBNVX_CRT0 := $(abspath $(NANVIX_HOME))/lib/libnvx_crt0.a BUILD_PYTHON := $(NANVIX_TOOLCHAIN)/bin/python3 endif + + # libstdc++ / libgcc are referenced via `-l` rather than absolute paths so + # the GCC driver resolves them: libgcc lives under a versioned dir + # (`lib/gcc/i686-nanvix/<version>/libgcc.a`) and hardcoding a version + # would be fragile across toolchain upgrades. Defined once at top-level + # because the `-l` form is identical between the docker and host build + # paths above. + LIBSTDCXX := -lstdc++ + LIBGCC := -lgcc else ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),distclean) @@ -113,7 +127,69 @@ else EXE=.elf endif +# Existence check for libnvx_crt0.a. +# +# libnvx_crt0.a is the executable startup archive (`_do_start` / `_start` / +# `c_trampoline`) introduced by the Nanvix `nvx-crt0` crate split. cpython +# requires a sysroot that ships it; on an older snapshot, fail at make-parse +# time with an actionable message rather than producing a python.elf with no +# entry point. Skipped only for clean/distclean-only runs; bare `make` is checked. +ifdef CONFIG_NANVIX +ifneq ($(filter-out clean distclean,$(or $(MAKECMDGOALS),default)),) +ifeq ($(wildcard $(LIBNVX_CRT0)),) +$(error libnvx_crt0.a not found at $(LIBNVX_CRT0). Update the Nanvix sysroot to one that ships libnvx_crt0.a (the nvx-crt0 crate must be present and built into the sysroot lib/ directory).) 
+endif +endif +endif + # Configure environment variables +# +# Linker flag rationale: +# +# `-Wl,--export-dynamic`: put every globally-defined symbol from python.elf +# into its `.dynsym`. Extension `.so`s (numpy's `_multiarray_umath.so`, ssl, +# etc.) leave C/C++ runtime symbols UND and resolve them against python.elf +# at dlopen() time. Without this, those `.so`s fail to load. +# +# `-Wl,--allow-multiple-definition`: tolerates duplicate symbol definitions +# that arise from forcing every libposix / libc / libm / libstdc++ / libgcc +# object into python.elf via `--whole-archive` below. The biggest set is +# newlib's long-double math helpers (`frexpl`, `llrintl`, `lrintl`, `rintl` +# are defined in three different newlib directories simultaneously — a +# newlib build-system bug). Other known overlaps include libposix vs. libc +# (`_start`, `copysign[f]`, `getenv`, `setenv`, `unsetenv`, `environ`, +# `isatty`), libc vs. libm (`frexp`, `ldexp`, `modf`, `isnan`, `isinf`, +# `scalbn`, …), libm vs. libstdc++ (`hypotf`), libgcc internal duplicates +# (`__x86.get_pc_thunk.*`), and a libc / libgcc `__eprintf`. The set is +# large and toolchain-build-version-dependent; treating the link as +# multiple-definition-tolerant is the only practical workaround until each +# upstream is fixed. Remove this flag once the contributing duplicates are +# resolved upstream. +# +# `LIBS` segment 1 (`--whole-archive ... --no-whole-archive`): force every +# object from libnvx_crt0, libposix, libc, libm, libstdc++, and libgcc into +# python.elf so the runtime symbols extension `.so`s depend on are embedded +# and re-exported via `--export-dynamic`. Without `--whole-archive`, the +# static linker drops unreferenced objects (e.g. `fscanf`, `longjmp`, +# `strtold_l` for numpy; `operator new/delete[]`, `__cxa_*`, `_Unwind_*`, +# `std::type_info` vtables for any C++ extension) and subsequent dlopen() +# of those `.so`s fails with "symbol not found". 
+# +# `libnvx_crt0` is listed first inside `--whole-archive`. Today, libposix.a +# still ships its own copy of the startup symbols (`_start`, `_do_start`, +# `c_trampoline`) because `nvx` builds them in under the `staticlib` +# feature. With `--allow-multiple-definition` (above) the linker takes the +# first definition, so listing libnvx_crt0 first selects the standalone +# crt0 copy of `_start` and friends. This is an intentional behaviour +# change: future Nanvix versions remove the duplicate from libposix, after +# which libnvx_crt0 is the sole provider. Listing it first today keeps +# python.elf using a consistent `_start` source across both states. +# +# `LIBS` segment 2 (`--start-group ... --end-group`): the external add-on +# libraries (sqlite3, ssl, crypto, z, bz2, lzma, ffi). The group is needed +# only for their inter-archive circular dependencies; they resolve symbols +# from libposix/libc/libm/libstdc++ against the already-embedded objects +# from segment 1. CONFIGURE_ENV = \ CC="$(TOOLCHAIN_PREFIX)/bin/i686-nanvix-gcc" \ CXX="$(TOOLCHAIN_PREFIX)/bin/i686-nanvix-g++" \ @@ -123,7 +199,7 @@ CONFIGURE_ENV = \ CFLAGS="-O3 -fomit-frame-pointer -fno-unwind-tables -fno-asynchronous-unwind-tables -I$(SYSROOT_PATH)/include" \ CFLAGS_NODIST="-fno-semantic-interposition" \ LDFLAGS="-L$(SYSROOT_PATH)/lib -T$(SYSROOT_PATH)/lib/user.ld -Wl,--allow-multiple-definition -no-pie -Wl,--export-dynamic -Wl,--no-dynamic-linker" \ - LIBS="-Wl,--start-group $(LIBPOSIX) $(LIBC) $(LIBM) -lsqlite3 -lssl -lcrypto -lz -lbz2 -llzma -lffi -Wl,--end-group" \ + LIBS="-Wl,--whole-archive $(LIBNVX_CRT0) $(LIBPOSIX) $(LIBC) $(LIBM) $(LIBSTDCXX) $(LIBGCC) -Wl,--no-whole-archive -Wl,--start-group -lsqlite3 -lssl -lcrypto -lz -lbz2 -llzma -lffi -Wl,--end-group" \ LIBSQLITE3_LIBS="-L$(SYSROOT_PATH)/lib -lsqlite3" \ LIBSQLITE3_CFLAGS="-I$(SYSROOT_PATH)/include" \ ZLIB_LIBS="-L$(SYSROOT_PATH)/lib -lz" \