From ffdb42a7432d1e87e6a0d8b2c56ef9a4eba33be5 Mon Sep 17 00:00:00 2001 From: Yongming Wang Date: Mon, 18 May 2026 22:19:32 -0700 Subject: [PATCH 1/5] feat(subinterpreter): add opt-in TLS-cached thread state mode subinterpreter_scoped_activate previously created and destroyed a fresh PyThreadState on every activation when the calling OS thread was not already running the target interpreter. Workloads that repeatedly re-enter the same sub-interpreter from the same thread therefore churn thread states and lose per-thread interpreter state between activations (see pybind/pybind11#6040). Add an opt-in subinterpreter_thread_state::cached policy: on first use a PyThreadState is created and stored in OS-thread-local storage keyed by the target interpreter; subsequent activations on that thread only swap it in/out and never destroy it. The default stays transient, so existing behavior is unchanged. Since pybind11 does not control thread lifetime, cleanup is explicit: subinterpreter::release_cached_thread_state() releases the calling thread's cached state for one interpreter, and the static release_all_cached_thread_states() releases all of the calling thread's cached states as an end-of-thread hook. The TLS map's destructor only frees its own nodes and never touches the Python C API, so an unreleased state leaks rather than crashing at thread exit. Includes test coverage and embedding docs. 
Co-Authored-By: Claude Opus 4.7 --- docs/advanced/embedding.rst | 60 ++++++++ include/pybind11/subinterpreter.h | 131 +++++++++++++++++- tests/test_with_catch/test_subinterpreter.cpp | 55 ++++++++ 3 files changed, 239 insertions(+), 7 deletions(-) diff --git a/docs/advanced/embedding.rst b/docs/advanced/embedding.rst index c41aec152b..29678480f3 100644 --- a/docs/advanced/embedding.rst +++ b/docs/advanced/embedding.rst @@ -345,6 +345,66 @@ Example: } +Reusing a thread state (cached activation) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, when a thread activates a sub-interpreter it is not already running, +:class:`subinterpreter_scoped_activate` creates a fresh ``PyThreadState`` and +destroys it when the scope ends. A thread that repeatedly enters the same +sub-interpreter therefore allocates and frees a thread state every time, and +does **not** preserve any per-thread interpreter state between activations. + +For workloads that repeatedly re-enter the same sub-interpreter from the same OS +thread, you can opt into a cached mode by passing +:enum:`subinterpreter_thread_state::cached`: + +.. code-block:: cpp + + { + // First activation on this OS thread creates a PyThreadState and caches it + // in thread-local storage, keyed by the target interpreter. + py::subinterpreter_scoped_activate guard( + sub, py::subinterpreter_thread_state::cached); + // ... use the sub-interpreter ... + } + // The PyThreadState is swapped out but NOT destroyed. + + { + // Subsequent activations on the same OS thread reuse the cached + // PyThreadState (only a swap, no allocation) and preserve its + // per-thread interpreter state. + py::subinterpreter_scoped_activate guard( + sub, py::subinterpreter_thread_state::cached); + } + +The default behavior is unchanged: the parameter defaults to +:enum:`subinterpreter_thread_state::transient`, and the cache is only consulted +when ``cached`` is explicitly requested. 
``transient`` and ``cached`` +activations never share a thread state, even for the same interpreter on the +same thread. + +Because pybind11 does not control thread creation or destruction, a cached +``PyThreadState`` is **not** destroyed automatically. The owning OS thread must +explicitly release it before the sub-interpreter is destroyed (and before that +thread exits, to avoid a leak): + +- :func:`subinterpreter::release_cached_thread_state` destroys the cached + thread state that the **calling** OS thread created for **that one** + sub-interpreter. +- :func:`subinterpreter::release_all_cached_thread_states` destroys every + cached thread state the **calling** OS thread created (for any + sub-interpreter); it is a convenient end-of-thread cleanup hook. + +.. warning:: + + Call the release functions on the same OS thread that activated the + sub-interpreter, while that sub-interpreter is still alive, and while no + :class:`subinterpreter_scoped_activate` scope using the cached state is + active on that thread. Destroying a cached ``PyThreadState`` whose + interpreter has already been finalized is undefined behavior. The cache is + per OS thread: a thread cannot release another thread's cached states, so + each worker thread is responsible for its own cleanup before it exits. + GIL API for sub-interpreters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/include/pybind11/subinterpreter.h b/include/pybind11/subinterpreter.h index 547545263e..4c12654ec8 100644 --- a/include/pybind11/subinterpreter.h +++ b/include/pybind11/subinterpreter.h @@ -14,6 +14,7 @@ #include "gil.h" #include +#include #ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT # error "This platform does not support subinterpreters, do not include this file." @@ -21,6 +22,42 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +/// OS-thread-local cache mapping a target interpreter to the PyThreadState that was created for +/// it *on the current OS thread*. 
Used by subinterpreter_scoped_activate when +/// subinterpreter_thread_state::cached is requested, so that repeatedly entering the same +/// interpreter from the same OS thread reuses one PyThreadState (swapped in/out) instead of +/// allocating and destroying a fresh one each time. +/// +/// The values are raw, non-owning pointers. At thread exit the map's destructor only frees its +/// own nodes; it deliberately does NOT touch the Python C API (which may be unusable at that +/// point), so a cached PyThreadState is leaked unless the owning thread first calls +/// subinterpreter::release_cached_thread_state() or +/// subinterpreter::release_all_cached_thread_states(). +inline std::unordered_map & +subinterpreter_thread_state_cache() { + thread_local std::unordered_map cache; + return cache; +} + +PYBIND11_NAMESPACE_END(detail) + +/// Selects how subinterpreter_scoped_activate obtains a PyThreadState when the calling OS thread +/// is not already running the target interpreter. +enum class subinterpreter_thread_state { + /// Default / legacy behavior: a fresh PyThreadState is created on activation and destroyed + /// when the scope exits. + transient, + /// Reuse (or, on first use, create-and-cache) a PyThreadState held in OS-thread-local + /// storage, keyed by the target interpreter. The scope only swaps it in and out and never + /// destroys it. The owning thread is responsible for eventually destroying it via + /// subinterpreter::release_cached_thread_state() / + /// subinterpreter::release_all_cached_thread_states(); see those functions for the + /// preconditions. + cached +}; + class subinterpreter; /// Activate the subinterpreter and acquire its GIL, while also releasing any GIL and interpreter @@ -28,7 +65,9 @@ class subinterpreter; /// associated GIL are restored to their state as they were before the scope was entered. 
class subinterpreter_scoped_activate { public: - explicit subinterpreter_scoped_activate(subinterpreter const &si); + explicit subinterpreter_scoped_activate( + subinterpreter const &si, + subinterpreter_thread_state ts_policy = subinterpreter_thread_state::transient); ~subinterpreter_scoped_activate(); subinterpreter_scoped_activate(subinterpreter_scoped_activate &&) = delete; @@ -41,6 +80,9 @@ class subinterpreter_scoped_activate { PyThreadState *tstate_ = nullptr; PyGILState_STATE gil_state_; bool simple_gil_ = false; + // When true, tstate_ is owned by the OS-thread-local cache and must NOT be destroyed when + // this scope exits (only swapped out). + bool cached_ = false; }; /// Holds a Python subinterpreter instance @@ -216,6 +258,26 @@ class subinterpreter { /// Get the interpreter's state dict. This interpreter's GIL must be held before calling! dict state_dict() { return reinterpret_borrow(PyInterpreterState_GetDict(istate_)); } + /// Destroy the PyThreadState (if any) that subinterpreter_thread_state::cached created for + /// THIS interpreter on the CURRENT OS thread, and drop it from that thread's cache. + /// + /// Call this on the same OS thread that activated the interpreter, while this subinterpreter + /// is still alive, and while no subinterpreter_scoped_activate scope for it is active on this + /// thread. It is a no-op if this thread has no cached state for this interpreter. The caller + /// need not hold any GIL: the cached state is briefly swapped in (acquiring this interpreter's + /// GIL) to be cleared and deleted, then whatever was active before is restored. + void release_cached_thread_state() const; + + /// Destroy every cached PyThreadState that was created on the CURRENT OS thread (for any + /// interpreter) and clear this thread's cache. Intended as an end-of-thread cleanup hook for + /// embedder worker threads. 
+ /// + /// Every interpreter that still has a cached state on this thread MUST still be alive when + /// this is called (deleting a PyThreadState whose interpreter was already finalized is + /// undefined behavior). Must be called on the OS thread that owns the cache, with no + /// subinterpreter_scoped_activate scope using a cached state active on this thread. + static void release_all_cached_thread_states(); + /// abandon cleanup of this subinterpreter (leak it). this might be needed during /// finalization... void disarm() { creation_tstate_ = nullptr; } @@ -244,7 +306,8 @@ class scoped_subinterpreter { subinterpreter_scoped_activate scope_; }; -inline subinterpreter_scoped_activate::subinterpreter_scoped_activate(subinterpreter const &si) { +inline subinterpreter_scoped_activate::subinterpreter_scoped_activate( + subinterpreter const &si, subinterpreter_thread_state ts_policy) { if (!si.istate_) { pybind11_fail("null subinterpreter"); } @@ -256,9 +319,25 @@ inline subinterpreter_scoped_activate::subinterpreter_scoped_activate(subinterpr return; } - // we can't really interact with the interpreter at all until we switch to it - // not even to, for example, look in its state dict or touch its internals - tstate_ = PyThreadState_New(si.istate_); + if (ts_policy == subinterpreter_thread_state::cached) { + // Reuse a PyThreadState held in this OS thread's cache, or create one and cache it. + // This preserves PyThreadState identity (and its per-thread interpreter state) across + // repeated activations of the same interpreter from the same OS thread, instead of + // creating and destroying a fresh state every time. 
+ auto &cache = detail::subinterpreter_thread_state_cache(); + auto it = cache.find(si.istate_); + if (it != cache.end()) { + tstate_ = it->second; + } else { + tstate_ = PyThreadState_New(si.istate_); + cache.emplace(si.istate_, tstate_); + } + cached_ = true; + } else { + // we can't really interact with the interpreter at all until we switch to it + // not even to, for example, look in its state dict or touch its internals + tstate_ = PyThreadState_New(si.istate_); + } // make the interpreter active and acquire the GIL old_tstate_ = PyThreadState_Swap(tstate_); @@ -279,8 +358,12 @@ inline subinterpreter_scoped_activate::~subinterpreter_scoped_activate() { } #endif detail::get_internals().tstate.reset(); - PyThreadState_Clear(tstate_); - PyThreadState_DeleteCurrent(); + if (!cached_) { + PyThreadState_Clear(tstate_); + PyThreadState_DeleteCurrent(); + } + // When cached_, tstate_ stays alive in the OS-thread-local cache for reuse; the + // PyThreadState_Swap below merely detaches it from this thread. } // Go back the previous interpreter (if any) and acquire THAT gil @@ -288,4 +371,38 @@ inline subinterpreter_scoped_activate::~subinterpreter_scoped_activate() { } } +inline void subinterpreter::release_cached_thread_state() const { + if (istate_ == nullptr) { + return; + } + auto &cache = detail::subinterpreter_thread_state_cache(); + auto it = cache.find(istate_); + if (it == cache.end()) { + return; + } + PyThreadState *cached = it->second; + cache.erase(it); + + // Make the cached state current (acquiring this interpreter's GIL) so it can be cleared and + // destroyed on the OS thread that created it, then restore whatever was active before. 
+ PyThreadState *prev = PyThreadState_Swap(cached); + PyThreadState_Clear(cached); + PyThreadState_DeleteCurrent(); + PyThreadState_Swap(prev); +} + +inline void subinterpreter::release_all_cached_thread_states() { + auto &cache = detail::subinterpreter_thread_state_cache(); + for (auto const &entry : cache) { + PyThreadState *cached = entry.second; + // prev is the state active before this swap; it is restored after each deletion, so it is + // never one of the cached states being destroyed here. + PyThreadState *prev = PyThreadState_Swap(cached); + PyThreadState_Clear(cached); + PyThreadState_DeleteCurrent(); + PyThreadState_Swap(prev); + } + cache.clear(); +} + PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 35b7f02334..d1028b9d19 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -153,6 +153,61 @@ TEST_CASE("Move Subinterpreter") { } # endif +TEST_CASE("Cached Subinterpreter thread state") { + py::subinterpreter sub = py::subinterpreter::create(); + + PyThreadState *main_a = nullptr; + PyThreadState *main_b = nullptr; + PyThreadState *transient_ts = nullptr; + PyThreadState *worker_ts = nullptr; + + { + py::subinterpreter_scoped_activate a(sub, py::subinterpreter_thread_state::cached); + main_a = PyThreadState_Get(); + py::list(py::module_::import("sys").attr("path")).append(py::str(".")); + } + { + py::subinterpreter_scoped_activate a(sub, py::subinterpreter_thread_state::cached); + main_b = PyThreadState_Get(); + } + + // Same OS thread + same interpreter + cached policy => the PyThreadState is reused. + REQUIRE(main_a != nullptr); + REQUIRE(main_a == main_b); + + // The default (transient) policy must not reuse the cached state: while the cached state is + // still alive, a transient activation gets a distinct PyThreadState. 
+ { + py::subinterpreter_scoped_activate a(sub); + transient_ts = PyThreadState_Get(); + } + REQUIRE(transient_ts != main_a); + + // A different OS thread gets its own cached state (both alive => distinct pointers). + { + py::gil_scoped_release nogil; + std::thread([&]() { + { + py::subinterpreter_scoped_activate a(sub, + py::subinterpreter_thread_state::cached); + worker_ts = PyThreadState_Get(); + } + // The owning thread releases its own cached state before exiting. + sub.release_cached_thread_state(); + }).join(); + } + REQUIRE(worker_ts != nullptr); + REQUIRE(worker_ts != main_a); + + // Release this thread's cached state; a second release is a safe no-op, and release_all on a + // now-empty cache must not crash. + sub.release_cached_thread_state(); + sub.release_cached_thread_state(); + py::subinterpreter::release_all_cached_thread_states(); + + unsafe_reset_internals_for_single_interpreter(); +} + TEST_CASE("GIL Subinterpreter") { PyInterpreterState *main_interp = PyInterpreterState_Get(); From c197a3c70330056f9c478d0e347bc86f726900e7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 May 2026 05:22:58 +0000 Subject: [PATCH 2/5] style: pre-commit fixes --- include/pybind11/subinterpreter.h | 6 +++--- tests/test_with_catch/test_subinterpreter.cpp | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/pybind11/subinterpreter.h b/include/pybind11/subinterpreter.h index 4c12654ec8..359f03ed71 100644 --- a/include/pybind11/subinterpreter.h +++ b/include/pybind11/subinterpreter.h @@ -65,9 +65,9 @@ class subinterpreter; /// associated GIL are restored to their state as they were before the scope was entered. 
class subinterpreter_scoped_activate { public: - explicit subinterpreter_scoped_activate( - subinterpreter const &si, - subinterpreter_thread_state ts_policy = subinterpreter_thread_state::transient); + explicit subinterpreter_scoped_activate(subinterpreter const &si, + subinterpreter_thread_state ts_policy + = subinterpreter_thread_state::transient); ~subinterpreter_scoped_activate(); subinterpreter_scoped_activate(subinterpreter_scoped_activate &&) = delete; diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index d1028b9d19..56415ec497 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -188,8 +188,7 @@ TEST_CASE("Cached Subinterpreter thread state") { py::gil_scoped_release nogil; std::thread([&]() { { - py::subinterpreter_scoped_activate a(sub, - py::subinterpreter_thread_state::cached); + py::subinterpreter_scoped_activate a(sub, py::subinterpreter_thread_state::cached); worker_ts = PyThreadState_Get(); } // The owning thread releases its own cached state before exiting. From e7f5948752e1dfee93c34144359ac8429482ebfc Mon Sep 17 00:00:00 2001 From: Yongming Wang Date: Tue, 19 May 2026 21:11:12 -0700 Subject: [PATCH 3/5] refactor(subinterpreter): replace cached enum/TLS with subinterpreter_thread_state RAII Address review feedback on the original "cached" mode by switching to an explicit two-RAII design suggested by @b-pass: "Create a class ... to RAII-manage the PyThreadState but start its lifetime in an already released state. You could create another class (or modify scoped_activate) to scoped/RAII activate the inactive threadstate." Removed - enum subinterpreter_thread_state { transient, cached } and the defaulted ctor parameter on subinterpreter_scoped_activate. - detail::subinterpreter_thread_state_cache thread_local map. - subinterpreter::release_cached_thread_state() and subinterpreter::release_all_cached_thread_states(). 
This eliminates: the hidden per-thread map, the "release_all" footgun across pybind11 modules (the cache was module-local), and the implicit "must not be active when called" contract on the release functions. Added - Public class subinterpreter_thread_state that owns one PyThreadState for a given subinterpreter on its constructing OS thread, created in a released state (not current, no GIL). Non-copyable, non-movable (PyThreadState is bound to its creating OS thread). - subinterpreter_scoped_activate(subinterpreter_thread_state &) overload: swaps the owned PyThreadState in on entry, swaps it out on exit, does not touch its lifetime. Behavior - The existing subinterpreter_scoped_activate(subinterpreter const &) overload is unchanged (still transient: New on entry, Delete on exit). All previously-working code keeps working. - With subinterpreter_thread_state, one OS thread can alternate between multiple subinterpreters and each PyThreadState is preserved across activations -- the use case that gil_scoped_release/acquire + a long-lived scoped_activate cannot solve alone (the per-thread internals.tstate slot holds only one inactive tstate). - The dtor of subinterpreter_thread_state guards against the "destroyed-while-active" contract violation: if Swap reveals the cached tstate was current, do not Swap back to a now-deleted pointer (the safe-when-active fix b-pass requested for the old release_* functions, applied at the natural location instead). Lifetime contract is enforced by ordinary C++ scope: typical placement is `thread_local`. No new release/cleanup APIs are required. Tests cover (a) tstate identity preserved across activations on a thread, (b) transient and reusing modes do not share state, (c) different OS threads get distinct PyThreadStates, and (d) the multi-subinterpreter alternation case. 
Co-Authored-By: Claude Opus 4.7 --- docs/advanced/embedding.rst | 90 ++++--- include/pybind11/subinterpreter.h | 251 ++++++++++-------- tests/test_with_catch/test_subinterpreter.cpp | 117 +++++--- 3 files changed, 267 insertions(+), 191 deletions(-) diff --git a/docs/advanced/embedding.rst b/docs/advanced/embedding.rst index 29678480f3..9bd56889fc 100644 --- a/docs/advanced/embedding.rst +++ b/docs/advanced/embedding.rst @@ -345,65 +345,71 @@ Example: } -Reusing a thread state (cached activation) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Reusing a thread state across activations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -By default, when a thread activates a sub-interpreter it is not already running, -:class:`subinterpreter_scoped_activate` creates a fresh ``PyThreadState`` and -destroys it when the scope ends. A thread that repeatedly enters the same -sub-interpreter therefore allocates and frees a thread state every time, and -does **not** preserve any per-thread interpreter state between activations. +By default, :class:`subinterpreter_scoped_activate` creates a fresh +``PyThreadState`` on entry and destroys it on exit. A thread that repeatedly +re-enters the same sub-interpreter therefore allocates and frees a thread state +every time, and does **not** preserve any per-thread interpreter state between +activations. -For workloads that repeatedly re-enter the same sub-interpreter from the same OS -thread, you can opt into a cached mode by passing -:enum:`subinterpreter_thread_state::cached`: +For workloads where a single OS thread re-enters one or more sub-interpreters +many times, pybind11 provides :class:`subinterpreter_thread_state` — an RAII +object that owns a ``PyThreadState`` and lets you swap it in for the duration +of each :class:`subinterpreter_scoped_activate` scope without destroying it +between activations: .. code-block:: cpp + // Create the PyThreadState once. It is created in a "released" state: + // not current, no GIL acquired. 
+ thread_local py::subinterpreter_thread_state ts(sub); + { - // First activation on this OS thread creates a PyThreadState and caches it - // in thread-local storage, keyed by the target interpreter. - py::subinterpreter_scoped_activate guard( - sub, py::subinterpreter_thread_state::cached); + // Swap it in; the subinterpreter's GIL is acquired. + py::subinterpreter_scoped_activate guard(ts); // ... use the sub-interpreter ... } - // The PyThreadState is swapped out but NOT destroyed. + // Swap-out only; the PyThreadState is kept alive in `ts`. { - // Subsequent activations on the same OS thread reuse the cached - // PyThreadState (only a swap, no allocation) and preserve its - // per-thread interpreter state. - py::subinterpreter_scoped_activate guard( - sub, py::subinterpreter_thread_state::cached); + py::subinterpreter_scoped_activate guard(ts); + // The same PyThreadState is re-used; its per-thread interpreter state + // is preserved across activations. } -The default behavior is unchanged: the parameter defaults to -:enum:`subinterpreter_thread_state::transient`, and the cache is only consulted -when ``cached`` is explicitly requested. ``transient`` and ``cached`` -activations never share a thread state, even for the same interpreter on the -same thread. +This composes naturally with multiple sub-interpreters on the same OS thread: +hold one :class:`subinterpreter_thread_state` per sub-interpreter and alternate +between them. Each ``PyThreadState`` is independent and is preserved across +activations. + +.. code-block:: cpp + + thread_local py::subinterpreter_thread_state ts_a(sub_a); + thread_local py::subinterpreter_thread_state ts_b(sub_b); -Because pybind11 does not control thread creation or destruction, a cached -``PyThreadState`` is **not** destroyed automatically. 
The owning OS thread must -explicitly release it before the sub-interpreter is destroyed (and before that -thread exits, to avoid a leak): + { py::subinterpreter_scoped_activate guard(ts_a); /* in sub_a */ } + { py::subinterpreter_scoped_activate guard(ts_b); /* in sub_b */ } + { py::subinterpreter_scoped_activate guard(ts_a); /* same PyThreadState as before */ } -- :func:`subinterpreter::release_cached_thread_state` destroys the cached - thread state that the **calling** OS thread created for **that one** - sub-interpreter. -- :func:`subinterpreter::release_all_cached_thread_states` destroys every - cached thread state the **calling** OS thread created (for any - sub-interpreter); it is a convenient end-of-thread cleanup hook. +The default behavior is unchanged: the +:class:`subinterpreter_scoped_activate(subinterpreter const&)` overload still +creates and destroys a transient ``PyThreadState`` per scope, and it never +shares a thread state with any :class:`subinterpreter_thread_state` that may +also exist for the same sub-interpreter on the same thread. .. warning:: - Call the release functions on the same OS thread that activated the - sub-interpreter, while that sub-interpreter is still alive, and while no - :class:`subinterpreter_scoped_activate` scope using the cached state is - active on that thread. Destroying a cached ``PyThreadState`` whose - interpreter has already been finalized is undefined behavior. The cache is - per OS thread: a thread cannot release another thread's cached states, so - each worker thread is responsible for its own cleanup before it exits. + Lifetime and threading requirements for :class:`subinterpreter_thread_state`: + + - It must be constructed and destroyed on the **same OS thread**. A + ``PyThreadState`` is bound to its creating thread; deleting it on another + thread is undefined behavior. Holding the object as a ``thread_local`` + satisfies this automatically. + - It must be destroyed while its sub-interpreter is still alive. 
+ - It must **not** be destroyed while a :class:`subinterpreter_scoped_activate` + referring to it is alive — the activator holds a reference into it. GIL API for sub-interpreters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/include/pybind11/subinterpreter.h b/include/pybind11/subinterpreter.h index 359f03ed71..2b4a72552f 100644 --- a/include/pybind11/subinterpreter.h +++ b/include/pybind11/subinterpreter.h @@ -14,7 +14,6 @@ #include "gil.h" #include -#include #ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT # error "This platform does not support subinterpreters, do not include this file." @@ -22,52 +21,28 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) -PYBIND11_NAMESPACE_BEGIN(detail) - -/// OS-thread-local cache mapping a target interpreter to the PyThreadState that was created for -/// it *on the current OS thread*. Used by subinterpreter_scoped_activate when -/// subinterpreter_thread_state::cached is requested, so that repeatedly entering the same -/// interpreter from the same OS thread reuses one PyThreadState (swapped in/out) instead of -/// allocating and destroying a fresh one each time. -/// -/// The values are raw, non-owning pointers. At thread exit the map's destructor only frees its -/// own nodes; it deliberately does NOT touch the Python C API (which may be unusable at that -/// point), so a cached PyThreadState is leaked unless the owning thread first calls -/// subinterpreter::release_cached_thread_state() or -/// subinterpreter::release_all_cached_thread_states(). -inline std::unordered_map & -subinterpreter_thread_state_cache() { - thread_local std::unordered_map cache; - return cache; -} - -PYBIND11_NAMESPACE_END(detail) - -/// Selects how subinterpreter_scoped_activate obtains a PyThreadState when the calling OS thread -/// is not already running the target interpreter. -enum class subinterpreter_thread_state { - /// Default / legacy behavior: a fresh PyThreadState is created on activation and destroyed - /// when the scope exits. 
- transient, - /// Reuse (or, on first use, create-and-cache) a PyThreadState held in OS-thread-local - /// storage, keyed by the target interpreter. The scope only swaps it in and out and never - /// destroys it. The owning thread is responsible for eventually destroying it via - /// subinterpreter::release_cached_thread_state() / - /// subinterpreter::release_all_cached_thread_states(); see those functions for the - /// preconditions. - cached -}; - class subinterpreter; +class subinterpreter_thread_state; /// Activate the subinterpreter and acquire its GIL, while also releasing any GIL and interpreter /// currently held. Upon exiting the scope, the previous subinterpreter (if any) and its /// associated GIL are restored to their state as they were before the scope was entered. +/// +/// Two construction modes are supported: +/// +/// 1. `subinterpreter_scoped_activate(subinterpreter const &)`: +/// Transient mode (the default). A fresh PyThreadState is created on entry and destroyed on +/// exit. This is the established behavior; existing code is unaffected. +/// +/// 2. `subinterpreter_scoped_activate(subinterpreter_thread_state &)`: +/// Reuse mode. The PyThreadState owned by the given subinterpreter_thread_state is swapped +/// in on entry and swapped out (but NOT destroyed) on exit, so repeated activations on the +/// same OS thread reuse the same PyThreadState and preserve its per-thread interpreter state. +/// Use this when a single OS thread re-enters one or more subinterpreters many times. 
class subinterpreter_scoped_activate { public: - explicit subinterpreter_scoped_activate(subinterpreter const &si, - subinterpreter_thread_state ts_policy - = subinterpreter_thread_state::transient); + explicit subinterpreter_scoped_activate(subinterpreter const &si); + explicit subinterpreter_scoped_activate(subinterpreter_thread_state &ts); ~subinterpreter_scoped_activate(); subinterpreter_scoped_activate(subinterpreter_scoped_activate &&) = delete; @@ -80,9 +55,9 @@ class subinterpreter_scoped_activate { PyThreadState *tstate_ = nullptr; PyGILState_STATE gil_state_; bool simple_gil_ = false; - // When true, tstate_ is owned by the OS-thread-local cache and must NOT be destroyed when - // this scope exits (only swapped out). - bool cached_ = false; + // When true, tstate_ is owned by a subinterpreter_thread_state and must NOT be destroyed + // when this scope exits (only swapped out). + bool borrowed_ = false; }; /// Holds a Python subinterpreter instance @@ -258,26 +233,6 @@ class subinterpreter { /// Get the interpreter's state dict. This interpreter's GIL must be held before calling! dict state_dict() { return reinterpret_borrow<dict>(PyInterpreterState_GetDict(istate_)); } - /// Destroy the PyThreadState (if any) that subinterpreter_thread_state::cached created for - /// THIS interpreter on the CURRENT OS thread, and drop it from that thread's cache. - /// - /// Call this on the same OS thread that activated the interpreter, while this subinterpreter - /// is still alive, and while no subinterpreter_scoped_activate scope for it is active on this - /// thread. It is a no-op if this thread has no cached state for this interpreter. The caller - /// need not hold any GIL: the cached state is briefly swapped in (acquiring this interpreter's - /// GIL) to be cleared and deleted, then whatever was active before is restored.
- void release_cached_thread_state() const; - - /// Destroy every cached PyThreadState that was created on the CURRENT OS thread (for any - /// interpreter) and clear this thread's cache. Intended as an end-of-thread cleanup hook for - /// embedder worker threads. - /// - /// Every interpreter that still has a cached state on this thread MUST still be alive when - /// this is called (deleting a PyThreadState whose interpreter was already finalized is - /// undefined behavior). Must be called on the OS thread that owns the cache, with no - /// subinterpreter_scoped_activate scope using a cached state active on this thread. - static void release_all_cached_thread_states(); - /// abandon cleanup of this subinterpreter (leak it). this might be needed during /// finalization... void disarm() { creation_tstate_ = nullptr; } @@ -290,10 +245,71 @@ class subinterpreter { private: friend class subinterpreter_scoped_activate; + friend class subinterpreter_thread_state; PyInterpreterState *istate_ = nullptr; PyThreadState *creation_tstate_ = nullptr; }; +/// RAII wrapper that owns a PyThreadState bound to a specific subinterpreter on the OS thread +/// that constructed it. Intended to be held long-lived (e.g. as a `thread_local`, or inside a +/// per-thread struct) so that many subinterpreter_scoped_activate scopes on the same OS thread +/// can reuse a single PyThreadState instead of creating and destroying one each time. +/// +/// The PyThreadState is created on construction in a *released* state: it is NOT made current, +/// and no GIL is acquired. Activation is the job of subinterpreter_scoped_activate. +/// +/// A single OS thread can hold one of these per subinterpreter and alternate between them via +/// subinterpreter_scoped_activate without churning PyThreadState objects. 
+/// +/// Lifetime / threading requirements: +/// +/// - Construction and destruction must happen on the SAME OS thread (a PyThreadState is bound +/// to the OS thread that created it; deleting it on a different thread is undefined behavior). +/// - The owning subinterpreter must still be alive when this object is destroyed. +/// - This object must NOT be destroyed while a subinterpreter_scoped_activate referring to it is +/// still alive (the activator holds a reference into it). +/// +/// Typical usage: +/// +/// @code +/// thread_local py::subinterpreter_thread_state ts(sub); +/// { +/// py::subinterpreter_scoped_activate guard(ts); // swap-in only +/// // ... use the subinterpreter ... +/// } // swap-out, tstate kept alive +/// { +/// py::subinterpreter_scoped_activate guard(ts); // reuses the same PyThreadState +/// // ... +/// } +/// @endcode +class subinterpreter_thread_state { +public: + /// Create a PyThreadState for `si` on the calling OS thread. The new state is left in a + /// released state (not current, no GIL acquired). + explicit subinterpreter_thread_state(subinterpreter const &si); + + /// Destroy the owned PyThreadState. Must run on the same OS thread that constructed this + /// object, while the owning subinterpreter is still alive, and while no + /// subinterpreter_scoped_activate referring to this object is alive. + ~subinterpreter_thread_state(); + + subinterpreter_thread_state(subinterpreter_thread_state const &) = delete; + subinterpreter_thread_state(subinterpreter_thread_state &&) = delete; + subinterpreter_thread_state &operator=(subinterpreter_thread_state const &) = delete; + subinterpreter_thread_state &operator=(subinterpreter_thread_state &&) = delete; + + /// The interpreter this thread state belongs to. + PyInterpreterState *interpreter_state() const { return istate_; } + + /// The owned PyThreadState pointer; valid for the lifetime of this object. 
+ PyThreadState *raw_thread_state() const { return tstate_; } + +private: + friend class subinterpreter_scoped_activate; + PyThreadState *tstate_ = nullptr; + PyInterpreterState *istate_ = nullptr; +}; + class scoped_subinterpreter { public: scoped_subinterpreter() : si_(subinterpreter::create()), scope_(si_) {} @@ -306,8 +322,9 @@ class scoped_subinterpreter { subinterpreter_scoped_activate scope_; }; -inline subinterpreter_scoped_activate::subinterpreter_scoped_activate( - subinterpreter const &si, subinterpreter_thread_state ts_policy) { +// --- subinterpreter_scoped_activate ----------------------------------------------------------- + +inline subinterpreter_scoped_activate::subinterpreter_scoped_activate(subinterpreter const &si) { if (!si.istate_) { pybind11_fail("null subinterpreter"); } @@ -319,26 +336,37 @@ inline subinterpreter_scoped_activate::subinterpreter_scoped_activate( return; } - if (ts_policy == subinterpreter_thread_state::cached) { - // Reuse a PyThreadState held in this OS thread's cache, or create one and cache it. - // This preserves PyThreadState identity (and its per-thread interpreter state) across - // repeated activations of the same interpreter from the same OS thread, instead of - // creating and destroying a fresh state every time. 
- auto &cache = detail::subinterpreter_thread_state_cache(); - auto it = cache.find(si.istate_); - if (it != cache.end()) { - tstate_ = it->second; - } else { - tstate_ = PyThreadState_New(si.istate_); - cache.emplace(si.istate_, tstate_); - } - cached_ = true; - } else { - // we can't really interact with the interpreter at all until we switch to it - // not even to, for example, look in its state dict or touch its internals - tstate_ = PyThreadState_New(si.istate_); + // we can't really interact with the interpreter at all until we switch to it + // not even to, for example, look in its state dict or touch its internals + tstate_ = PyThreadState_New(si.istate_); + + // make the interpreter active and acquire the GIL + old_tstate_ = PyThreadState_Swap(tstate_); + + // save this in internals for scoped_gil calls (see also: PR #5870) + detail::get_internals().tstate = tstate_; +} + +inline subinterpreter_scoped_activate::subinterpreter_scoped_activate( + subinterpreter_thread_state &ts) { + if (ts.tstate_ == nullptr) { + pybind11_fail("subinterpreter_scoped_activate: empty subinterpreter_thread_state"); + } + + if (detail::get_interpreter_state_unchecked() == ts.istate_) { + // We are already on this interpreter -- e.g. nested activation, or a different + // PyThreadState for the same interpreter is already current on this thread. Match the + // fast path of the (subinterpreter const&) overload: just ensure the GIL is held. The + // `ts` argument's PyThreadState is intentionally NOT swapped to here; the already-current + // tstate keeps being used until the outer scope exits. 
+ simple_gil_ = true; + gil_state_ = PyGILState_Ensure(); + return; } + tstate_ = ts.tstate_; + borrowed_ = true; + // make the interpreter active and acquire the GIL old_tstate_ = PyThreadState_Swap(tstate_); @@ -358,12 +386,12 @@ inline subinterpreter_scoped_activate::~subinterpreter_scoped_activate() { } #endif detail::get_internals().tstate.reset(); - if (!cached_) { + if (!borrowed_) { PyThreadState_Clear(tstate_); PyThreadState_DeleteCurrent(); } - // When cached_, tstate_ stays alive in the OS-thread-local cache for reuse; the - // PyThreadState_Swap below merely detaches it from this thread. + // When borrowed_, tstate_ stays alive in its owning subinterpreter_thread_state for + // reuse; the PyThreadState_Swap below merely detaches it from this thread. } // Go back the previous interpreter (if any) and acquire THAT gil @@ -371,38 +399,37 @@ inline subinterpreter_scoped_activate::~subinterpreter_scoped_activate() { } } -inline void subinterpreter::release_cached_thread_state() const { - if (istate_ == nullptr) { - return; +// --- subinterpreter_thread_state -------------------------------------------------------------- + +inline subinterpreter_thread_state::subinterpreter_thread_state(subinterpreter const &si) { + if (!si.istate_) { + pybind11_fail("subinterpreter_thread_state: null subinterpreter"); } - auto &cache = detail::subinterpreter_thread_state_cache(); - auto it = cache.find(istate_); - if (it == cache.end()) { - return; + istate_ = si.istate_; + // PyThreadState_New does not require holding any GIL and does not make the new state current. + tstate_ = PyThreadState_New(istate_); + if (tstate_ == nullptr) { + pybind11_fail("subinterpreter_thread_state: PyThreadState_New returned null"); } - PyThreadState *cached = it->second; - cache.erase(it); - - // Make the cached state current (acquiring this interpreter's GIL) so it can be cleared and - // destroyed on the OS thread that created it, then restore whatever was active before. 
- PyThreadState *prev = PyThreadState_Swap(cached); - PyThreadState_Clear(cached); - PyThreadState_DeleteCurrent(); - PyThreadState_Swap(prev); } -inline void subinterpreter::release_all_cached_thread_states() { - auto &cache = detail::subinterpreter_thread_state_cache(); - for (auto const &entry : cache) { - PyThreadState *cached = entry.second; - // prev is the state active before this swap; it is restored after each deletion, so it is - // never one of the cached states being destroyed here. - PyThreadState *prev = PyThreadState_Swap(cached); - PyThreadState_Clear(cached); - PyThreadState_DeleteCurrent(); +inline subinterpreter_thread_state::~subinterpreter_thread_state() { + if (tstate_ == nullptr) { + return; + } + // The PyThreadState must be made current to be cleared and deleted on the owning OS thread. + // Swap it in (which acquires the subinterpreter's GIL), clear+delete, then restore whatever + // was active before. + PyThreadState *prev = PyThreadState_Swap(tstate_); + PyThreadState_Clear(tstate_); + PyThreadState_DeleteCurrent(); + // If `prev` is tstate_ itself, the user destroyed this object while it was active via a + // subinterpreter_scoped_activate -- a contract violation, but be defensive: do NOT swap back + // to a now-deleted pointer. Leaving the thread with no current interpreter is consistent + // with the cached state having just been destroyed. 
+ if (prev != nullptr && prev != tstate_) { PyThreadState_Swap(prev); } - cache.clear(); } PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 56415ec497..603d8a2920 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -153,56 +153,99 @@ TEST_CASE("Move Subinterpreter") { } # endif -TEST_CASE("Cached Subinterpreter thread state") { +TEST_CASE("Reused Subinterpreter thread state (single interpreter)") { py::subinterpreter sub = py::subinterpreter::create(); - PyThreadState *main_a = nullptr; - PyThreadState *main_b = nullptr; + PyThreadState *first = nullptr; + PyThreadState *second = nullptr; PyThreadState *transient_ts = nullptr; PyThreadState *worker_ts = nullptr; { - py::subinterpreter_scoped_activate a(sub, py::subinterpreter_thread_state::cached); - main_a = PyThreadState_Get(); - py::list(py::module_::import("sys").attr("path")).append(py::str(".")); - } - { - py::subinterpreter_scoped_activate a(sub, py::subinterpreter_thread_state::cached); - main_b = PyThreadState_Get(); + py::subinterpreter_thread_state ts(sub); + + { + py::subinterpreter_scoped_activate guard(ts); + first = PyThreadState_Get(); + py::list(py::module_::import("sys").attr("path")).append(py::str(".")); + } + { + py::subinterpreter_scoped_activate guard(ts); + second = PyThreadState_Get(); + } + + // Same OS thread + same subinterpreter_thread_state => the PyThreadState is reused. + REQUIRE(first != nullptr); + REQUIRE(first == second); + + // The (subinterpreter const&) ctor does not share with the reusable tstate: while `ts` + // is still alive, a transient activation gets a distinct PyThreadState. 
+ { + py::subinterpreter_scoped_activate guard(sub); + transient_ts = PyThreadState_Get(); + } + REQUIRE(transient_ts != first); + + // A different OS thread holds its own subinterpreter_thread_state (both alive + // concurrently => distinct PyThreadState pointers). + { + py::gil_scoped_release nogil; + std::thread([&]() { + py::subinterpreter_thread_state worker_ts_owner(sub); + py::subinterpreter_scoped_activate guard(worker_ts_owner); + worker_ts = PyThreadState_Get(); + // worker_ts_owner is destroyed at scope exit, on the same OS thread that + // constructed it. + }).join(); + } + REQUIRE(worker_ts != nullptr); + REQUIRE(worker_ts != first); + + // ts is still alive here; it will be destructed at the end of this block on this same + // OS thread, deleting its PyThreadState. } - // Same OS thread + same interpreter + cached policy => the PyThreadState is reused. - REQUIRE(main_a != nullptr); - REQUIRE(main_a == main_b); + unsafe_reset_internals_for_single_interpreter(); +} + +TEST_CASE("Reused Subinterpreter thread state (multiple interpreters)") { + // The core multi-subinterpreter use case: one OS thread alternates between two + // subinterpreters and each PyThreadState is preserved across activations. + py::subinterpreter sub_a = py::subinterpreter::create(); + py::subinterpreter sub_b = py::subinterpreter::create(); + + py::subinterpreter_thread_state ts_a(sub_a); + py::subinterpreter_thread_state ts_b(sub_b); + + PyThreadState *a1 = nullptr; + PyThreadState *a2 = nullptr; + PyThreadState *b1 = nullptr; + PyThreadState *b2 = nullptr; - // The default (transient) policy must not reuse the cached state: while the cached state is - // still alive, a transient activation gets a distinct PyThreadState. 
{ - py::subinterpreter_scoped_activate a(sub); - transient_ts = PyThreadState_Get(); + py::subinterpreter_scoped_activate guard(ts_a); + a1 = PyThreadState_Get(); } - REQUIRE(transient_ts != main_a); - - // A different OS thread gets its own cached state (both alive => distinct pointers). { - py::gil_scoped_release nogil; - std::thread([&]() { - { - py::subinterpreter_scoped_activate a(sub, py::subinterpreter_thread_state::cached); - worker_ts = PyThreadState_Get(); - } - // The owning thread releases its own cached state before exiting. - sub.release_cached_thread_state(); - }).join(); + py::subinterpreter_scoped_activate guard(ts_b); + b1 = PyThreadState_Get(); + } + { + py::subinterpreter_scoped_activate guard(ts_a); + a2 = PyThreadState_Get(); + } + { + py::subinterpreter_scoped_activate guard(ts_b); + b2 = PyThreadState_Get(); } - REQUIRE(worker_ts != nullptr); - REQUIRE(worker_ts != main_a); - - // Release this thread's cached state; a second release is a safe no-op, and release_all on a - // now-empty cache must not crash. - sub.release_cached_thread_state(); - sub.release_cached_thread_state(); - py::subinterpreter::release_all_cached_thread_states(); + + REQUIRE(a1 != nullptr); + REQUIRE(b1 != nullptr); + // Identity is preserved across activations for each interpreter independently. + REQUIRE(a1 == a2); + REQUIRE(b1 == b2); + // And the two interpreters have distinct thread states (both alive => reliable comparison). 
+ REQUIRE(a1 != b1); unsafe_reset_internals_for_single_interpreter(); } From bf11628a85c683dff64a00607fa2d3eceb3b8368 Mon Sep 17 00:00:00 2001 From: Yongming Wang Date: Sun, 24 May 2026 23:20:54 -0700 Subject: [PATCH 4/5] fix(subinterpreter): address review on #6073 (same-thread checks, test scoping) Per @b-pass's review: - ~subinterpreter_thread_state(): add a PYBIND11_DETAILED_ERROR_MESSAGES- guarded check that destruction happens on the OS thread that created the PyThreadState (same PyThread_get_thread_native_id pattern as ~subinterpreter), failing with pybind11_fail otherwise. - subinterpreter_scoped_activate(subinterpreter_thread_state &): add the matching DETAILED_ERROR_MESSAGES check that activation happens on the creating OS thread, enforcing the newly documented rule. - docs: document that activating a subinterpreter_thread_state on another OS thread is illegal. - tests: keep each subinterpreter (and its subinterpreter_thread_state) in an enclosing scope so destruction order is thread-state -> subinterpreter -> unsafe_reset_internals_for_single_interpreter(). The previous top-level declarations ran the reset while the subinterpreters were still alive, which is the likely cause of the CI crashes. Co-Authored-By: Claude Opus 4.7 --- docs/advanced/embedding.rst | 3 + include/pybind11/subinterpreter.h | 27 ++++ tests/test_with_catch/test_subinterpreter.cpp | 142 ++++++++++-------- 3 files changed, 107 insertions(+), 65 deletions(-) diff --git a/docs/advanced/embedding.rst b/docs/advanced/embedding.rst index 9bd56889fc..563f97c47c 100644 --- a/docs/advanced/embedding.rst +++ b/docs/advanced/embedding.rst @@ -407,6 +407,9 @@ also exist for the same sub-interpreter on the same thread. ``PyThreadState`` is bound to its creating thread; deleting it on another thread is undefined behavior. Holding the object as a ``thread_local`` satisfies this automatically. 
+ - It must only be activated (with :class:`subinterpreter_scoped_activate`) + on the **same OS thread** that constructed it. Activating it on a + different thread is illegal. - It must be destroyed while its sub-interpreter is still alive. - It must **not** be destroyed while a :class:`subinterpreter_scoped_activate` referring to it is alive — the activator holds a reference into it. diff --git a/include/pybind11/subinterpreter.h b/include/pybind11/subinterpreter.h index 2b4a72552f..def101ff62 100644 --- a/include/pybind11/subinterpreter.h +++ b/include/pybind11/subinterpreter.h @@ -364,6 +364,20 @@ inline subinterpreter_scoped_activate::subinterpreter_scoped_activate( return; } +#if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + { + // A PyThreadState is bound to its creating OS thread; it may only be activated there. + bool same_thread = true; +# ifdef PY_HAVE_THREAD_NATIVE_ID + same_thread = PyThread_get_thread_native_id() == ts.tstate_->native_thread_id; +# endif + if (!same_thread) { + pybind11_fail("subinterpreter_scoped_activate: a subinterpreter_thread_state must be " + "activated on the same OS thread that constructed it!"); + } + } +#endif + tstate_ = ts.tstate_; borrowed_ = true; @@ -417,6 +431,19 @@ inline subinterpreter_thread_state::~subinterpreter_thread_state() { if (tstate_ == nullptr) { return; } +#if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + { + // A PyThreadState must be cleared and deleted on the OS thread that created it. + bool same_thread = true; +# ifdef PY_HAVE_THREAD_NATIVE_ID + same_thread = PyThread_get_thread_native_id() == tstate_->native_thread_id; +# endif + if (!same_thread) { + pybind11_fail("~subinterpreter_thread_state: must be destroyed on the same OS thread " + "that constructed it!"); + } + } +#endif // The PyThreadState must be made current to be cleared and deleted on the owning OS thread. // Swap it in (which acquires the subinterpreter's GIL), clear+delete, then restore whatever // was active before. 
diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 603d8a2920..3af100f2a9 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -154,55 +154,61 @@ TEST_CASE("Move Subinterpreter") { # endif TEST_CASE("Reused Subinterpreter thread state (single interpreter)") { - py::subinterpreter sub = py::subinterpreter::create(); - PyThreadState *first = nullptr; PyThreadState *second = nullptr; PyThreadState *transient_ts = nullptr; PyThreadState *worker_ts = nullptr; + // The subinterpreter is kept in this enclosing scope so that every + // subinterpreter_thread_state is destroyed first, then the subinterpreter, and only then + // unsafe_reset_internals_for_single_interpreter() runs (after the scope closes). { - py::subinterpreter_thread_state ts(sub); + py::subinterpreter sub = py::subinterpreter::create(); { - py::subinterpreter_scoped_activate guard(ts); - first = PyThreadState_Get(); - py::list(py::module_::import("sys").attr("path")).append(py::str(".")); - } - { - py::subinterpreter_scoped_activate guard(ts); - second = PyThreadState_Get(); - } + py::subinterpreter_thread_state ts(sub); - // Same OS thread + same subinterpreter_thread_state => the PyThreadState is reused. - REQUIRE(first != nullptr); - REQUIRE(first == second); + { + py::subinterpreter_scoped_activate guard(ts); + first = PyThreadState_Get(); + py::list(py::module_::import("sys").attr("path")).append(py::str(".")); + } + { + py::subinterpreter_scoped_activate guard(ts); + second = PyThreadState_Get(); + } - // The (subinterpreter const&) ctor does not share with the reusable tstate: while `ts` - // is still alive, a transient activation gets a distinct PyThreadState. - { - py::subinterpreter_scoped_activate guard(sub); - transient_ts = PyThreadState_Get(); - } - REQUIRE(transient_ts != first); + // Same OS thread + same subinterpreter_thread_state => the PyThreadState is reused. 
+ REQUIRE(first != nullptr); + REQUIRE(first == second); - // A different OS thread holds its own subinterpreter_thread_state (both alive - // concurrently => distinct PyThreadState pointers). - { - py::gil_scoped_release nogil; - std::thread([&]() { - py::subinterpreter_thread_state worker_ts_owner(sub); - py::subinterpreter_scoped_activate guard(worker_ts_owner); - worker_ts = PyThreadState_Get(); - // worker_ts_owner is destroyed at scope exit, on the same OS thread that - // constructed it. - }).join(); - } - REQUIRE(worker_ts != nullptr); - REQUIRE(worker_ts != first); + // The (subinterpreter const&) ctor does not share with the reusable tstate: while + // `ts` is still alive, a transient activation gets a distinct PyThreadState. + { + py::subinterpreter_scoped_activate guard(sub); + transient_ts = PyThreadState_Get(); + } + REQUIRE(transient_ts != first); + + // A different OS thread holds its own subinterpreter_thread_state (both alive + // concurrently => distinct PyThreadState pointers). + { + py::gil_scoped_release nogil; + std::thread([&]() { + py::subinterpreter_thread_state worker_ts_owner(sub); + py::subinterpreter_scoped_activate guard(worker_ts_owner); + worker_ts = PyThreadState_Get(); + // worker_ts_owner is destroyed at scope exit, on the same OS thread that + // constructed it. + }).join(); + } + REQUIRE(worker_ts != nullptr); + REQUIRE(worker_ts != first); - // ts is still alive here; it will be destructed at the end of this block on this same - // OS thread, deleting its PyThreadState. + // ts is destructed at the end of this block on this same OS thread (deleting its + // PyThreadState), while `sub` is still alive. + } + // sub is destructed at the end of this block. 
} unsafe_reset_internals_for_single_interpreter(); @@ -211,41 +217,47 @@ TEST_CASE("Reused Subinterpreter thread state (single interpreter)") { TEST_CASE("Reused Subinterpreter thread state (multiple interpreters)") { // The core multi-subinterpreter use case: one OS thread alternates between two // subinterpreters and each PyThreadState is preserved across activations. - py::subinterpreter sub_a = py::subinterpreter::create(); - py::subinterpreter sub_b = py::subinterpreter::create(); - - py::subinterpreter_thread_state ts_a(sub_a); - py::subinterpreter_thread_state ts_b(sub_b); - PyThreadState *a1 = nullptr; PyThreadState *a2 = nullptr; PyThreadState *b1 = nullptr; PyThreadState *b2 = nullptr; + // Everything is kept in this enclosing scope. Destruction order at the closing brace is + // ts_b, ts_a, sub_b, sub_a -- i.e. each subinterpreter_thread_state is destroyed before its + // subinterpreter -- and unsafe_reset_internals_for_single_interpreter() only runs afterwards. { - py::subinterpreter_scoped_activate guard(ts_a); - a1 = PyThreadState_Get(); - } - { - py::subinterpreter_scoped_activate guard(ts_b); - b1 = PyThreadState_Get(); - } - { - py::subinterpreter_scoped_activate guard(ts_a); - a2 = PyThreadState_Get(); - } - { - py::subinterpreter_scoped_activate guard(ts_b); - b2 = PyThreadState_Get(); - } + py::subinterpreter sub_a = py::subinterpreter::create(); + py::subinterpreter sub_b = py::subinterpreter::create(); + + py::subinterpreter_thread_state ts_a(sub_a); + py::subinterpreter_thread_state ts_b(sub_b); - REQUIRE(a1 != nullptr); - REQUIRE(b1 != nullptr); - // Identity is preserved across activations for each interpreter independently. - REQUIRE(a1 == a2); - REQUIRE(b1 == b2); - // And the two interpreters have distinct thread states (both alive => reliable comparison). 
- REQUIRE(a1 != b1); + { + py::subinterpreter_scoped_activate guard(ts_a); + a1 = PyThreadState_Get(); + } + { + py::subinterpreter_scoped_activate guard(ts_b); + b1 = PyThreadState_Get(); + } + { + py::subinterpreter_scoped_activate guard(ts_a); + a2 = PyThreadState_Get(); + } + { + py::subinterpreter_scoped_activate guard(ts_b); + b2 = PyThreadState_Get(); + } + + REQUIRE(a1 != nullptr); + REQUIRE(b1 != nullptr); + // Identity is preserved across activations for each interpreter independently. + REQUIRE(a1 == a2); + REQUIRE(b1 == b2); + // And the two interpreters have distinct thread states (both alive => reliable + // comparison). + REQUIRE(a1 != b1); + } unsafe_reset_internals_for_single_interpreter(); } From 63f88c32d15c617d5be6c4cd27d9788a4a6fd120 Mon Sep 17 00:00:00 2001 From: Yongming Wang Date: Mon, 25 May 2026 00:19:07 -0700 Subject: [PATCH 5/5] docs: fix codespell (re-used -> reused) in embedding.rst Co-Authored-By: Claude Opus 4.7 --- docs/advanced/embedding.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/embedding.rst b/docs/advanced/embedding.rst index 563f97c47c..46a1c457c6 100644 --- a/docs/advanced/embedding.rst +++ b/docs/advanced/embedding.rst @@ -375,7 +375,7 @@ between activations: { py::subinterpreter_scoped_activate guard(ts); - // The same PyThreadState is re-used; its per-thread interpreter state + // The same PyThreadState is reused; its per-thread interpreter state // is preserved across activations. }