From c32687360e26eec21f1f5ec0d4fc6f454109b9fd Mon Sep 17 00:00:00 2001
From: William Candillon <wcandillon@gmail.com>
Date: Tue, 2 Jun 2026 16:11:57 +0200
Subject: [PATCH 1/4] :wrench:

---
 README.md                                     | 10 +---
 apps/example/ios/Podfile.lock                 |  4 +-
 apps/example/src/CanvasAPI/CanvasAPI.tsx      |  2 -
 apps/example/src/ComputeToys/engine/index.ts  |  1 -
 .../ImportExternalTexture.tsx                 |  1 -
 apps/example/src/Reanimated/Reanimated.tsx    |  1 -
 .../SharedTextureMemory.tsx                   |  1 -
 .../StorageBufferVertices.tsx                 |  2 -
 apps/example/src/ThreeJS/Backdrop.tsx         |  1 -
 apps/example/src/ThreeJS/Cube.tsx             |  1 -
 apps/example/src/ThreeJS/Helmet.tsx           |  1 -
 apps/example/src/ThreeJS/InstancedMesh.tsx    |  1 -
 apps/example/src/ThreeJS/PostProcessing.tsx   |  1 -
 apps/example/src/ThreeJS/Retargeting.tsx      |  1 -
 .../src/ThreeJS/components/FiberCanvas.tsx    |  1 -
 apps/example/src/Triangle/HelloTriangle.tsx   |  2 -
 .../src/Triangle/HelloTriangleMSAA.tsx        |  1 -
 .../example/src/VisionCamera/VisionCamera.tsx |  1 -
 apps/example/src/components/Texture.tsx       |  1 -
 apps/example/src/components/useWebGPU.ts      |  1 -
 docs/refactor-async-present-plan.md           | 46 +++++++++++++++-
 packages/webgpu/README.md                     | 10 +---
 packages/webgpu/android/CMakeLists.txt        |  1 +
 packages/webgpu/android/cpp/cpp-adapter.cpp   | 54 +++++++++++++++++++
 packages/webgpu/apple/MetalView.mm            |  4 ++
 packages/webgpu/apple/WebGPUModule.mm         |  7 +++
 packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h  | 29 +++++++++-
 packages/webgpu/cpp/rnwgpu/api/GPU.h          |  1 +
 .../cpp/rnwgpu/api/GPUCanvasContext.cpp       | 35 ++++++------
 .../webgpu/cpp/rnwgpu/api/GPUCanvasContext.h  |  5 +-
 packages/webgpu/src/Canvas.tsx                |  6 +--
 packages/webgpu/src/Offscreen.ts              |  4 --
 packages/webgpu/src/WebPolyfillGPUModule.ts   |  5 +-
 packages/webgpu/src/types.ts                  |  6 +--
 34 files changed, 172 insertions(+), 76 deletions(-)

diff --git a/README.md b/README.md
index 8eeb1cba1..d7415053b 100644
--- a/README.md
+++ b/README.md
@@ -128,8 +128,6 @@ export function HelloTriangle() {
       passEncoder.end();
 
       device.queue.submit([commandEncoder.finish()]);
-
-      context.present();
     };
     helloTriangle();
   }, [ref]);
@@ -174,15 +172,13 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-In React Native, we want to keep frame presentation as a manual operation as we plan to provide more advanced rendering options that are React Native specific.  
-This means that when you are ready to present a frame, you need to call `present` on the context.
+Frame presentation is automatic. Once you acquire the frame's texture with `context.getCurrentTexture()` and submit your commands, the frame is presented on the next display refresh (driven by a global vsync source: `CADisplayLink` on iOS, `Choreographer` on Android). There is no `present()` call.
 
 ```tsx
 // draw
 // submit to the queue
 device.queue.submit([commandEncoder.finish()]);
-// This method is React Native only
-context.present();
+// The frame is presented automatically on the next vsync.
 ```
 
 ### Canvas Transparency
@@ -296,7 +292,6 @@ const render = () => {
 
   // Release the surface's access window right after the submit that sampled it.
   externalTexture.destroy();
-  context.present();
 };
 ```
 
@@ -328,7 +323,6 @@ const renderFrame = (device: GPUDevice, context: GPUCanvasContext) => {
   const commandEncoder = device.createCommandEncoder();
   // ... render ...
   device.queue.submit([commandEncoder.finish()]);
-  context.present();
 };
 
 // Initialize WebGPU on main thread, then run on UI thread
diff --git a/apps/example/ios/Podfile.lock b/apps/example/ios/Podfile.lock
index fd5ba968c..b4c5f158a 100644
--- a/apps/example/ios/Podfile.lock
+++ b/apps/example/ios/Podfile.lock
@@ -1924,7 +1924,7 @@ PODS:
     - ReactCommon/turbomodule/core
     - SocketRocket
     - Yoga
-  - react-native-wgpu (0.5.12):
+  - react-native-wgpu (0.5.13):
     - boost
     - DoubleConversion
     - fast_float
@@ -3074,7 +3074,7 @@ SPEC CHECKSUMS:
   React-microtasksnativemodule: 75b6604b667d297292345302cc5bfb6b6aeccc1b
   react-native-safe-area-context: c00143b4823773bba23f2f19f85663ae89ceb460
   react-native-skia: fc73e9bdc46ebb420a98c9c2be29fee80f565e79
-  react-native-wgpu: 274ffec11ee3a082260d9f3d1fb54030a5ca0873
+  react-native-wgpu: 0496e9efeb4c3939ab56371005ede4e1468591d1
   React-NativeModulesApple: 879fbdc5dcff7136abceb7880fe8a2022a1bd7c3
   React-oscompat: 93b5535ea7f7dff46aaee4f78309a70979bdde9d
   React-perflogger: 5536d2df3d18fe0920263466f7b46a56351c0510
diff --git a/apps/example/src/CanvasAPI/CanvasAPI.tsx b/apps/example/src/CanvasAPI/CanvasAPI.tsx
index a9f5c4928..a403c8388 100644
--- a/apps/example/src/CanvasAPI/CanvasAPI.tsx
+++ b/apps/example/src/CanvasAPI/CanvasAPI.tsx
@@ -89,8 +89,6 @@ export const CanvasAPI = () => {
             passEncoder.end();
 
             device.queue.submit([commandEncoder.finish()]);
-
-            context.present();
           })()
         }
         title="check surface"
diff --git a/apps/example/src/ComputeToys/engine/index.ts b/apps/example/src/ComputeToys/engine/index.ts
index f0fa08f07..8db2562ad 100644
--- a/apps/example/src/ComputeToys/engine/index.ts
+++ b/apps/example/src/ComputeToys/engine/index.ts
@@ -398,7 +398,6 @@ fn passSampleLevelBilinearRepeat(pass_index: int, uv: float2, lod: float) -> flo
 
       // Submit command buffer
       this.device.queue.submit([encoder.finish()]);
-      this.surface!.present();
 
       // Update frame counter
       this.bindings!.time.host.frame += 1;
diff --git a/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx b/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
index f8399ee8a..7c973e03f 100644
--- a/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
+++ b/apps/example/src/ImportExternalTexture/ImportExternalTexture.tsx
@@ -247,7 +247,6 @@ export const ImportExternalTexture = () => {
       // Now that the work sampling it has been submitted, end the external
       // texture's access window so the frame's surface is released promptly.
       externalTex?.destroy();
-      context.present();
       rafRef.current = requestAnimationFrame(render);
     };
     rafRef.current = requestAnimationFrame(render);
diff --git a/apps/example/src/Reanimated/Reanimated.tsx b/apps/example/src/Reanimated/Reanimated.tsx
index 505296565..2f8b5e5cb 100644
--- a/apps/example/src/Reanimated/Reanimated.tsx
+++ b/apps/example/src/Reanimated/Reanimated.tsx
@@ -79,7 +79,6 @@ export const webGPUDemo = (
 
     device.queue.submit([commandEncoder.finish()]);
 
-    context.present();
     if (runAnimation.value) {
       requestAnimationFrame(frame);
     }
diff --git a/apps/example/src/SharedTextureMemory/SharedTextureMemory.tsx b/apps/example/src/SharedTextureMemory/SharedTextureMemory.tsx
index b5627cc43..197657460 100644
--- a/apps/example/src/SharedTextureMemory/SharedTextureMemory.tsx
+++ b/apps/example/src/SharedTextureMemory/SharedTextureMemory.tsx
@@ -268,7 +268,6 @@ export const SharedTextureMemory = () => {
       }
       pass.end();
       device.queue.submit([encoder.finish()]);
-      context.present();
       rafRef.current = requestAnimationFrame(render);
     };
     rafRef.current = requestAnimationFrame(render);
diff --git a/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx b/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
index 907264638..b1906cf74 100644
--- a/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
+++ b/apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
@@ -185,8 +185,6 @@ export function StorageBufferVertices() {
 
     const commandBuffer = encoder.finish();
     device.queue.submit([commandBuffer]);
-    // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    (context as any).present();
   });
 
   return (
diff --git a/apps/example/src/ThreeJS/Backdrop.tsx b/apps/example/src/ThreeJS/Backdrop.tsx
index 8ed2a8c91..113325b9d 100644
--- a/apps/example/src/ThreeJS/Backdrop.tsx
+++ b/apps/example/src/ThreeJS/Backdrop.tsx
@@ -150,7 +150,6 @@ export const Backdrop = () => {
       }
 
       renderer.render(scene, camera);
-      context!.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/Cube.tsx b/apps/example/src/ThreeJS/Cube.tsx
index d3e9707b5..ea3fe0f23 100644
--- a/apps/example/src/ThreeJS/Cube.tsx
+++ b/apps/example/src/ThreeJS/Cube.tsx
@@ -31,7 +31,6 @@ export const Cube = () => {
       mesh.rotation.y = time / 1000;
 
       renderer.render(scene, camera);
-      context.present();
     }
     renderer.setAnimationLoop(animate);
     return () => {
diff --git a/apps/example/src/ThreeJS/Helmet.tsx b/apps/example/src/ThreeJS/Helmet.tsx
index be7cb626f..70720d360 100644
--- a/apps/example/src/ThreeJS/Helmet.tsx
+++ b/apps/example/src/ThreeJS/Helmet.tsx
@@ -49,7 +49,6 @@ export const Helmet = () => {
     function animate() {
       animateCamera();
       renderer.render(scene, camera);
-      context!.present();
     }
 
     return () => {
diff --git a/apps/example/src/ThreeJS/InstancedMesh.tsx b/apps/example/src/ThreeJS/InstancedMesh.tsx
index 3f60631de..5b7c7ca4d 100644
--- a/apps/example/src/ThreeJS/InstancedMesh.tsx
+++ b/apps/example/src/ThreeJS/InstancedMesh.tsx
@@ -59,7 +59,6 @@ export const InstancedMesh = () => {
 
     function animate() {
       render();
-      context!.present();
     }
 
     function render() {
diff --git a/apps/example/src/ThreeJS/PostProcessing.tsx b/apps/example/src/ThreeJS/PostProcessing.tsx
index d94ef1728..0c2980501 100644
--- a/apps/example/src/ThreeJS/PostProcessing.tsx
+++ b/apps/example/src/ThreeJS/PostProcessing.tsx
@@ -72,7 +72,6 @@ export const PostProcessing = () => {
         mixer.update(delta);
       }
       postProcessing.render();
-      context!.present();
     }
     return () => {
       renderer.setAnimationLoop(null);
diff --git a/apps/example/src/ThreeJS/Retargeting.tsx b/apps/example/src/ThreeJS/Retargeting.tsx
index c25601885..8b8dd9a29 100644
--- a/apps/example/src/ThreeJS/Retargeting.tsx
+++ b/apps/example/src/ThreeJS/Retargeting.tsx
@@ -302,7 +302,6 @@ export const Retargeting = () => {
       source.mixer.update(delta);
       mixer.update(delta);
       renderer.render(scene, camera);
-      context.present();
     });
 
     return () => {
diff --git a/apps/example/src/ThreeJS/components/FiberCanvas.tsx b/apps/example/src/ThreeJS/components/FiberCanvas.tsx
index 91b699553..92b928987 100644
--- a/apps/example/src/ThreeJS/components/FiberCanvas.tsx
+++ b/apps/example/src/ThreeJS/components/FiberCanvas.tsx
@@ -66,7 +66,6 @@ export const FiberCanvas = ({
         const renderFrame = state.gl.render.bind(state.gl);
         state.gl.render = (s: THREE.Scene, c: THREE.Camera) => {
           renderFrame(s, c);
-          context?.present();
         };
       },
     });
diff --git a/apps/example/src/Triangle/HelloTriangle.tsx b/apps/example/src/Triangle/HelloTriangle.tsx
index 3e28d6c12..caeb560b3 100644
--- a/apps/example/src/Triangle/HelloTriangle.tsx
+++ b/apps/example/src/Triangle/HelloTriangle.tsx
@@ -77,8 +77,6 @@ export function HelloTriangle() {
       passEncoder.end();
 
       device.queue.submit([commandEncoder.finish()]);
-
-      context.present();
     })();
   }, [ref]);
 
diff --git a/apps/example/src/Triangle/HelloTriangleMSAA.tsx b/apps/example/src/Triangle/HelloTriangleMSAA.tsx
index 5d66983d5..b9518fbe9 100644
--- a/apps/example/src/Triangle/HelloTriangleMSAA.tsx
+++ b/apps/example/src/Triangle/HelloTriangleMSAA.tsx
@@ -87,7 +87,6 @@ export function HelloTriangleMSAA() {
       }
 
       frame();
-      context.present();
     })();
   }, [ref]);
 
diff --git a/apps/example/src/VisionCamera/VisionCamera.tsx b/apps/example/src/VisionCamera/VisionCamera.tsx
index c4adcfaa0..cba2d2948 100644
--- a/apps/example/src/VisionCamera/VisionCamera.tsx
+++ b/apps/example/src/VisionCamera/VisionCamera.tsx
@@ -617,7 +617,6 @@ const CameraView = () => {
           // access window now to release the camera frame's surface promptly
           // (don't wait for GC, which would starve the frame buffer pool).
           externalTex.destroy();
-          context.present();
         } finally {
           videoFrame.release();
         }
diff --git a/apps/example/src/components/Texture.tsx b/apps/example/src/components/Texture.tsx
index d9e689b41..5bd82a911 100644
--- a/apps/example/src/components/Texture.tsx
+++ b/apps/example/src/components/Texture.tsx
@@ -145,7 +145,6 @@ export const Texture = ({ texture, style, device }: GPUTextureProps) => {
     renderPass.end();
 
     device.queue.submit([commandEncoder.finish()]);
-    context.present();
   }, [device, state, texture, ref]);
   return <Canvas ref={ref} style={style} />;
 };
diff --git a/apps/example/src/components/useWebGPU.ts b/apps/example/src/components/useWebGPU.ts
index ac8a631ac..1a399aafe 100644
--- a/apps/example/src/components/useWebGPU.ts
+++ b/apps/example/src/components/useWebGPU.ts
@@ -57,7 +57,6 @@ export const useWebGPU = (scene: Scene) => {
         const render = () => {
           const timestamp = Date.now();
           renderScene(timestamp);
-          context.present();
           animationFrameId.current = requestAnimationFrame(render);
         };
 
diff --git a/docs/refactor-async-present-plan.md b/docs/refactor-async-present-plan.md
index e69706534..e4d38b802 100644
--- a/docs/refactor-async-present-plan.md
+++ b/docs/refactor-async-present-plan.md
@@ -244,7 +244,7 @@ the full `react-native-wgpu` native lib **compiles and links** for `arm64-v8a` (
 `cpplint` clean (project filters); `clang-format` (pinned 15.0.0) applied; `yarn tsc` passes
 (no TS changed). On-device runtime behaviour (frame pacing, zero idle CPU) is Phase 4.
 
-**Phase 2 — Auto-present + remove `present()`**
+**Phase 2 — Auto-present + remove `present()`** — **DONE**
 - Add `FrameDriver` (iOS `CADisplayLink`, Android `AChoreographer`); wire
   `getCurrentTexture` → register; vsync → dispatch present to owning runtime.
 - Remove `GPUCanvasContext::present` (`api/GPUCanvasContext.h:50,58`, `.cpp:56-65`) and
@@ -252,6 +252,50 @@ the full `react-native-wgpu` native lib **compiles and links** for `arm64-v8a` (
 - JS: drop `present` from `RNCanvasContext` (`src/Canvas.tsx:22-24`, `src/types.ts`).
 - Migrate all 16 example / `useWebGPU` call sites + `README.md` + `packages/webgpu/README.md`.
 
+### Phase 2 — what shipped (branch `claude/keen-darwin-xeywa`)
+New files:
+- `cpp/rnwgpu/FrameDriver.{h,cpp}` — global vsync auto-present coordinator. `requestPresent`
+  (from `getCurrentTexture`, JS thread) coalesces per `contextId`; `onVSync` (UI thread)
+  dispatches each pending surface's present onto its owning runtime's `RuntimeScheduler`
+  (`surface->presentFrame()`). Request-driven: starts the platform vsync on first request,
+  stops after `kMaxIdleFrames` (3) idle frames → zero idle CPU.
+- `apple/WebGPUFrameDriver.{h,mm}` — iOS/tvOS `CADisplayLink` on the main run loop (its
+  `paused` flag is toggled by start/stop). macOS uses `NSScreen.displayLinkWithTarget:` on
+  14+, else an `NSTimer` fallback. Selector → `FrameDriver::onVSync()`.
+- `android/.../com/webgpu/WebGPUFrameDriver.java` — main-thread `Choreographer` driver;
+  `doFrame` → static `nativeOnVSync()` JNI → `FrameDriver::onVSync()`, reposts while running.
+
+Wiring:
+- `SurfaceInfo::present()` → `presentFrame()` (Apple `WaitForCommandsToBeScheduled` + Present,
+  no-op offscreen); added `SurfaceInfo::hasSurface()`. Metal extern moved to `SurfaceRegistry.h`.
+- `GPU::getContext()` re-exposes the per-runtime `RuntimeContext` (so the canvas can reach its
+  scheduler). `GPUCanvasContext` stores `_contextId`, registers the present in
+  `getCurrentTexture` (and now sets the canvas client size there), and dropped `present()` +
+  its JS binding.
+- iOS `WebGPUModule install` and Android `initializeNative` register `setPlatformVSync`. View
+  teardown (`MetalView dealloc`, Android `onSurfaceDestroy`) calls `FrameDriver::cancelPresent`.
+- JS: `RNCanvasContext` is now just `GPUCanvasContext` (`src/Canvas.tsx`, `src/types.ts`);
+  removed the no-op `present` from `Offscreen.ts` and `WebPolyfillGPUModule.ts`. 18 example
+  call sites (the plan's 16 + `VisionCamera`, `ImportExternalTexture`) and both READMEs migrated.
+
+Decisions / deviations:
+1. **Android vsync = Java `Choreographer` + JNI** (not pure NDK `AChoreographer`), chosen for
+   robustness — pure NDK needs a JNI hop to a Looper thread to bootstrap anyway. Confirmed with
+   the user.
+2. **`present()` hard-removed** (breaking), confirmed with the user.
+3. **Owning-runtime caveat (→ Phase 3):** `getCurrentTexture` currently dispatches present via
+   the **main** runtime's scheduler (`_gpu->getContext()`). Correct for main-JS rendering. The
+   Reanimated example renders on the **UI (worklet) runtime**, so its present is migrated (call
+   removed) but auto-present won't target the correct thread until Phase 3 tags the present with
+   the *calling* runtime and gives worklet runtimes their own `RuntimeScheduler`. Expect the
+   Reanimated/Dedicated examples to be visually broken between Phase 2 and Phase 3.
+
+Validation (local): `react-native-wgpu` native lib **compiles and links** for `arm64-v8a`
+(ninja, CMake picked up `FrameDriver.cpp`); `cpplint` clean; `clang-format` applied; `yarn tsc`
+and `yarn lint` pass for both `packages/webgpu` and `apps/example`. iOS `.mm` and the Java
+driver are not compiled locally (no iOS/gradle build run here) — review-only; needs a device
+build. On-device frame pacing / zero-idle-CPU verification is Phase 4.
+
 **Phase 3 — First-class worklet runtimes**
 - Worklet-runtime `RuntimeScheduler` impl (per Spike 1); verify auto-present dispatch on UI +
   dedicated runtimes; update `apps/example/src/Reanimated/Reanimated.tsx` (drop `present()`,
diff --git a/packages/webgpu/README.md b/packages/webgpu/README.md
index 8eeb1cba1..d7415053b 100644
--- a/packages/webgpu/README.md
+++ b/packages/webgpu/README.md
@@ -128,8 +128,6 @@ export function HelloTriangle() {
       passEncoder.end();
 
       device.queue.submit([commandEncoder.finish()]);
-
-      context.present();
     };
     helloTriangle();
   }, [ref]);
@@ -174,15 +172,13 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-In React Native, we want to keep frame presentation as a manual operation as we plan to provide more advanced rendering options that are React Native specific.  
-This means that when you are ready to present a frame, you need to call `present` on the context.
+Frame presentation is automatic. Once you acquire the frame's texture with `context.getCurrentTexture()` and submit your commands, the frame is presented on the next display refresh (driven by a global vsync source: `CADisplayLink` on iOS, `Choreographer` on Android). There is no `present()` call.
 
 ```tsx
 // draw
 // submit to the queue
 device.queue.submit([commandEncoder.finish()]);
-// This method is React Native only
-context.present();
+// The frame is presented automatically on the next vsync.
 ```
 
 ### Canvas Transparency
@@ -296,7 +292,6 @@ const render = () => {
 
   // Release the surface's access window right after the submit that sampled it.
   externalTexture.destroy();
-  context.present();
 };
 ```
 
@@ -328,7 +323,6 @@ const renderFrame = (device: GPUDevice, context: GPUCanvasContext) => {
   const commandEncoder = device.createCommandEncoder();
   // ... render ...
   device.queue.submit([commandEncoder.finish()]);
-  context.present();
 };
 
 // Initialize WebGPU on main thread, then run on UI thread
diff --git a/packages/webgpu/android/CMakeLists.txt b/packages/webgpu/android/CMakeLists.txt
index 50756e72e..51005acdc 100644
--- a/packages/webgpu/android/CMakeLists.txt
+++ b/packages/webgpu/android/CMakeLists.txt
@@ -47,6 +47,7 @@ add_library(${PACKAGE_NAME} SHARED
     ../cpp/rnwgpu/api/GPUComputePipeline.cpp
     ../cpp/rnwgpu/api/GPUCanvasContext.cpp
     ../cpp/rnwgpu/RNWebGPUManager.cpp
+    ../cpp/rnwgpu/FrameDriver.cpp
     ../cpp/jsi/Promise.cpp
     ../cpp/jsi/RuntimeLifecycleMonitor.cpp
     ../cpp/jsi/RuntimeAwareCache.cpp
diff --git a/packages/webgpu/android/cpp/cpp-adapter.cpp b/packages/webgpu/android/cpp/cpp-adapter.cpp
index 2a441c218..4f0ba61d3 100644
--- a/packages/webgpu/android/cpp/cpp-adapter.cpp
+++ b/packages/webgpu/android/cpp/cpp-adapter.cpp
@@ -10,6 +10,7 @@
 #include <webgpu/webgpu_cpp.h>
 
 #include "AndroidPlatformContext.h"
+#include "FrameDriver.h"
 #include "GPUCanvasContext.h"
 #include "RNWebGPUManager.h"
 
@@ -17,6 +18,37 @@
 
 std::shared_ptr<rnwgpu::RNWebGPUManager> manager;
 
+// JNI handles for the vsync source (com.webgpu.WebGPUFrameDriver). Resolved in
+// initializeNative, a Java-called thread whose FindClass sees the app classes.
+static JavaVM *gJavaVM = nullptr;
+static jclass gFrameDriverClass = nullptr;
+static jmethodID gFrameDriverStart = nullptr;
+static jmethodID gFrameDriverStop = nullptr;
+
+static void callFrameDriver(jmethodID method) {
+  if (gJavaVM == nullptr || gFrameDriverClass == nullptr || method == nullptr) {
+    return;
+  }
+  JNIEnv *env = nullptr;
+  bool attached = false;
+  jint res = gJavaVM->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION_1_6);
+  if (res == JNI_EDETACHED) {
+    if (gJavaVM->AttachCurrentThread(&env, nullptr) != JNI_OK) {
+      return;
+    }
+    attached = true;
+  } else if (res != JNI_OK) {
+    return;
+  }
+  env->CallStaticVoidMethod(gFrameDriverClass, method);
+  if (env->ExceptionCheck()) {
+    env->ExceptionClear();
+  }
+  if (attached) {
+    gJavaVM->DetachCurrentThread();
+  }
+}
+
 extern "C" JNIEXPORT void JNICALL Java_com_webgpu_WebGPUModule_initializeNative(
     JNIEnv *env, jobject /* this */, jlong jsRuntime,
     jobject jsCallInvokerHolder, jobject blobModule) {
@@ -31,6 +63,27 @@ extern "C" JNIEXPORT void JNICALL Java_com_webgpu_WebGPUModule_initializeNative(
       std::make_shared<rnwgpu::AndroidPlatformContext>(globalBlobModule);
   manager = std::make_shared<rnwgpu::RNWebGPUManager>(runtime, jsCallInvoker,
                                                       platformContext);
+
+  // Cache JNI handles for the Choreographer-based vsync source and register it
+  // with the FrameDriver to drive auto-present (replaces context.present()).
+  env->GetJavaVM(&gJavaVM);
+  jclass localCls = env->FindClass("com/webgpu/WebGPUFrameDriver");
+  if (localCls != nullptr) {
+    gFrameDriverClass = reinterpret_cast<jclass>(env->NewGlobalRef(localCls));
+    gFrameDriverStart =
+        env->GetStaticMethodID(gFrameDriverClass, "start", "()V");
+    gFrameDriverStop = env->GetStaticMethodID(gFrameDriverClass, "stop", "()V");
+    env->DeleteLocalRef(localCls);
+  }
+  rnwgpu::FrameDriver::getInstance().setPlatformVSync(
+      [] { callFrameDriver(gFrameDriverStart); },
+      [] { callFrameDriver(gFrameDriverStop); });
+}
+
+extern "C" JNIEXPORT void JNICALL
+Java_com_webgpu_WebGPUFrameDriver_nativeOnVSync(JNIEnv * /*env*/,
+                                                jclass /*clazz*/) {
+  rnwgpu::FrameDriver::getInstance().onVSync();
 }
 
 extern "C" JNIEXPORT void JNICALL Java_com_webgpu_WebGPUView_onSurfaceChanged(
@@ -66,6 +119,7 @@ Java_com_webgpu_WebGPUView_switchToOffscreenSurface(JNIEnv *env, jobject thiz,
 
 extern "C" JNIEXPORT void JNICALL Java_com_webgpu_WebGPUView_onSurfaceDestroy(
     JNIEnv *env, jobject thiz, jint contextId) {
+  rnwgpu::FrameDriver::getInstance().cancelPresent(contextId);
   auto &registry = rnwgpu::SurfaceRegistry::getInstance();
   registry.removeSurfaceInfo(contextId);
 }
\ No newline at end of file
diff --git a/packages/webgpu/apple/MetalView.mm b/packages/webgpu/apple/MetalView.mm
index ccff1245c..e617da889 100644
--- a/packages/webgpu/apple/MetalView.mm
+++ b/packages/webgpu/apple/MetalView.mm
@@ -1,6 +1,8 @@
 #import "MetalView.h"
 #import "webgpu/webgpu_cpp.h"
 
+#include "FrameDriver.h"
+
 @implementation MetalView {
   BOOL _isConfigured;
 }
@@ -42,6 +44,8 @@ - (void)update {
 }
 
 - (void)dealloc {
+  // Stop any pending auto-present for this surface before it goes away.
+  rnwgpu::FrameDriver::getInstance().cancelPresent([_contextId intValue]);
   auto &registry = rnwgpu::SurfaceRegistry::getInstance();
   // Remove the surface info from the registry
   registry.removeSurfaceInfo([_contextId intValue]);
diff --git a/packages/webgpu/apple/WebGPUModule.mm b/packages/webgpu/apple/WebGPUModule.mm
index 99580aa14..c4c7224ad 100644
--- a/packages/webgpu/apple/WebGPUModule.mm
+++ b/packages/webgpu/apple/WebGPUModule.mm
@@ -1,6 +1,8 @@
 #import "WebGPUModule.h"
 #include "ApplePlatformContext.h"
+#include "FrameDriver.h"
 #import "GPUCanvasContext.h"
+#import "WebGPUFrameDriver.h"
 
 #import <React/RCTBridge+Private.h>
 #import <React/RCTCallInvoker.h>
@@ -78,6 +80,11 @@ - (void)invalidate {
       std::make_shared<rnwgpu::ApplePlatformContext>();
   webgpuManager = std::make_shared<rnwgpu::RNWebGPUManager>(runtime, jsInvoker,
                                                             platformContext);
+
+  // Drive auto-present from the display's vsync (replaces context.present()).
+  rnwgpu::FrameDriver::getInstance().setPlatformVSync(
+      [] { [WebGPUFrameDriver start]; }, [] { [WebGPUFrameDriver stop]; });
+
   return @true;
 }
 
diff --git a/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h b/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
index 110a45d44..ed098896a 100644
--- a/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
+++ b/packages/webgpu/cpp/rnwgpu/SurfaceRegistry.h
@@ -7,6 +7,12 @@
 
 #include "webgpu/webgpu_cpp.h"
 
+#ifdef __APPLE__
+namespace dawn::native::metal {
+void WaitForCommandsToBeScheduled(WGPUDevice device);
+} // namespace dawn::native::metal
+#endif
+
 namespace rnwgpu {
 
 struct NativeInfo {
@@ -113,7 +119,22 @@ class SurfaceInfo {
     height = newHeight;
   }
 
-  void present() {
+  // Present the current surface texture. Called at the frame boundary from the
+  // owning runtime's JS thread (via FrameDriver), replacing the old manual
+  // present(). No-op when offscreen / unconfigured (no surface).
+  void presentFrame() {
+#ifdef __APPLE__
+    // Ensure command buffers are scheduled before presenting. Read the device
+    // under a shared lock, then wait without holding it (the wait can block).
+    wgpu::Device device;
+    {
+      std::shared_lock<std::shared_mutex> lock(_mutex);
+      device = config.device;
+    }
+    if (device) {
+      dawn::native::metal::WaitForCommandsToBeScheduled(device.Get());
+    }
+#endif
     std::unique_lock<std::shared_mutex> lock(_mutex);
     if (surface) {
       surface.Present();
@@ -131,6 +152,12 @@ class SurfaceInfo {
     }
   }
 
+  // True when an on-screen wgpu::Surface is attached (vs offscreen texture).
+  bool hasSurface() {
+    std::shared_lock<std::shared_mutex> lock(_mutex);
+    return surface != nullptr;
+  }
+
   NativeInfo getNativeInfo() {
     std::shared_lock<std::shared_mutex> lock(_mutex);
     return {.nativeSurface = nativeSurface, .width = width, .height = height};
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPU.h b/packages/webgpu/cpp/rnwgpu/api/GPU.h
index e7dc15caf..b2488d4c7 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPU.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPU.h
@@ -53,6 +53,7 @@ class GPU : public NativeObject<GPU> {
   }
 
   inline const wgpu::Instance get() { return _instance; }
+  inline std::shared_ptr<async::RuntimeContext> getContext() { return _async; }
 
 private:
   wgpu::Instance _instance;
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
index d75eb7b0f..7a2c32886 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
@@ -1,16 +1,9 @@
 #include "GPUCanvasContext.h"
 #include "Convertors.h"
+#include "FrameDriver.h"
 #include "RNWebGPUManager.h"
 #include <memory>
 
-#ifdef __APPLE__
-namespace dawn::native::metal {
-
-void WaitForCommandsToBeScheduled(WGPUDevice device);
-
-}
-#endif
-
 namespace rnwgpu {
 
 void GPUCanvasContext::configure(
@@ -48,20 +41,26 @@ std::shared_ptr<GPUTexture> GPUCanvasContext::getCurrentTexture() {
     _surfaceInfo->reconfigure(width, height);
   }
   auto texture = _surfaceInfo->getCurrentTexture();
-  // Pass reportsMemoryPressure=false to avoid triggering spurious Hermes GC
-  // cycles every frame since the canvas texture doesn't own the buffer.
-  return std::make_shared<GPUTexture>(texture, "", false);
-}
 
-void GPUCanvasContext::present() {
-#ifdef __APPLE__
-  dawn::native::metal::WaitForCommandsToBeScheduled(
-      _surfaceInfo->getDevice().Get());
-#endif
+  // Auto-present: acquiring the current texture schedules a present for this
+  // surface at the next vsync (spec-aligned "update the rendering" after the
+  // frame). Replaces the old explicit context.present(). Offscreen surfaces
+  // have no wgpu::Surface, so skip them (their texture is read back directly).
   auto size = _surfaceInfo->getSize();
   _canvas->setClientWidth(size.width);
   _canvas->setClientHeight(size.height);
-  _surfaceInfo->present();
+  if (_surfaceInfo->hasSurface()) {
+    // Phase 2: the present is always dispatched via the main runtime's
+    // scheduler, even when a worklet runtime (e.g. the Reanimated example)
+    // made the call. Phase 3 will tag it with the *calling* runtime so each
+    // runtime presents on its own thread (Dawn surface thread-affinity).
+    FrameDriver::getInstance().requestPresent(_contextId, _surfaceInfo,
+                                              _gpu->getContext()->scheduler());
+  }
+
+  // Pass reportsMemoryPressure=false to avoid triggering spurious Hermes GC
+  // cycles every frame since the canvas texture doesn't own the buffer.
+  return std::make_shared<GPUTexture>(texture, "", false);
 }
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
index 4b97a7887..2ab5d69c2 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
@@ -26,7 +26,7 @@ class GPUCanvasContext : public NativeObject<GPUCanvasContext> {
 
   GPUCanvasContext(std::shared_ptr<GPU> gpu, int contextId, int width,
                    int height)
-      : NativeObject(CLASS_NAME), _gpu(std::move(gpu)) {
+      : NativeObject(CLASS_NAME), _contextId(contextId), _gpu(std::move(gpu)) {
     _canvas = std::make_shared<Canvas>(nullptr, width, height);
     auto &registry = rnwgpu::SurfaceRegistry::getInstance();
     _surfaceInfo =
@@ -47,7 +47,6 @@ class GPUCanvasContext : public NativeObject<GPUCanvasContext> {
                   &GPUCanvasContext::unconfigure);
     installMethod(runtime, prototype, "getCurrentTexture",
                   &GPUCanvasContext::getCurrentTexture);
-    installMethod(runtime, prototype, "present", &GPUCanvasContext::present);
   }
 
   // TODO: is this ok?
@@ -55,9 +54,9 @@ class GPUCanvasContext : public NativeObject<GPUCanvasContext> {
   void configure(std::shared_ptr<GPUCanvasConfiguration> configuration);
   void unconfigure();
   std::shared_ptr<GPUTexture> getCurrentTexture();
-  void present();
 
 private:
+  int _contextId;
   std::shared_ptr<Canvas> _canvas;
   std::shared_ptr<SurfaceInfo> _surfaceInfo;
   std::shared_ptr<GPU> _gpu;
diff --git a/packages/webgpu/src/Canvas.tsx b/packages/webgpu/src/Canvas.tsx
index 1030f3e38..7c2a47a6e 100644
--- a/packages/webgpu/src/Canvas.tsx
+++ b/packages/webgpu/src/Canvas.tsx
@@ -19,9 +19,9 @@ export interface NativeCanvas {
   clientHeight: number;
 }
 
-export type RNCanvasContext = GPUCanvasContext & {
-  present: () => void;
-};
+// Auto-present (a global vsync FrameDriver) replaces the old manual present();
+// the native context is now just a spec GPUCanvasContext.
+export type RNCanvasContext = GPUCanvasContext;
 
 export interface CanvasRef {
   getContextId: () => number;
diff --git a/packages/webgpu/src/Offscreen.ts b/packages/webgpu/src/Offscreen.ts
index c4e460bb2..6ce2f589c 100644
--- a/packages/webgpu/src/Offscreen.ts
+++ b/packages/webgpu/src/Offscreen.ts
@@ -64,10 +64,6 @@ class GPUOffscreenCanvasContext implements GPUCanvasContext {
     throw new Error("Method not implemented.");
   }
 
-  present() {
-    // Do nothing
-  }
-
   getDevice() {
     if (!this.device) {
       throw new Error("Device is not configured.");
diff --git a/packages/webgpu/src/WebPolyfillGPUModule.ts b/packages/webgpu/src/WebPolyfillGPUModule.ts
index 9dcc1f1c5..04229cd05 100644
--- a/packages/webgpu/src/WebPolyfillGPUModule.ts
+++ b/packages/webgpu/src/WebPolyfillGPUModule.ts
@@ -39,10 +39,7 @@ function makeWebGPUCanvasContext(
     canvas.setAttribute("height", pixelHeight);
   }
 
-  const context = canvas.getContext("webgpu")!;
-  return Object.assign(context, {
-    present: () => {},
-  });
+  return canvas.getContext("webgpu")!;
 }
 
 // @ts-expect-error - polyfill for RNWebGPU native module
diff --git a/packages/webgpu/src/types.ts b/packages/webgpu/src/types.ts
index c03f92b4b..0758c73f4 100644
--- a/packages/webgpu/src/types.ts
+++ b/packages/webgpu/src/types.ts
@@ -8,9 +8,9 @@ export interface NativeCanvas {
   clientHeight: number;
 }
 
-export type RNCanvasContext = GPUCanvasContext & {
-  present: () => void;
-};
+// Auto-present (a global vsync FrameDriver) replaces the old manual present();
+// the native context is now just a spec GPUCanvasContext.
+export type RNCanvasContext = GPUCanvasContext;
 
 export interface CanvasRef {
   getContextId: () => number;

From f5bc1c20b2287ff71c1290e27ada6ed0dc5e4e8b Mon Sep 17 00:00:00 2001
From: William Candillon <wcandillon@gmail.com>
Date: Tue, 2 Jun 2026 17:05:24 +0200
Subject: [PATCH 2/4] :wrench:

---
 docs/refactor-async-present-plan.md           | 74 +++++++++++++++-
 .../java/com/webgpu/WebGPUFrameDriver.java    | 66 ++++++++++++++
 packages/webgpu/apple/WebGPUFrameDriver.h     | 13 +++
 packages/webgpu/apple/WebGPUFrameDriver.mm    | 88 +++++++++++++++++++
 packages/webgpu/cpp/rnwgpu/FrameDriver.cpp    | 81 +++++++++++++++++
 packages/webgpu/cpp/rnwgpu/FrameDriver.h      | 83 +++++++++++++++++
 .../cpp/rnwgpu/api/GPUCanvasContext.cpp       | 51 ++++++++---
 .../webgpu/cpp/rnwgpu/api/GPUCanvasContext.h  | 10 ++-
 8 files changed, 450 insertions(+), 16 deletions(-)
 create mode 100644 packages/webgpu/android/src/main/java/com/webgpu/WebGPUFrameDriver.java
 create mode 100644 packages/webgpu/apple/WebGPUFrameDriver.h
 create mode 100644 packages/webgpu/apple/WebGPUFrameDriver.mm
 create mode 100644 packages/webgpu/cpp/rnwgpu/FrameDriver.cpp
 create mode 100644 packages/webgpu/cpp/rnwgpu/FrameDriver.h

diff --git a/docs/refactor-async-present-plan.md b/docs/refactor-async-present-plan.md
index e4d38b802..65490af29 100644
--- a/docs/refactor-async-present-plan.md
+++ b/docs/refactor-async-present-plan.md
@@ -1,6 +1,6 @@
 # Refactor: event-driven async + auto-present
 
-Status: **Phase 0 complete — all spikes GREEN, ready for Phase 1**
+Status: **Phases 1–3 complete (local build/lint green). Phase 4 (SurfaceRegistry rework) proposed; Phase 5 = on-device validation.**
 Branch: `claude/keen-darwin-xeywa`
 
 This document is the handoff for moving the async + present refactor forward. Phase 0
@@ -296,12 +296,80 @@ and `yarn lint` pass for both `packages/webgpu` and `apps/example`. iOS `.mm` an
 driver are not compiled locally (no iOS/gradle build run here) — review-only; needs a device
-build. On-device frame pacing / zero-idle-CPU verification is Phase 4.
+build. On-device frame pacing / zero-idle-CPU verification is Phase 5.
 
-**Phase 3 — First-class worklet runtimes**
+**Phase 3 — First-class worklet runtimes** — **DONE**
 - Worklet-runtime `RuntimeScheduler` impl (per Spike 1); verify auto-present dispatch on UI +
   dedicated runtimes; update `apps/example/src/Reanimated/Reanimated.tsx` (drop `present()`,
   keep its own rAF loop).
 
-**Phase 4 — Validation**
+### Phase 3 — what shipped (branch `claude/keen-darwin-xeywa`)
+Observed after Phase 2: the **UI-runtime** Reanimated example worked (the Reanimated UI runtime
+executes on the **main thread**, so dispatching its present to the main runtime's scheduler
+happened to land on the right thread), but the **dedicated `createWorkletRuntime`** example
+(`Reanimated/DedicatedThread.tsx`, `runOnRuntime`) crashed — its render thread is its own, so a
+main-thread present violated Dawn surface thread-affinity.
+
+**Decision (confirmed with the user): self-scheduled present, no native worklets dependency.**
+Rather than link `react-native-worklets` natively and have the FrameDriver dispatch via
+`WorkletRuntime::schedule` (the original plan / Spike 1 primary), worklet runtimes now schedule
+their own present on their own event loop. This avoids a new native build dependency entirely
+and is fully buildable/validatable locally (it is Spike 1's documented "JS-scheduling"
+contingency).
+
+Implementation (native only; no JS/build-system changes):
+- `GPUCanvasContext::getCurrentTexture` switched to the full-control HostFunction signature
+  (`jsi::Value(rt, thisVal, args, count)`, same pattern as `RNWebGPU::createImageBitmap`) so it
+  learns the **calling** runtime. New `schedulePresent(runtime)`:
+  - **Main runtime** (`RuntimeContext::get(runtime)` is non-null): unchanged — register with the
+    global vsync `FrameDriver` using that runtime's scheduler.
+  - **Any worklet runtime** (no `RuntimeContext` — Reanimated UI/dedicated, Vision Camera frame
+    processors, …): **present-on-next-acquire**. `getCurrentTexture` presents the *previous*
+    frame synchronously (inline, on the calling thread) just before acquiring the next texture;
+    by then the previous frame's submit has happened, and present runs on the same thread that
+    rendered it. This is the natural swapchain boundary and needs no scheduler.
+
+    Why not schedule onto the runtime's own loop: two earlier attempts failed. (1)
+    `queueMicrotask` is **disabled** on worklet runtimes (throws "microtasks are disabled in this
+    runtime"). (2) `setImmediate`/`setTimeout` exist but route through the runtime's `EventLoop`
+    `AsyncQueue`, which for **Vision Camera** is a custom `NativeThreadAsyncQueue` that hops back
+    through JNI (`fbjni Environment::current()`) and **crashes** when pushed from a
+    non-JVM-attached thread. Present-on-next-acquire avoids the runtime's task queue entirely.
+    Trade-off: one frame of latency, and a worklet that renders exactly once would not present
+    its single frame (continuous loops — rAF, camera frames — are unaffected; the main runtime's
+    one-shot case is covered by the FrameDriver).
+- `Reanimated.tsx` already had `present()` removed in Phase 2; `DedicatedThread.tsx` /
+  `UIThread.tsx` need no changes.
+
+Known limitation (out of scope, examples don't hit it): **async ops** (`mapAsync`,
+`onSubmittedWorkDone`, …) invoked *on a worklet runtime* still settle their Promise via the
+object's creation-runtime context (main), not the calling worklet runtime — the example worklets
+only do synchronous rendering + present (device/adapter are created on the main runtime). Routing
+async settlement to the calling runtime would need the same calling-runtime detection applied to
+the 7 async sites; deferred until a use case needs it.
+
+Validation (local): native lib **compiles + links** for `arm64-v8a`; `cpplint` clean;
+`clang-format` applied; `yarn tsc`/`yarn lint` unaffected (no JS changed). On-device
+verification of the dedicated-worklet example is for the maintainer.
+
+**Phase 4 — `SurfaceRegistry` / surface-model rework** (proposed)
+The `SurfaceInfo` / `SurfaceRegistry` model (`cpp/rnwgpu/SurfaceRegistry.h`) predates the
+event-driven + auto-present work and is now the rough edge. Candidate improvements to scope:
+- **Surface thread-affinity.** Surface lifecycle (`configure`/`switchToOnscreen`/
+  `switchToOffscreen`/`resize`) runs on the **UI thread** (native view callbacks) while
+  `getCurrentTexture`/`presentFrame` run on the **owning runtime's render thread**. A single
+  `shared_mutex` serializes them but they're still cross-thread against a Dawn surface that
+  prefers single-thread access. Consider routing all surface ops through the owning runtime
+  (e.g. via the `RuntimeScheduler`), making affinity structural rather than lock-guarded.
+- **State clarity.** The on-screen-`surface` vs offscreen-`texture` duality is encoded as
+  `if (surface) … else …` branches throughout; a small explicit state (Offscreen / Onscreen)
+  would remove the implicit coupling and the `switchToOnscreen` flush path's validation cost
+  (its existing `// TODO: faster way without validation?`).
+- **Dead or to-be-re-evaluated fields.** e.g. the stored `wgpu::Instance gpu` member appears unused;
+  audit members now that present/`hasSurface` were added.
+- **Lifetime vs `contextId`.** Registry keyed by a JS-side incrementing `int`; `FrameDriver`
+  now also keys pending presents by `contextId`. Confirm teardown ordering (view dealloc →
+  `cancelPresent` + `removeSurfaceInfo`) is race-free under the new threading.
+
+**Phase 5 — Validation**
 ```bash
 yarn tsc && yarn lint
 yarn workspace react-native-wgpu test         # offscreen readback + demo specs
diff --git a/packages/webgpu/android/src/main/java/com/webgpu/WebGPUFrameDriver.java b/packages/webgpu/android/src/main/java/com/webgpu/WebGPUFrameDriver.java
new file mode 100644
index 000000000..03a1d2c29
--- /dev/null
+++ b/packages/webgpu/android/src/main/java/com/webgpu/WebGPUFrameDriver.java
@@ -0,0 +1,66 @@
+package com.webgpu;
+
+import android.os.Handler;
+import android.os.Looper;
+import android.view.Choreographer;
+
+/**
+ * Main-thread {@link Choreographer} callback that powers WebGPU auto-present,
+ * standing in for the manual {@code context.present()} call.
+ *
+ * <p>Native code (rnwgpu::FrameDriver::setPlatformVSync) invokes
+ * {@link #start()} / {@link #stop()} from arbitrary threads; each posts its
+ * work to the main thread. While running, {@link #doFrame(long)} calls back
+ * into native once per vsync, where pending surfaces are presented.
+ */
+public class WebGPUFrameDriver implements Choreographer.FrameCallback {
+  private static final WebGPUFrameDriver INSTANCE = new WebGPUFrameDriver();
+
+  private final Handler mainHandler = new Handler(Looper.getMainLooper());
+  private boolean running = false;
+
+  private WebGPUFrameDriver() {}
+
+  /** Called from native (any thread). */
+  public static void start() {
+    INSTANCE.startInternal();
+  }
+
+  /** Called from native (any thread). */
+  public static void stop() {
+    INSTANCE.stopInternal();
+  }
+
+  /** Posts to the main thread: begin frame callbacks unless already running. */
+  private void startInternal() {
+    mainHandler.post(
+        () -> {
+          if (!running) {
+            running = true;
+            Choreographer.getInstance().postFrameCallback(this);
+          }
+        });
+  }
+
+  /** Posts to the main thread: cancel frame callbacks if currently running. */
+  private void stopInternal() {
+    mainHandler.post(
+        () -> {
+          if (running) {
+            running = false;
+            Choreographer.getInstance().removeFrameCallback(this);
+          }
+        });
+  }
+
+  /** Per-vsync tick: notify native, then re-arm for the next frame. */
+  @Override
+  public void doFrame(long frameTimeNanos) {
+    if (running) {
+      nativeOnVSync();
+      Choreographer.getInstance().postFrameCallback(this);
+    }
+  }
+
+  private static native void nativeOnVSync();
+}
diff --git a/packages/webgpu/apple/WebGPUFrameDriver.h b/packages/webgpu/apple/WebGPUFrameDriver.h
new file mode 100644
index 000000000..aacae84ee
--- /dev/null
+++ b/packages/webgpu/apple/WebGPUFrameDriver.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#import <Foundation/Foundation.h>
+
+// Objective-C vsync source (CADisplayLink; NSTimer fallback on macOS < 14).
+// Drives rnwgpu::FrameDriver::onVSync() once per tick. start/stop are invoked
+// by the C++ FrameDriver via setPlatformVSync; both hop to the main thread.
+@interface WebGPUFrameDriver : NSObject
+
++ (void)start;
++ (void)stop;
+
+@end
diff --git a/packages/webgpu/apple/WebGPUFrameDriver.mm b/packages/webgpu/apple/WebGPUFrameDriver.mm
new file mode 100644
index 000000000..1d302e2fa
--- /dev/null
+++ b/packages/webgpu/apple/WebGPUFrameDriver.mm
@@ -0,0 +1,88 @@
+#import "WebGPUFrameDriver.h"
+
+#import "RNWGUIKit.h"
+#import <QuartzCore/QuartzCore.h>
+
+#include "FrameDriver.h"
+
+@implementation WebGPUFrameDriver
+
++ (void)onFrame {
+  rnwgpu::FrameDriver::getInstance().onVSync();
+}
+
+#if !TARGET_OS_OSX
+
+// iOS / tvOS: main-run-loop CADisplayLink, paused/resumed by start/stop.
+static CADisplayLink *sDisplayLink = nil;
+
++ (void)tick:(CADisplayLink *)link {
+  [WebGPUFrameDriver onFrame];
+}
+
++ (void)start {
+  dispatch_async(dispatch_get_main_queue(), ^{
+    if (sDisplayLink == nil) {
+      sDisplayLink = [CADisplayLink displayLinkWithTarget:self
+                                                 selector:@selector(tick:)];
+      [sDisplayLink addToRunLoop:[NSRunLoop mainRunLoop]
+                         forMode:NSRunLoopCommonModes];
+    }
+    sDisplayLink.paused = NO;
+  });
+}
+
++ (void)stop {
+  dispatch_async(dispatch_get_main_queue(), ^{
+    sDisplayLink.paused = YES;
+  });
+}
+
+#else // TARGET_OS_OSX
+
+// macOS: CADisplayLink via NSScreen on 14.0+; older systems fall back to a
+// ~60Hz NSTimer (not vsync-aligned, but keeps auto-present working). Both are
+// added in the common run-loop modes so ticks continue during event tracking.
+static id sDisplayLink = nil;
+
++ (void)tick:(id)sender {
+  [WebGPUFrameDriver onFrame];
+}
+
++ (void)start {
+  dispatch_async(dispatch_get_main_queue(), ^{
+    if (sDisplayLink == nil) {
+      NSRunLoop *mainLoop = [NSRunLoop mainRunLoop];
+      if (@available(macOS 14.0, *)) {
+        CADisplayLink *link =
+            [NSScreen.mainScreen displayLinkWithTarget:self
+                                              selector:@selector(tick:)];
+        [link addToRunLoop:mainLoop forMode:NSRunLoopCommonModes];
+        sDisplayLink = link;
+      } else {
+        sDisplayLink = [NSTimer timerWithTimeInterval:1.0 / 60.0
+                                               target:self
+                                             selector:@selector(tick:)
+                                             userInfo:nil
+                                              repeats:YES];
+        [mainLoop addTimer:sDisplayLink forMode:NSRunLoopCommonModes];
+      }
+    }
+    if ([sDisplayLink isKindOfClass:[CADisplayLink class]]) {
+      ((CADisplayLink *)sDisplayLink).paused = NO;
+    }
+  });
+}
+
++ (void)stop {
+  dispatch_async(dispatch_get_main_queue(), ^{
+    if ([sDisplayLink isKindOfClass:[CADisplayLink class]]) {
+      ((CADisplayLink *)sDisplayLink).paused = YES;
+    }
+    // NSTimer fallback keeps firing; onVSync is a cheap no-op while idle.
+  });
+}
+
+#endif // TARGET_OS_OSX
+
+@end
diff --git a/packages/webgpu/cpp/rnwgpu/FrameDriver.cpp b/packages/webgpu/cpp/rnwgpu/FrameDriver.cpp
new file mode 100644
index 000000000..792940e5e
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/FrameDriver.cpp
@@ -0,0 +1,81 @@
+#include "FrameDriver.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace jsi = facebook::jsi;
+
+namespace rnwgpu {
+
+FrameDriver &FrameDriver::getInstance() {
+  static FrameDriver instance;
+  return instance;
+}
+
+void FrameDriver::setPlatformVSync(std::function<void()> start,
+                                   std::function<void()> stop) {
+  std::lock_guard<std::mutex> lock(_mutex);
+  _start = std::move(start);
+  _stop = std::move(stop);
+}
+
+void FrameDriver::requestPresent(
+    int contextId, std::shared_ptr<SurfaceInfo> surface,
+    std::shared_ptr<async::RuntimeScheduler> scheduler) {
+  if (!surface || !scheduler) {
+    return;
+  }
+
+  std::function<void()> startToCall;
+  {
+    std::lock_guard<std::mutex> lock(_mutex);
+    _pending[contextId] = {std::move(surface), std::move(scheduler)};
+    _idleFrames = 0;
+    if (!_running && _start) {
+      _running = true;
+      startToCall = _start;
+    }
+  }
+
+  // Invoked outside the lock: the platform start hops to the UI thread.
+  if (startToCall) {
+    startToCall();
+  }
+}
+
+void FrameDriver::cancelPresent(int contextId) {
+  std::lock_guard<std::mutex> lock(_mutex);
+  _pending.erase(contextId);
+}
+
+void FrameDriver::onVSync() {
+  std::vector<Pending> toPresent;
+  {
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (!_pending.empty()) {
+      toPresent.reserve(_pending.size());
+      for (auto &entry : _pending) {
+        toPresent.push_back(std::move(entry.second));
+      }
+      _pending.clear();
+      _idleFrames = 0;
+    } else if (_running && ++_idleFrames >= kMaxIdleFrames) {
+      // Stop while still holding the lock. Called after unlocking, this stop()
+      // could land after a concurrent requestPresent()'s start(), leaving the
+      // source stopped with _running == true. stop() must not re-enter us.
+      _running = false;
+      if (_stop) {
+        _stop();
+      }
+    }
+  }
+
+  for (auto &pending : toPresent) {
+    auto surface = pending.surface;
+    pending.scheduler->scheduleOnJS(
+        [surface](jsi::Runtime & /*runtime*/) { surface->presentFrame(); });
+  }
+}
+
+} // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/FrameDriver.h b/packages/webgpu/cpp/rnwgpu/FrameDriver.h
new file mode 100644
index 000000000..c16fedabf
--- /dev/null
+++ b/packages/webgpu/cpp/rnwgpu/FrameDriver.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+#include "SurfaceRegistry.h"
+#include "rnwgpu/async/RuntimeScheduler.h"
+
+namespace rnwgpu {
+
+/**
+ * Global vsync-driven auto-present coordinator. Replaces the manual
+ * `context.present()` call.
+ *
+ * Flow:
+ *   - `GPUCanvasContext::getCurrentTexture()` (JS thread) calls
+ *     `requestPresent` for its surface, tagged with the owning runtime's
+ *     RuntimeScheduler.
+ *   - A platform vsync source (iOS CADisplayLink / Android Choreographer) calls
+ *     `onVSync()` on the UI thread once per frame.
+ *   - On each vsync, every surface that requested a present has its present
+ *     dispatched onto its owning runtime's JS thread (so `Surface.Present()`
+ *     and the Apple Metal scheduling wait run on the same thread that did
+ *     getCurrentTexture / submit, preserving Dawn surface thread-affinity and
+ *     present-after-submit ordering via FIFO on that loop).
+ *
+ * The vsync source is request-driven: it is started when the first present is
+ * requested and stopped after a few idle frames, so an idle (non-rendering) app
+ * costs zero CPU.
+ */
+class FrameDriver {
+public:
+  static FrameDriver &getInstance();
+
+  /**
+   * Register how to start/stop the platform vsync source. `start`/`stop` are
+   * invoked when presents begin/cease; each implementation is responsible for
+   * hopping to the UI thread as needed. Called once per platform at init.
+   */
+  void setPlatformVSync(std::function<void()> start,
+                        std::function<void()> stop);
+
+  /**
+   * Request that `surface` be presented at the next vsync. Coalesced per
+   * contextId (at most one present per surface per frame). Thread-safe; called
+   * from a JS thread inside getCurrentTexture. Surfaces with no on-screen
+   * `wgpu::Surface` (offscreen) should not be registered.
+   */
+  void requestPresent(int contextId, std::shared_ptr<SurfaceInfo> surface,
+                      std::shared_ptr<async::RuntimeScheduler> scheduler);
+
+  /**
+   * Drop any pending present for a surface (e.g. when its view is torn down).
+   * Thread-safe.
+   */
+  void cancelPresent(int contextId);
+
+  /** Called by the platform vsync source on the UI thread, once per frame. */
+  void onVSync();
+
+private:
+  FrameDriver() = default;
+
+  struct Pending {
+    std::shared_ptr<SurfaceInfo> surface;
+    std::shared_ptr<async::RuntimeScheduler> scheduler;
+  };
+
+  // Number of consecutive empty frames before the vsync source is stopped.
+  // A small grace period avoids start/stop thrash during continuous rendering.
+  static constexpr int kMaxIdleFrames = 3;
+
+  std::mutex _mutex;
+  std::unordered_map<int, Pending> _pending;
+  std::function<void()> _start;
+  std::function<void()> _stop;
+  bool _running = false;
+  int _idleFrames = 0;
+};
+
+} // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
index 7a2c32886..2eb76c0b4 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
@@ -32,7 +32,15 @@ void GPUCanvasContext::configure(
 
 void GPUCanvasContext::unconfigure() {}
 
-std::shared_ptr<GPUTexture> GPUCanvasContext::getCurrentTexture() {
+jsi::Value GPUCanvasContext::getCurrentTexture(jsi::Runtime &runtime,
+                                               const jsi::Value & /*thisValue*/,
+                                               const jsi::Value * /*args*/,
+                                               size_t /*count*/) {
+  // Main JS runtime owns a RuntimeContext; worklet runtimes (Reanimated UI /
+  // dedicated, Vision Camera frame processors, …) do not.
+  auto runtimeContext = async::RuntimeContext::get(runtime);
+  const bool isMainRuntime = runtimeContext != nullptr;
+
   auto prevSize = _surfaceInfo->getConfig();
   auto width = _canvas->getWidth();
   auto height = _canvas->getHeight();
@@ -40,27 +48,46 @@ std::shared_ptr<GPUTexture> GPUCanvasContext::getCurrentTexture() {
   if (sizeHasChanged) {
     _surfaceInfo->reconfigure(width, height);
   }
+
+  // Worklet-runtime auto-present: present the PREVIOUS frame synchronously on
+  // this thread, just before acquiring the next texture. By now that frame's
+  // submit has already happened (during the previous frame's work), and this
+  // runs on the same thread that did getCurrentTexture/submit — preserving Dawn
+  // surface thread-affinity. We can't use the UI-thread FrameDriver here, and
+  // scheduling onto the worklet runtime's own task queue is unsafe in general
+  // (e.g. Vision Camera's queue hops through JNI and crashes off the JS
+  // thread), so we present inline at the natural swapchain boundary instead.
+  if (!isMainRuntime && _hasUnpresentedFrame && _surfaceInfo->hasSurface()) {
+    _surfaceInfo->presentFrame();
+    _hasUnpresentedFrame = false;
+  }
+
   auto texture = _surfaceInfo->getCurrentTexture();
 
-  // Auto-present: acquiring the current texture schedules a present for this
-  // surface at the next vsync (spec-aligned "update the rendering" after the
-  // frame). Replaces the old explicit context.present(). Offscreen surfaces
-  // have no wgpu::Surface, so skip them (their texture is read back directly).
   auto size = _surfaceInfo->getSize();
   _canvas->setClientWidth(size.width);
   _canvas->setClientHeight(size.height);
+
+  // Auto-present: acquiring the current texture arranges for this frame to be
+  // presented (spec-aligned "update the rendering" after the frame). Replaces
+  // the old explicit context.present(). Offscreen surfaces have no
+  // wgpu::Surface, so skip them (their texture is read back directly).
   if (_surfaceInfo->hasSurface()) {
-    // Phase 2: dispatch the present on the main runtime (the only runtime that
-    // owns WebGPU rendering today). Phase 3 will tag this with the *calling*
-    // runtime so worklet-runtime rendering (e.g. the Reanimated example)
-    // presents on its own JS thread, preserving Dawn surface thread-affinity.
-    FrameDriver::getInstance().requestPresent(_contextId, _surfaceInfo,
-                                              _gpu->getContext()->scheduler());
+    if (isMainRuntime) {
+      // Main runtime: drive present from the global vsync FrameDriver (handles
+      // one-shot renders too, since it presents the current frame at vsync).
+      FrameDriver::getInstance().requestPresent(_contextId, _surfaceInfo,
+                                                runtimeContext->scheduler());
+    } else {
+      // Worklet runtime: present at the next acquire (see above).
+      _hasUnpresentedFrame = true;
+    }
   }
 
   // Pass reportsMemoryPressure=false to avoid triggering spurious Hermes GC
   // cycles every frame since the canvas texture doesn't own the buffer.
-  return std::make_shared<GPUTexture>(texture, "", false);
+  auto gpuTexture = std::make_shared<GPUTexture>(texture, "", false);
+  return JSIConverter<std::shared_ptr<GPUTexture>>::toJSI(runtime, gpuTexture);
 }
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
index 2ab5d69c2..bdf6bee8c 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
@@ -53,13 +53,21 @@ class GPUCanvasContext : public NativeObject<GPUCanvasContext> {
   inline const wgpu::Surface get() { return nullptr; }
   void configure(std::shared_ptr<GPUCanvasConfiguration> configuration);
   void unconfigure();
-  std::shared_ptr<GPUTexture> getCurrentTexture();
+  // Full-control signature so we can learn the *calling* runtime and route the
+  // auto-present onto its own thread (main runtime → FrameDriver vsync; worklet
+  // runtime → presented inline at the next getCurrentTexture).
+  jsi::Value getCurrentTexture(jsi::Runtime &runtime,
+                               const jsi::Value &thisValue,
+                               const jsi::Value *args, size_t count);
 
 private:
   int _contextId;
   std::shared_ptr<Canvas> _canvas;
   std::shared_ptr<SurfaceInfo> _surfaceInfo;
   std::shared_ptr<GPU> _gpu;
+  // For worklet-runtime auto-present: true when a frame was acquired on a
+  // worklet runtime and not yet presented (presented at the next acquire).
+  bool _hasUnpresentedFrame = false;
 };
 
 } // namespace rnwgpu

From ba9efe94ecf8942f9e834a75422447ee027505ad Mon Sep 17 00:00:00 2001
From: William Candillon <wcandillon@gmail.com>
Date: Tue, 2 Jun 2026 17:33:07 +0200
Subject: [PATCH 3/4] :wrench:

---
 README.md                                     | 15 +++-
 apps/example/src/Reanimated/Reanimated.tsx    |  3 +
 .../example/src/VisionCamera/VisionCamera.tsx |  3 +
 docs/refactor-async-present-plan.md           | 65 +++++++++------
 packages/webgpu/README.md                     | 15 +++-
 .../cpp/rnwgpu/api/GPUCanvasContext.cpp       | 80 +++++++++++--------
 .../webgpu/cpp/rnwgpu/api/GPUCanvasContext.h  | 12 +--
 packages/webgpu/src/Canvas.tsx                | 15 +++-
 packages/webgpu/src/Offscreen.ts              |  4 +
 packages/webgpu/src/WebPolyfillGPUModule.ts   |  5 +-
 packages/webgpu/src/types.ts                  | 15 +++-
 11 files changed, 159 insertions(+), 73 deletions(-)

diff --git a/README.md b/README.md
index d7415053b..433d498fa 100644
--- a/README.md
+++ b/README.md
@@ -172,7 +172,7 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-Frame presentation is automatic. Once you acquire the frame's texture with `context.getCurrentTexture()` and submit your commands, the frame is presented on the next display refresh (driven by a global vsync source: `CADisplayLink` on iOS, `Choreographer` on Android). There is no `present()` call.
+On the **main JS runtime** and the **Reanimated UI runtime**, frame presentation is automatic: once you acquire the frame's texture with `context.getCurrentTexture()` and submit your commands, the frame is presented on the next display refresh (driven by a global vsync source: `CADisplayLink` on iOS, `Choreographer` on Android). There is no `present()` call.
 
 ```tsx
 // draw
@@ -181,6 +181,19 @@ device.queue.submit([commandEncoder.finish()]);
 // The frame is presented automatically on the next vsync.
 ```
 
+When you render from a **dedicated worklet runtime** (e.g. `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor), it runs on its own thread where present can't be driven automatically. Call `context.present()` yourself after submitting:
+
+```tsx
+const onFrame = () => {
+  "worklet";
+  // draw on the dedicated runtime's thread
+  device.queue.submit([commandEncoder.finish()]);
+  context.present(); // required on dedicated worklet runtimes; a no-op on JS/UI
+};
+```
+
+`present()` is safe to call from a worklet that runs on either the UI runtime or a dedicated runtime: it presents on the dedicated runtime and does nothing on the JS/UI runtime (which auto-present).
+
 ### Canvas Transparency
 
 On Android, the `alphaMode` property is ignored when configuring the canvas.
diff --git a/apps/example/src/Reanimated/Reanimated.tsx b/apps/example/src/Reanimated/Reanimated.tsx
index 2f8b5e5cb..3761c90f9 100644
--- a/apps/example/src/Reanimated/Reanimated.tsx
+++ b/apps/example/src/Reanimated/Reanimated.tsx
@@ -78,6 +78,9 @@ export const webGPUDemo = (
     passEncoder.end();
 
     device.queue.submit([commandEncoder.finish()]);
+    // Needed on a dedicated worklet runtime (DedicatedThread); a no-op on the
+    // UI runtime (UIThread), where present is automatic.
+    context.present();
 
     if (runAnimation.value) {
       requestAnimationFrame(frame);
diff --git a/apps/example/src/VisionCamera/VisionCamera.tsx b/apps/example/src/VisionCamera/VisionCamera.tsx
index cba2d2948..f6c6c95bd 100644
--- a/apps/example/src/VisionCamera/VisionCamera.tsx
+++ b/apps/example/src/VisionCamera/VisionCamera.tsx
@@ -613,6 +613,9 @@ const CameraView = () => {
           pass.draw(3);
           pass.end();
           device.queue.submit([encoder.finish()]);
+          // Vision Camera frame processors run on a dedicated worklet runtime,
+          // so present explicitly (auto-present only covers the JS/UI runtime).
+          context.present();
           // The work sampling it is submitted, so end the external texture's
           // access window now to release the camera frame's surface promptly
           // (don't wait for GC, which would starve the frame buffer pool).
diff --git a/docs/refactor-async-present-plan.md b/docs/refactor-async-present-plan.md
index 65490af29..82e0de054 100644
--- a/docs/refactor-async-present-plan.md
+++ b/docs/refactor-async-present-plan.md
@@ -308,36 +308,49 @@ happened to land on the right thread), but the **dedicated `createWorkletRuntime
 (`Reanimated/DedicatedThread.tsx`, `runOnRuntime`) crashed — its render thread is its own, so a
 main-thread present violated Dawn surface thread-affinity.
 
-**Decision (confirmed with the user): self-scheduled present, no native worklets dependency.**
-Rather than link `react-native-worklets` natively and have the FrameDriver dispatch via
-`WorkletRuntime::schedule` (the original plan / Spike 1 primary), worklet runtimes now schedule
-their own present on their own event loop. This avoids a new native build dependency entirely
-and is fully buildable/validatable locally (it is Spike 1's documented "JS-scheduling"
-contingency).
-
-Implementation (native only; no JS/build-system changes):
+**Decision (confirmed with the user): auto-present on the JS + UI runtimes, explicit
+`ctx.present()` on dedicated worklet runtimes. No native worklets dependency.** Rather than link
+`react-native-worklets` natively and dispatch via `WorkletRuntime::schedule` (the original plan /
+Spike 1 primary), the FrameDriver covers the JS and UI runtimes; dedicated runtimes — which run
+on their own thread with no safe scheduler/vsync hook — keep an explicit `present()`. (A
+scheduler-free auto path for dedicated runtimes was prototyped but rejected — see below — because
+it added one frame of latency and never presented a one-shot frame.) This needs no new native
+build dependency and is fully buildable/validatable locally.
+
+Implementation:
 - `GPUCanvasContext::getCurrentTexture` switched to the full-control HostFunction signature
   (`jsi::Value(rt, thisVal, args, count)`, same pattern as `RNWebGPU::createImageBitmap`) so it
-  learns the **calling** runtime. New `schedulePresent(runtime)`:
+  learns the **calling** runtime. Present routing:
   - **Main runtime** (`RuntimeContext::get(runtime)` is non-null): unchanged — register with the
     global vsync `FrameDriver` using that runtime's scheduler.
-  - **Any worklet runtime** (no `RuntimeContext` — Reanimated UI/dedicated, Vision Camera frame
-    processors, …): **present-on-next-acquire**. `getCurrentTexture` presents the *previous*
-    frame synchronously (inline, on the calling thread) just before acquiring the next texture;
-    by then the previous frame's submit has happened, and present runs on the same thread that
-    rendered it. This is the natural swapchain boundary and needs no scheduler.
-
-    Why not schedule onto the runtime's own loop: two earlier attempts failed. (1)
-    `queueMicrotask` is **disabled** on worklet runtimes (throws "microtasks are disabled in this
-    runtime"). (2) `setImmediate`/`setTimeout` exist but route through the runtime's `EventLoop`
-    `AsyncQueue`, which for **Vision Camera** is a custom `NativeThreadAsyncQueue` that hops back
-    through JNI (`fbjni Environment::current()`) and **crashes** when pushed from a
-    non-JVM-attached thread. Present-on-next-acquire avoids the runtime's task queue entirely.
-    Trade-off: one frame of latency, and a worklet that renders exactly once would not present
-    its single frame (continuous loops — rAF, camera frames — are unaffected; the main runtime's
-    one-shot case is covered by the FrameDriver).
-- `Reanimated.tsx` already had `present()` removed in Phase 2; `DedicatedThread.tsx` /
-  `UIThread.tsx` need no changes.
+  - **Reanimated UI runtime** (`globalThis.__RUNTIME_KIND === 2`, worklets' `RuntimeKind::UI`):
+    also auto-present via the FrameDriver + main scheduler. The UI runtime is reached correctly
+    by this path (Phase 2 confirmed it), so no `present()` is needed.
+  - **Dedicated worklet runtimes** (`RuntimeKind::Worker`, or any untagged/unknown worklet
+    runtime — e.g. Vision Camera frame processors): **explicit `ctx.present()`**, kept in the
+    public API for exactly this case. They run on their own thread with no safe scheduler/vsync
+    hook, so present is called synchronously by the author after `submit`, on that thread
+    (preserving Dawn surface thread-affinity).
+
+  `ctx.present()` is a **no-op on the JS / UI runtime** (they auto-present), which makes it safe
+  to call from a worklet shared between the UI and a dedicated runtime (the example's
+  `webGPUDemo`). Runtime classification uses `RuntimeContext::get(rt)` (main) and the stable
+  worklets global `__RUNTIME_KIND` (`ReactNative=1`, `UI=2`, `Worker=3`); no worklets headers
+  are linked.
+
+  Two scheduler-based approaches were tried and rejected before landing here: (1)
+  `queueMicrotask` is **disabled** on worklet runtimes (throws); (2) `setImmediate`/`setTimeout`
+  exist but route through the runtime's `EventLoop` `AsyncQueue`, which for **Vision Camera** is
+  a custom `NativeThreadAsyncQueue` that hops through JNI (`fbjni Environment::current()`) and
+  **crashes** when pushed from a non-JVM-attached thread. A scheduler-free
+  "present-on-next-acquire" fallback worked everywhere but added one frame of latency and never
+  presented a one-shot frame, so the explicit-`present()`-on-dedicated split was chosen instead.
+- JS surface: `present()` re-added to `RNCanvasContext` (`src/Canvas.tsx`, `src/types.ts`,
+  documented dedicated-only) and as a no-op on `Offscreen.ts` / `WebPolyfillGPUModule.ts`. Native
+  `GPUCanvasContext::present` re-added (full-control signature; no-op on auto-presented runtimes).
+- Examples: `present()` re-added to `Reanimated/Reanimated.tsx`'s shared `webGPUDemo` (no-op on
+  UIThread, real on DedicatedThread) and to `VisionCamera.tsx`'s frame processor. Both READMEs'
+  "Frame Scheduling" sections document the JS/UI-auto vs dedicated-manual split.
 
 Known limitation (out of scope, examples don't hit it): **async ops** (`mapAsync`,
 `onSubmittedWorkDone`, …) invoked *on a worklet runtime* still settle their Promise via the
diff --git a/packages/webgpu/README.md b/packages/webgpu/README.md
index d7415053b..433d498fa 100644
--- a/packages/webgpu/README.md
+++ b/packages/webgpu/README.md
@@ -172,7 +172,7 @@ ctx.canvas.height = ctx.canvas.clientHeight * PixelRatio.get();
 
 ### Frame Scheduling
 
-Frame presentation is automatic. Once you acquire the frame's texture with `context.getCurrentTexture()` and submit your commands, the frame is presented on the next display refresh (driven by a global vsync source: `CADisplayLink` on iOS, `Choreographer` on Android). There is no `present()` call.
+On the **main JS runtime** and the **Reanimated UI runtime**, frame presentation is automatic: once you acquire the frame's texture with `context.getCurrentTexture()` and submit your commands, the frame is presented on the next display refresh (driven by a global vsync source: `CADisplayLink` on iOS, `Choreographer` on Android). No `present()` call is needed on these runtimes.
 
 ```tsx
 // draw
@@ -181,6 +181,19 @@ device.queue.submit([commandEncoder.finish()]);
 // The frame is presented automatically on the next vsync.
 ```
 
+When you render from a **dedicated worklet runtime** (e.g. `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame processor), it runs on its own thread where present can't be driven automatically. Call `context.present()` yourself after submitting:
+
+```tsx
+const onFrame = () => {
+  "worklet";
+  // draw on the dedicated runtime's thread
+  device.queue.submit([commandEncoder.finish()]);
+  context.present(); // required on dedicated worklet runtimes; a no-op on JS/UI
+};
+```
+
+`present()` is safe to call from a worklet that runs on either the UI runtime or a dedicated runtime: it presents the frame on a dedicated runtime and does nothing on the main JS and UI runtimes (which auto-present).
+
 ### Canvas Transparency
 
 On Android, the `alphaMode` property is ignored when configuring the canvas.
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
index 2eb76c0b4..c4390ba6d 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.cpp
@@ -6,6 +6,29 @@
 
 namespace rnwgpu {
 
+namespace {
+// Runtimes whose present is automatic (no ctx.present() needed): the main JS
+// runtime and the Reanimated UI runtime. Both are reached correctly by the
+// global vsync FrameDriver dispatching through the main runtime's scheduler.
+// Dedicated worklet runtimes (createWorkletRuntime, Vision Camera frame
+// processors, …) run on their own thread with no safe scheduler hook, so they
+// present explicitly via ctx.present().
+bool isAutoPresentedRuntime(jsi::Runtime &runtime) {
+  if (async::RuntimeContext::get(runtime) != nullptr) {
+    return true; // main JS runtime
+  }
+  // Worklets tags every runtime with a numeric `__RUNTIME_KIND`
+  // (worklets::RuntimeKind: ReactNative=1, UI=2, Worker=3). Auto-present only
+  // the UI runtime; treat Worker / unknown / untagged as needing ctx.present().
+  auto kind = runtime.global().getProperty(runtime, "__RUNTIME_KIND");
+  if (kind.isNumber()) {
+    constexpr int kRuntimeKindUI = 2;
+    return static_cast<int>(kind.asNumber()) == kRuntimeKindUI;
+  }
+  return false;
+}
+} // namespace
+
 void GPUCanvasContext::configure(
     std::shared_ptr<GPUCanvasConfiguration> configuration) {
   Convertor conv;
@@ -36,11 +59,6 @@ jsi::Value GPUCanvasContext::getCurrentTexture(jsi::Runtime &runtime,
                                                const jsi::Value & /*thisValue*/,
                                                const jsi::Value * /*args*/,
                                                size_t /*count*/) {
-  // Main JS runtime owns a RuntimeContext; worklet runtimes (Reanimated UI /
-  // dedicated, Vision Camera frame processors, …) do not.
-  auto runtimeContext = async::RuntimeContext::get(runtime);
-  const bool isMainRuntime = runtimeContext != nullptr;
-
   auto prevSize = _surfaceInfo->getConfig();
   auto width = _canvas->getWidth();
   auto height = _canvas->getHeight();
@@ -49,39 +67,21 @@ jsi::Value GPUCanvasContext::getCurrentTexture(jsi::Runtime &runtime,
     _surfaceInfo->reconfigure(width, height);
   }
 
-  // Worklet-runtime auto-present: present the PREVIOUS frame synchronously on
-  // this thread, just before acquiring the next texture. By now that frame's
-  // submit has already happened (during the previous frame's work), and this
-  // runs on the same thread that did getCurrentTexture/submit — preserving Dawn
-  // surface thread-affinity. We can't use the UI-thread FrameDriver here, and
-  // scheduling onto the worklet runtime's own task queue is unsafe in general
-  // (e.g. Vision Camera's queue hops through JNI and crashes off the JS
-  // thread), so we present inline at the natural swapchain boundary instead.
-  if (!isMainRuntime && _hasUnpresentedFrame && _surfaceInfo->hasSurface()) {
-    _surfaceInfo->presentFrame();
-    _hasUnpresentedFrame = false;
-  }
-
   auto texture = _surfaceInfo->getCurrentTexture();
 
   auto size = _surfaceInfo->getSize();
   _canvas->setClientWidth(size.width);
   _canvas->setClientHeight(size.height);
 
-  // Auto-present: acquiring the current texture arranges for this frame to be
-  // presented (spec-aligned "update the rendering" after the frame). Replaces
-  // the old explicit context.present(). Offscreen surfaces have no
-  // wgpu::Surface, so skip them (their texture is read back directly).
-  if (_surfaceInfo->hasSurface()) {
-    if (isMainRuntime) {
-      // Main runtime: drive present from the global vsync FrameDriver (handles
-      // one-shot renders too, since it presents the current frame at vsync).
-      FrameDriver::getInstance().requestPresent(_contextId, _surfaceInfo,
-                                                runtimeContext->scheduler());
-    } else {
-      // Worklet runtime: present at the next acquire (see above).
-      _hasUnpresentedFrame = true;
-    }
+  // Auto-present on the JS / UI runtime: acquiring the current texture
+  // schedules a present for this surface at the next vsync (spec-aligned
+  // "update the rendering" after the frame), dispatched through the main
+  // runtime's scheduler. Dedicated worklet runtimes instead call ctx.present()
+  // explicitly on their own thread. Offscreen surfaces have no wgpu::Surface,
+  // so skip them (their texture is read back directly).
+  if (_surfaceInfo->hasSurface() && isAutoPresentedRuntime(runtime)) {
+    FrameDriver::getInstance().requestPresent(_contextId, _surfaceInfo,
+                                              _gpu->getContext()->scheduler());
   }
 
   // Pass reportsMemoryPressure=false to avoid triggering spurious Hermes GC
@@ -90,4 +90,20 @@ jsi::Value GPUCanvasContext::getCurrentTexture(jsi::Runtime &runtime,
   return JSIConverter<std::shared_ptr<GPUTexture>>::toJSI(runtime, gpuTexture);
 }
 
+jsi::Value GPUCanvasContext::present(jsi::Runtime &runtime,
+                                     const jsi::Value & /*thisValue*/,
+                                     const jsi::Value * /*args*/,
+                                     size_t /*count*/) {
+  // Only meaningful on a dedicated worklet runtime, where present can't be
+  // automated. On the JS / UI runtime present is automatic, so this is a no-op
+  // there — which makes it safe to call from a worklet shared between the UI
+  // runtime and a dedicated runtime. Presents synchronously on the calling
+  // thread (the one that did getCurrentTexture / submit), preserving Dawn
+  // surface thread-affinity.
+  if (!isAutoPresentedRuntime(runtime) && _surfaceInfo->hasSurface()) {
+    _surfaceInfo->presentFrame();
+  }
+  return jsi::Value::undefined();
+}
+
 } // namespace rnwgpu
diff --git a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
index bdf6bee8c..a2e80b7cc 100644
--- a/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
+++ b/packages/webgpu/cpp/rnwgpu/api/GPUCanvasContext.h
@@ -47,27 +47,27 @@ class GPUCanvasContext : public NativeObject<GPUCanvasContext> {
                   &GPUCanvasContext::unconfigure);
     installMethod(runtime, prototype, "getCurrentTexture",
                   &GPUCanvasContext::getCurrentTexture);
+    installMethod(runtime, prototype, "present", &GPUCanvasContext::present);
   }
 
   // TODO: is this ok?
   inline const wgpu::Surface get() { return nullptr; }
   void configure(std::shared_ptr<GPUCanvasConfiguration> configuration);
   void unconfigure();
-  // Full-control signature so we can learn the *calling* runtime and route the
-  // auto-present onto its own thread (main runtime → FrameDriver vsync; worklet
-  // runtime → presented inline at the next getCurrentTexture).
+  // Full-control signatures so we can learn the *calling* runtime and decide
+  // how this frame is presented (auto on the JS / UI runtime; explicit
+  // ctx.present() on a dedicated worklet runtime).
   jsi::Value getCurrentTexture(jsi::Runtime &runtime,
                                const jsi::Value &thisValue,
                                const jsi::Value *args, size_t count);
+  jsi::Value present(jsi::Runtime &runtime, const jsi::Value &thisValue,
+                     const jsi::Value *args, size_t count);
 
 private:
   int _contextId;
   std::shared_ptr<Canvas> _canvas;
   std::shared_ptr<SurfaceInfo> _surfaceInfo;
   std::shared_ptr<GPU> _gpu;
-  // For worklet-runtime auto-present: true when a frame was acquired on a
-  // worklet runtime and not yet presented (presented at the next acquire).
-  bool _hasUnpresentedFrame = false;
 };
 
 } // namespace rnwgpu
diff --git a/packages/webgpu/src/Canvas.tsx b/packages/webgpu/src/Canvas.tsx
index 7c2a47a6e..43c9621e7 100644
--- a/packages/webgpu/src/Canvas.tsx
+++ b/packages/webgpu/src/Canvas.tsx
@@ -19,9 +19,18 @@ export interface NativeCanvas {
   clientHeight: number;
 }
 
-// Auto-present (a global vsync FrameDriver) replaces the old manual present();
-// the native context is now just a spec GPUCanvasContext.
-export type RNCanvasContext = GPUCanvasContext;
+export type RNCanvasContext = GPUCanvasContext & {
+  /**
+   * Present the current frame.
+   *
+   * Only needed when rendering from a **dedicated worklet runtime** (e.g.
+   * `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame
+   * processor), which runs on its own thread. On the main JS runtime and the
+   * Reanimated UI runtime present is automatic (driven by a global vsync), so
+   * calling this there is a no-op. Call it after `queue.submit()`.
+   */
+  present: () => void;
+};
 
 export interface CanvasRef {
   getContextId: () => number;
diff --git a/packages/webgpu/src/Offscreen.ts b/packages/webgpu/src/Offscreen.ts
index 6ce2f589c..4deab8a1c 100644
--- a/packages/webgpu/src/Offscreen.ts
+++ b/packages/webgpu/src/Offscreen.ts
@@ -64,6 +64,10 @@ class GPUOffscreenCanvasContext implements GPUCanvasContext {
     throw new Error("Method not implemented.");
   }
 
+  present() {
+    // Offscreen contexts have nothing to present; readback is via getImageData.
+  }
+
   getDevice() {
     if (!this.device) {
       throw new Error("Device is not configured.");
diff --git a/packages/webgpu/src/WebPolyfillGPUModule.ts b/packages/webgpu/src/WebPolyfillGPUModule.ts
index 04229cd05..8b629a0c9 100644
--- a/packages/webgpu/src/WebPolyfillGPUModule.ts
+++ b/packages/webgpu/src/WebPolyfillGPUModule.ts
@@ -39,7 +39,10 @@ function makeWebGPUCanvasContext(
     canvas.setAttribute("height", pixelHeight);
   }
 
-  return canvas.getContext("webgpu")!;
+  const context = canvas.getContext("webgpu")!;
+  // On web there is no manual present; expose a no-op so RNCanvasContext's
+  // present() (used on native dedicated worklet runtimes) is callable here too.
+  return Object.assign(context, { present: () => {} });
 }
 
 // @ts-expect-error - polyfill for RNWebGPU native module
diff --git a/packages/webgpu/src/types.ts b/packages/webgpu/src/types.ts
index 0758c73f4..1608a4ff0 100644
--- a/packages/webgpu/src/types.ts
+++ b/packages/webgpu/src/types.ts
@@ -8,9 +8,18 @@ export interface NativeCanvas {
   clientHeight: number;
 }
 
-// Auto-present (a global vsync FrameDriver) replaces the old manual present();
-// the native context is now just a spec GPUCanvasContext.
-export type RNCanvasContext = GPUCanvasContext;
+export type RNCanvasContext = GPUCanvasContext & {
+  /**
+   * Present the current frame.
+   *
+   * Only needed when rendering from a **dedicated worklet runtime** (e.g.
+   * `createWorkletRuntime` / `runOnRuntime`, or a Vision Camera frame
+   * processor), which runs on its own thread. On the main JS runtime and the
+   * Reanimated UI runtime present is automatic (driven by a global vsync), so
+   * calling this there is a no-op. Call it after `queue.submit()`.
+   */
+  present: () => void;
+};
 
 export interface CanvasRef {
   getContextId: () => number;

From 0696aaa049c3ad89bb1b3c51535e442ea52fe82c Mon Sep 17 00:00:00 2001
From: William Candillon <wcandillon@gmail.com>
Date: Tue, 2 Jun 2026 18:00:49 +0200
Subject: [PATCH 4/4] Delete docs/refactor-async-present-plan.md

---
 docs/refactor-async-present-plan.md | 442 ----------------------------
 1 file changed, 442 deletions(-)
 delete mode 100644 docs/refactor-async-present-plan.md

diff --git a/docs/refactor-async-present-plan.md b/docs/refactor-async-present-plan.md
deleted file mode 100644
index 82e0de054..000000000
--- a/docs/refactor-async-present-plan.md
+++ /dev/null
@@ -1,442 +0,0 @@
-# Refactor: event-driven async + auto-present
-
-Status: **Phases 1–3 complete (local build/lint green). Phase 4 (SurfaceRegistry rework) proposed; Phase 5 = on-device validation.**
-Branch: `claude/keen-darwin-xeywa`
-
-This document is the handoff for moving the async + present refactor forward. Phase 0
-(spikes) needs a real local machine: installed `node_modules`, a Dawn build, and the
-iOS/Android toolchains. Everything below the "How to resume locally" section is meant to
-be executed on your computer, not in the web container.
-
----
-
-## Goals (locked)
-
-- **Async**: replace the JS-thread polling loop with a **background `WaitAny` GPU thread**
-  (Dawn `TimedWaitAny` is already enabled — `packages/webgpu/cpp/rnwgpu/api/GPU.cpp:17-23`).
-- **Present**: **remove `context.present()` entirely** (breaking) in favor of a **global
-  Choreographer / CADisplayLink-driven auto-present**.
-- **Scope**: first-class for **all runtimes** — main JS, the reanimated UI runtime, and
-  `createWorkletRuntime` dedicated runtimes.
-
----
-
-## What exists today (the two problems)
-
-### Async (polling) — `packages/webgpu/cpp/rnwgpu/async/`
-- Every async op (`requestAdapter`, `requestDevice`, `mapAsync`, `onSubmittedWorkDone`,
-  `createRender/ComputePipelineAsync`, `popErrorScope`) registers a Dawn callback with
-  `CallbackMode::AllowProcessEvents` and calls `AsyncRunner::postTask`.
-- `AsyncRunner::requestTick` (`async/AsyncRunner.cpp:89-177`) schedules `tick()` via
-  `setImmediate` / `setTimeout(4ms)` / `queueMicrotask`; `tick()` calls
-  `_instance.ProcessEvents()` and **re-schedules itself while any task is "pumping"**
-  (`AsyncRunner.cpp:189-191`). This is a busy reschedule loop: wasted CPU when idle, added
-  latency, and `JSIMicrotaskDispatcher`'s `queueMicrotask` dispatch is only thread-safe when
-  called on the runtime's own thread.
-
-### Present (manual, non-standard)
-`api/GPUCanvasContext.cpp:56-65` → `SurfaceRegistry.h:116-121` → `wgpu::Surface::Present()`.
-The user must call `context.present()` after every `queue.submit` (**16 JS/TS call sites**).
-No CADisplayLink/Choreographer exists; RN's `requestAnimationFrame` is the only frame driver.
-On Apple, present also does a blocking `WaitForCommandsToBeScheduled` on the JS thread.
-
----
-
-## Target architecture
-
-Three new pieces:
-
-### A. `RuntimeScheduler` — thread-safe "post to this runtime's JS thread"
-Replaces `AsyncDispatcher` / `JSIMicrotaskDispatcher` (which use non-thread-safe
-`queueMicrotask`).
-- Interface: `void scheduleOnJS(std::function<void(jsi::Runtime&)>)`, callable from any thread.
-- **Main runtime**: wraps `react::CallInvoker::invokeAsync` (already available —
-  `apple/WebGPUModule.mm:70`, `android/cpp/cpp-adapter.cpp:25-29`).
-- **Worklet runtimes**: wraps the worklet runtime's own thread executor from
-  `react-native-worklets` 0.8.3 (**see Phase 0 spike #1**).
-- Stored per-runtime in a `RuntimeContext` (the "per-JS-thread event loop"), created on first
-  WebGPU use, torn down via the existing `RuntimeLifecycleMonitor` / `RuntimeAwareCache`
-  (`cpp/jsi/RuntimeAwareCache.h`).
-
-### B. `GpuEventLoop` — background `WaitAny` thread (no polling)
-One per `wgpu::Instance` (effectively global).
-- All async sites switch `CallbackMode::AllowProcessEvents` → **`CallbackMode::WaitAnyOnly`**,
-  returning a `wgpu::Future`.
-- A **small bounded thread pool**; each pending future is waited via
-  `instance.WaitAny(future, /*timeout*/UINT64_MAX)` on a pool thread → genuinely event-driven,
-  **zero idle CPU**, resolves the instant GPU work completes. No wake/interrupt problem (each
-  thread owns one future). **See Phase 0 spike #2.**
-- On completion the worker marshals the result and calls the owning runtime's
-  `RuntimeScheduler.scheduleOnJS` to settle the JS Promise. `AsyncTaskHandle` / `Promise`
-  settle logic is reused; `AsyncRunner` + its tick loop are deleted.
-- Fallback (if concurrent `WaitAny` on one instance is unsafe): single worker thread waiting on
-  the batched future set with a condition-variable re-arm.
-
-### C. `FrameDriver` — global vsync source for auto-present
-One UI-thread singleton; removes the need for `present()`.
-- **iOS**: `CADisplayLink` on the main run loop. **Android**: NDK
-  `AChoreographer_postFrameCallback` from C++ (API 24+, avoids JNI). **See Phase 0 spike #3.**
-- Lifecycle: started when ≥1 surface is configured, stopped at 0.
-- **Auto-present semantics** (spec-aligned "update the rendering" after rAF):
-  1. `GPUCanvasContext::getCurrentTexture()` marks its `SurfaceInfo` dirty and registers a
-     present request with `FrameDriver`, tagged with the owning runtime.
-  2. Each vsync (UI thread), `FrameDriver` dispatches each dirty context's present onto its
-     **owning runtime's `RuntimeScheduler`** — so `Surface.Present()` + the Apple Metal
-     scheduling wait run on the same thread that did `getCurrentTexture` / `submit`, preserving
-     Dawn surface thread-affinity and guaranteeing present-after-submit ordering (FIFO on that
-     loop). Clear dirty after present.
-- Offscreen path (`SurfaceRegistry` `switchToOffscreen`, `src/Offscreen.ts`) has no surface →
-  present is a no-op; tests keep reading back the CPU texture.
-
----
-
-## Phase 0 — Local spikes (DO THESE FIRST, on your machine)
-
-These de-risk the refactor before any large change. Run from repo root.
-
-```bash
-# 0. install deps (web container can't do this)
-yarn install
-```
-
-### Spike 1 — worklet-runtime scheduler (HIGHEST RISK)
-Goal: obtain a **thread-safe** "schedule this lambda on runtime R's thread" for an arbitrary
-worklet runtime (UI runtime + a `createWorkletRuntime` runtime) using
-`react-native-worklets@0.8.3`.
-
-```bash
-# inspect the worklets native API actually shipped at 0.8.3
-find node_modules/react-native-worklets -name "*.h" | grep -iE "Runtime|Scheduler|Invoker|Queue"
-# look for: WorkletRuntime, RuntimeManager / WorkletsModuleProxy, UIScheduler / JSScheduler,
-# and any per-runtime executor / async queue we can call from a background C++ thread.
-```
-Deliverable: a one-paragraph note on the exact symbol(s) to use (or "not exposed → needs JS
-shim / worklets PR"). This determines whether Phase 3 (first-class worklet runtimes) is cheap
-or needs a workaround.
-
-### Spike 2 — concurrent `WaitAny` on one Dawn instance
-Goal: confirm multiple threads can each call `instance.WaitAny(singleFuture, UINT64_MAX)`
-concurrently on the **same** instance safely. If not, switch `GpuEventLoop` to the
-single-worker + condition-variable fallback.
-- Search Dawn headers/docs in `externals/dawn` (or built `libs/`) for `WaitAny` threading
-  guarantees. A tiny throwaway C++ test against the built Dawn is ideal.
-
-### Spike 3 — Android frame callback
-Goal: confirm NDK `AChoreographer_postFrameCallback` is usable at the project `minSdk`
-(`packages/webgpu/android/build.gradle`). If `minSdk < 24` for that API, plan the Java
-`Choreographer` + JNI bridge instead.
-
----
-
-## Phase 0 — Findings (completed 2026-06-02, branch `claude/keen-darwin-xeywa`)
-
-Environment verified: `node_modules` installed, `externals/dawn` present, RN **0.81.4**,
-`react-native-worklets` **0.8.3**, Android `minSdk` **26**, NDK 26/27 available.
-
-### Spike 1 — worklet-runtime scheduler → **GREEN (symbol exists, thread-safe)**
-`worklets/WorkletRuntime/WorkletRuntime.h` exposes exactly what we need:
-- `WorkletRuntime::schedule(std::function<void(jsi::Runtime &)> job)` — posts `job` onto the
-  runtime's own `AsyncQueue` (`WorkletRuntime.cpp:211-227`). It is **callable from any thread**
-  (the underlying `AsyncQueueImpl` is a mutex+condvar queue; `AsyncQueueUI` forwards to the
-  `UIScheduler`). The job runs on the runtime's event-loop thread, under `runtimeMutex_`, and
-  uses `weak_from_this()` so it is a **safe no-op if the runtime was torn down**. This is a
-  drop-in for `RuntimeScheduler::scheduleOnJS` for worklet runtimes.
-- `WorkletRuntime::getWeakRuntimeFromJSIRuntime(jsi::Runtime &rt)` (RN ≥ 0.81, we have 0.81.4)
-  maps a bare `jsi::Runtime&` → `weak_ptr<WorkletRuntime>`, so the per-runtime
-  `RuntimeContext` can recover the scheduler from any worklet runtime (UI + dedicated
-  `createWorkletRuntime`) with no JS shim.
-
-**Caveat (build wiring, not API):** webgpu does **not** currently link worklets natively
-(no worklets entry in `packages/webgpu/*.podspec` or `android/CMakeLists.txt`; only JS-level
-serialization helpers exist). Phase 3 must add the native dependency:
-- iOS: depend on `RNWorklets` pod (it ships public headers under `worklets/`,
-  `header_dir = "worklets"`).
-- Android: import the worklets **prefab** module `worklets` (`prefabPublishing` is on in
-  `react-native-worklets/android/build.gradle`).
-Worklets is already a `peerDependency`, so this adds no new install. Phase 3 stays cheap; no
-worklets PR or JS shim needed.
-
-### Spike 2 — concurrent `WaitAny` on one instance → **GREEN (designed for it)**
-Dawn's native `EventManager` (`externals/dawn/src/dawn/native/EventManager.{h,cpp}`) is built
-for multi-threaded waits:
-- State is `MutexProtected<EventState>`; `mNextFutureID` is atomic; a code comment
-  (`EventManager.h:78-82`) explicitly notes "another thread can race to complete the event …
-  via a WaitAny call".
-- Each `WaitAny` call with a non-zero timeout creates a **stack-local `Waiter`** with its **own**
-  `MutexCondVarProtected<bool>` (`EventManager.cpp:338`, `:106`), registers it per-FutureID in
-  the shared map, then blocks on its own condvar. `SetFutureReady` signals the registered
-  waiters. → **N threads can each block in `WaitAny` on the same instance concurrently, each
-  owning its own future.** This is exactly the plan's primary "one future per pool thread" model.
-
-**Hard constraint discovered (`EventManager.cpp:341-354`):** within a *single* `WaitAny` call
-with a non-zero timeout, you may **not** mix events from multiple queues, nor a queue event
-together with a non-queue event — it returns `WaitStatus::Error` ("Mixed source waits with
-timeouts are not currently supported"). Note `mapAsync`/`onSubmittedWorkDone` are *queue*
-events while `requestAdapter`/`requestDevice`/`createPipelineAsync`/`popErrorScope` are
-*non-queue* events.
-→ **Implication:** adopt the **per-future-per-thread** design (each pool thread waits on exactly
-one future) — it is single-source and always legal. The plan's stated fallback ("single worker
-waiting on the batched future set") is **not viable** as written, because batching mixed sources
-hits this restriction. If a bounded pool is undesirable, the correct fallback is one
-worker-thread *per future* (still single-source), not one worker for a batched set.
-
-### Spike 3 — Android frame callback → **GREEN (no JNI bridge needed)**
-In `android/choreographer.h`, `AChoreographer_getInstance()` and
-`AChoreographer_postFrameCallback()` are both `__INTRODUCED_IN(24)`; `minSdk` is **26**, so the
-pure-NDK path works with no Java `Choreographer`/JNI bridge.
-- `postFrameCallback` is `__DEPRECATED_IN(29)` in favor of `postFrameCallback64` (API 29) /
-  `postVsyncCallback` (API 33). Recommendation: call `postFrameCallback64` when
-  `android_get_device_api_level() >= 29`, else `postFrameCallback` (works on 26-28). Both are
-  acceptable; the 64-bit variant just avoids the deprecation warning and 32-bit time wrap.
-- `AChoreographer_getInstance()` must be called on a thread with a `Looper` (the main/UI
-  thread) — `FrameDriver` already lives on the UI thread, so this is satisfied.
-
-### Net go/no-go
-All three risks clear. Proceed to Phase 1. Two plan amendments: (1) Phase 3 must add the
-worklets native build dependency (podspec + prefab); (2) `GpuEventLoop` must use
-per-future-per-thread waits (drop the batched-future fallback).
-
-## Implementation phases (after Phase 0)
-
-**Phase 1 — Event-driven async** (no public API change; `present()` untouched) — **DONE**
-- Add `RuntimeScheduler` (+ main-runtime CallInvoker impl) and `GpuEventLoop`.
-- Switch all 7 async sites to `WaitAnyOnly` + `GpuEventLoop.addFuture(...)`:
-  `api/GPU.cpp`, `api/GPUAdapter.cpp`, `api/GPUDevice.cpp` (×3), `api/GPUBuffer.cpp`,
-  `api/GPUQueue.cpp`, `api/GPUShaderModule.cpp`.
-- Delete `async/AsyncRunner.*` polling + `async/JSIMicrotaskDispatcher.*`; keep
-  `AsyncTaskHandle` / `Promise` settle path on the new scheduler.
-
-### Phase 1 — what shipped (branch `claude/keen-darwin-xeywa`)
-New files (`cpp/rnwgpu/async/`):
-- `RuntimeScheduler.h` — interface `scheduleOnJS(std::function<void(jsi::Runtime&)>)`,
-  callable from any thread.
-- `CallInvokerScheduler.{h,cpp}` — main-runtime impl wrapping
-  `react::CallInvoker::invokeAsync(CallFunc&&)` (RN 0.81 delivers the job on the JS thread
-  with the runtime).
-- `GpuEventLoop.{h,cpp}` — background `WaitAny` driver. Lazily-grown bounded worker pool
-  (cap = `clamp(hardware_concurrency, 2, 8)`); each worker does a single-future
-  `instance.WaitAny(future, UINT64_MAX)` (always a legal single-source wait, per Phase 0
-  spike 2). Shared state held behind a `shared_ptr` so detached workers (and the
-  `wgpu::Instance` ref they need) outlive the object safely; teardown sets `running=false`
-  and notifies idle workers without joining in-flight GPU waits.
-
-Deviations from the original plan (intentional):
-1. **`AsyncRunner` was replaced by `RuntimeContext`** (`async/RuntimeContext.{h,cpp}`), the
-   per-runtime coordinator the plan's Target-architecture §A already named. It bundles
-   `{RuntimeScheduler, GpuEventLoop}` and exposes `postTask`; all polling internals
-   (`tick`/`requestTick`/`ProcessEvents`/pump counters) are gone. `AsyncTaskHandle` depends
-   only on `RuntimeScheduler`. The old `AsyncRunner` name/files no longer exist anywhere
-   (the 6 `api/*` classes now hold `std::shared_ptr<async::RuntimeContext> _async`); the dead
-   `GPU::getAsyncRunner()` accessor was deleted.
-2. **`postTask`'s callback now returns a `wgpu::Future`** (the value returned by the Dawn
-   `WaitAnyOnly` call), which `RuntimeContext` hands to `GpuEventLoop.addFuture`. A returned
-   future with `id == 0` means "no event to wait on" and is ignored — used by
-   `GPUDevice::getLost` (resolved synchronously or later via `notifyDeviceLost`). This
-   replaced the old `keepPumping` bool argument, which is gone.
-
-`GPU`'s constructor now takes the `CallInvoker` (threaded through from `RNWebGPUManager`,
-which already held it) to build the `CallInvokerScheduler`. `AsyncDispatcher.h` and
-`JSIMicrotaskDispatcher.{h,cpp}` deleted; `android/CMakeLists.txt` updated (iOS podspec
-globs `cpp/**` so it needs no change).
-
-Validation run locally: all changed + new TUs syntax-check under the Android NDK toolchain;
-the full `react-native-wgpu` native lib **compiles and links** for `arm64-v8a` (ninja);
-`cpplint` clean (project filters); `clang-format` (pinned 15.0.0) applied; `yarn tsc` passes
-(no TS changed). On-device runtime behaviour (frame pacing, zero idle CPU) is Phase 4.
-
-**Phase 2 — Auto-present + remove `present()`** — **DONE**
-- Add `FrameDriver` (iOS `CADisplayLink`, Android `AChoreographer`); wire
-  `getCurrentTexture` → register; vsync → dispatch present to owning runtime.
-- Remove `GPUCanvasContext::present` (`api/GPUCanvasContext.h:50,58`, `.cpp:56-65`) and
-  `SurfaceInfo::present` (`SurfaceRegistry.h:116-121`).
-- JS: drop `present` from `RNCanvasContext` (`src/Canvas.tsx:22-24`, `src/types.ts`).
-- Migrate all 16 example / `useWebGPU` call sites + `README.md` + `packages/webgpu/README.md`.
-
-### Phase 2 — what shipped (branch `claude/keen-darwin-xeywa`)
-New files:
-- `cpp/rnwgpu/FrameDriver.{h,cpp}` — global vsync auto-present coordinator. `requestPresent`
-  (from `getCurrentTexture`, JS thread) coalesces per `contextId`; `onVSync` (UI thread)
-  dispatches each pending surface's present onto its owning runtime's `RuntimeScheduler`
-  (`surface->presentFrame()`). Request-driven: starts the platform vsync on first request,
-  stops after `kMaxIdleFrames` (3) idle frames → zero idle CPU.
-- `apple/WebGPUFrameDriver.{h,mm}` — iOS/tvOS `CADisplayLink` on the main run loop (its
-  `paused` flag is toggled by start/stop). macOS uses `NSScreen.displayLinkWithTarget:` on 14+,
-  else an `NSTimer` fallback. Selector → `FrameDriver::onVSync()`.
-- `android/.../com/webgpu/WebGPUFrameDriver.java` — main-thread `Choreographer` driver;
-  `doFrame` → static `nativeOnVSync()` JNI → `FrameDriver::onVSync()`, reposts while running.
-
-Wiring:
-- `SurfaceInfo::present()` → `presentFrame()` (Apple `WaitForCommandsToBeScheduled` + Present,
-  no-op offscreen); added `SurfaceInfo::hasSurface()`. Metal extern moved to `SurfaceRegistry.h`.
-- `GPU::getContext()` re-exposes the per-runtime `RuntimeContext` (so the canvas can reach its
-  scheduler). `GPUCanvasContext` stores `_contextId`, registers the present in
-  `getCurrentTexture` (and now sets the canvas client size there), and dropped `present()` +
-  its JS binding.
-- iOS `WebGPUModule install` and Android `initializeNative` register `setPlatformVSync`. View
-  teardown (`MetalView dealloc`, Android `onSurfaceDestroy`) calls `FrameDriver::cancelPresent`.
-- JS: `RNCanvasContext` is now just `GPUCanvasContext` (`src/Canvas.tsx`, `src/types.ts`);
-  removed the no-op `present` from `Offscreen.ts` and `WebPolyfillGPUModule.ts`. 18 example
-  call sites (the plan's 16 + `VisionCamera`, `ImportExternalTexture`) and both READMEs migrated.
-
-Decisions / deviations:
-1. **Android vsync = Java `Choreographer` + JNI** (not pure NDK `AChoreographer`), chosen for
-   robustness — pure NDK needs a JNI hop to a Looper thread to bootstrap anyway. Confirmed with
-   the user.
-2. **`present()` hard-removed** (breaking), confirmed with the user.
-3. **Owning-runtime caveat (→ Phase 3):** `getCurrentTexture` currently dispatches present via
-   the **main** runtime's scheduler (`_gpu->getContext()`). Correct for main-JS rendering. The
-   Reanimated example renders on the **UI (worklet) runtime**, so its present is migrated (call
-   removed) but auto-present won't target the correct thread until Phase 3 tags the present with
-   the *calling* runtime and gives worklet runtimes their own `RuntimeScheduler`. Expect the
-   Reanimated/Dedicated examples to be visually broken between Phase 2 and Phase 3.
-
-Validation (local): `react-native-wgpu` native lib **compiles and links** for `arm64-v8a`
-(ninja, CMake picked up `FrameDriver.cpp`); `cpplint` clean; `clang-format` applied; `yarn tsc`
-and `yarn lint` pass for both `packages/webgpu` and `apps/example`. iOS `.mm` and the Java
-driver are not compiled locally (no iOS/gradle build run here) — review-only; needs a device
-build. On-device frame pacing / zero-idle-CPU verification is Phase 4.
-
-**Phase 3 — First-class worklet runtimes** — **DONE**
-- Worklet-runtime `RuntimeScheduler` impl (per Spike 1); verify auto-present dispatch on UI +
-  dedicated runtimes; update `apps/example/src/Reanimated/Reanimated.tsx` (drop `present()`,
-  keep its own rAF loop).
-
-### Phase 3 — what shipped (branch `claude/keen-darwin-xeywa`)
-Observed after Phase 2: the **UI-runtime** Reanimated example worked (the Reanimated UI runtime
-executes on the **main thread**, so dispatching its present to the main runtime's scheduler
-happened to land on the right thread), but the **dedicated `createWorkletRuntime`** example
-(`Reanimated/DedicatedThread.tsx`, `runOnRuntime`) crashed — its render thread is its own, so a
-main-thread present violated Dawn surface thread-affinity.
-
-**Decision (confirmed with the user): auto-present on the JS + UI runtimes, explicit
-`ctx.present()` on dedicated worklet runtimes. No native worklets dependency.** Rather than link
-`react-native-worklets` natively and dispatch via `WorkletRuntime::schedule` (the original plan /
-Spike 1 primary), the FrameDriver covers the JS and UI runtimes; dedicated runtimes — which run
-on their own thread with no safe scheduler/vsync hook — keep an explicit `present()`. (A
-scheduler-free auto path for dedicated runtimes was prototyped but rejected — see below — because
-it added one frame of latency and never presented a one-shot frame.) This needs no new native
-build dependency and is fully buildable/validatable locally.
-
-Implementation:
-- `GPUCanvasContext::getCurrentTexture` switched to the full-control HostFunction signature
-  (`jsi::Value(rt, thisVal, args, count)`, same pattern as `RNWebGPU::createImageBitmap`) so it
-  learns the **calling** runtime. Present routing:
-  - **Main runtime** (`RuntimeContext::get(runtime)` is non-null): unchanged — register with the
-    global vsync `FrameDriver` using that runtime's scheduler.
-  - **Reanimated UI runtime** (`globalThis.__RUNTIME_KIND === 2`, worklets' `RuntimeKind::UI`):
-    also auto-present via the FrameDriver + main scheduler. The UI runtime is reached correctly
-    by this path (Phase 2 confirmed it), so no `present()` is needed.
-  - **Dedicated worklet runtimes** (`RuntimeKind::Worker`, or any untagged/unknown worklet
-    runtime — e.g. Vision Camera frame processors): **explicit `ctx.present()`**, kept in the
-    public API for exactly this case. They run on their own thread with no safe scheduler/vsync
-    hook, so present is called synchronously by the author after `submit`, on that thread
-    (preserving Dawn surface thread-affinity).
-
-  `ctx.present()` is a **no-op on the JS / UI runtime** (they auto-present), which makes it safe
-  to call from a worklet shared between the UI and a dedicated runtime (the example's
-  `webGPUDemo`). Runtime classification uses `RuntimeContext::get(rt)` (main) and the stable
-  worklets global `__RUNTIME_KIND` (`ReactNative=1`, `UI=2`, `Worker=3`); no worklets headers
-  are linked.
-
-  Two scheduler-based approaches were tried and rejected before landing here: (1)
-  `queueMicrotask` is **disabled** on worklet runtimes (throws); (2) `setImmediate`/`setTimeout`
-  exist but route through the runtime's `EventLoop` `AsyncQueue`, which for **Vision Camera** is
-  a custom `NativeThreadAsyncQueue` that hops through JNI (`fbjni Environment::current()`) and
-  **crashes** when pushed from a non-JVM-attached thread. A scheduler-free
-  "present-on-next-acquire" fallback worked everywhere but added one frame of latency and never
-  presented a one-shot frame, so the explicit-`present()`-on-dedicated split was chosen instead.
-- JS surface: `present()` re-added to `RNCanvasContext` (`src/Canvas.tsx`, `src/types.ts`,
-  documented dedicated-only) and as a no-op on `Offscreen.ts` / `WebPolyfillGPUModule.ts`. Native
-  `GPUCanvasContext::present` re-added (full-control signature; no-op on auto-presented runtimes).
-- Examples: `present()` re-added to `Reanimated/Reanimated.tsx`'s shared `webGPUDemo` (no-op on
-  UIThread, real on DedicatedThread) and to `VisionCamera.tsx`'s frame processor. Both READMEs'
-  "Frame Scheduling" sections document the JS/UI-auto vs dedicated-manual split.
-
-Known limitation (out of scope, examples don't hit it): **async ops** (`mapAsync`,
-`onSubmittedWorkDone`, …) invoked *on a worklet runtime* still settle their Promise via the
-object's creation-runtime context (main), not the calling worklet runtime — the example worklets
-only do synchronous rendering + present (device/adapter are created on the main runtime). Routing
-async settlement to the calling runtime would need the same calling-runtime detection applied to
-the 7 async sites; deferred until a use case needs it.
-
-Validation (local): native lib **compiles + links** for `arm64-v8a`; `cpplint` clean;
-`clang-format` applied; `yarn tsc`/`yarn lint` unaffected (no JS changed). On-device
-verification of the dedicated-worklet example is for the maintainer.
-
-**Phase 4 — `SurfaceRegistry` / surface-model rework** (proposed)
-The `SurfaceInfo` / `SurfaceRegistry` model (`cpp/rnwgpu/SurfaceRegistry.h`) predates the
-event-driven + auto-present work and is now the rough edge. Candidate improvements to scope:
-- **Surface thread-affinity.** Surface lifecycle (`configure`/`switchToOnscreen`/
-  `switchToOffscreen`/`resize`) runs on the **UI thread** (native view callbacks) while
-  `getCurrentTexture`/`presentFrame` run on the **owning runtime's render thread**. A single
-  `shared_mutex` serializes them but they're still cross-thread against a Dawn surface that
-  prefers single-thread access. Consider routing all surface ops through the owning runtime
-  (e.g. via the `RuntimeScheduler`), making affinity structural rather than lock-guarded.
-- **State clarity.** The on-screen-`surface` vs offscreen-`texture` duality is encoded as
-  `if (surface) … else …` branches throughout; a small explicit state (Offscreen / Onscreen)
-  would remove the implicit coupling and the `switchToOnscreen` flush path's validation cost
-  (its existing `// TODO: faster way without validation?`).
-- **Dead fields / fields to re-evaluate.** e.g. the stored `wgpu::Instance gpu` member appears
-  unused; audit members now that present/`hasSurface` were added.
-- **Lifetime vs `contextId`.** Registry keyed by a JS-side incrementing `int`; `FrameDriver`
-  now also keys pending presents by `contextId`. Confirm teardown ordering (view dealloc →
-  `cancelPresent` + `removeSurfaceInfo`) is race-free under the new threading.
-
-**Phase 5 — Validation**
-```bash
-yarn tsc && yarn lint
-yarn workspace react-native-wgpu test         # offscreen readback + demo specs
-yarn build:ios        # or: yarn workspace example ios
-yarn build:android    # or: yarn workspace example android
-```
-Verify: no idle-CPU polling (logging), correct frame pacing, no present-ordering glitches,
-Reanimated UI/Dedicated examples render.
-
----
-
-## 16 `present()` call sites to migrate (Phase 2)
-
-```
-apps/example/src/StorageBufferVertices/StorageBufferVertices.tsx
-apps/example/src/components/useWebGPU.ts
-apps/example/src/components/Texture.tsx
-apps/example/src/SharedTextureMemory/SharedTextureMemory.tsx
-apps/example/src/ThreeJS/Helmet.tsx
-apps/example/src/ComputeToys/engine/index.ts
-apps/example/src/CanvasAPI/CanvasAPI.tsx
-apps/example/src/ThreeJS/PostProcessing.tsx
-apps/example/src/ThreeJS/Cube.tsx
-apps/example/src/Triangle/HelloTriangle.tsx
-apps/example/src/Triangle/HelloTriangleMSAA.tsx
-apps/example/src/ThreeJS/InstancedMesh.tsx
-apps/example/src/ThreeJS/Retargeting.tsx
-apps/example/src/ThreeJS/components/FiberCanvas.tsx
-apps/example/src/Reanimated/Reanimated.tsx
-apps/example/src/ThreeJS/Backdrop.tsx
-```
-Plus `README.md` and `packages/webgpu/README.md`.
-
----
-
-## Risks / open questions
-- **Worklet-runtime scheduler** access in worklets 0.8.3 (Spike 1 — highest risk).
-- **Concurrent `WaitAny`** semantics on one Dawn instance (Spike 2; single-worker fallback ready).
-- **Present timing**: vsync-dispatched-to-owning-loop must land after submit (FIFO on that loop)
-  and before the next `getCurrentTexture`.
-- **Breaking change**: `present()` removed — type, examples, README updated together.
-- **Apple Metal wait** moves into the frame-boundary present task, off the synchronous call path.
-
----
-
-## How to resume locally
-
-```bash
-git fetch origin claude/keen-darwin-xeywa
-git checkout claude/keen-darwin-xeywa
-git pull origin claude/keen-darwin-xeywa
-# open this file and run Phase 0 spikes, then start Claude Code:
-#   claude
-# suggested kickoff prompt:
-#   "Read docs/refactor-async-present-plan.md. Run the Phase 0 spikes and report
-#    findings before implementing. Develop on this branch."
-```