From 3a26c70ae18bc9ff41820f6d11b306e9af775fa7 Mon Sep 17 00:00:00 2001 From: Phat Pham Date: Tue, 23 Jun 2026 23:45:16 +0700 Subject: [PATCH] feat: add weekly per-source item-retention prune cron Adds a second Cron Trigger (Sunday 23:00 ICT) that deletes all but the most recent FEEDREADER_MAX_ITEMS_PER_SOURCE (default 1000) items per source, keeping the items table and every full-table-scan query against it bounded as more sources get added. 1000/source was sized against D1's rows-read quota, not disk: live prod data showed ~700-750 bytes/row and per-source velocity slow enough that even 4 sources at the cap stay well under D1 Free's 500MB limit, but listFeedItems reads sources*cap rows per unfiltered request with no SQL LIMIT, which is the actual scaling risk against the 5M-rows/day free budget. Co-Authored-By: Claude Sonnet 4.6 --- core/ports.ts | 6 ++++++ docs/RUNBOOK.md | 6 ++++++ platforms/cloudflare/src/config.ts | 9 +++++++++ platforms/cloudflare/src/env.d.ts | 1 + platforms/cloudflare/src/index.ts | 18 +++++++++++++++++- platforms/cloudflare/src/repository.ts | 23 +++++++++++++++++++++++ platforms/cloudflare/wrangler.toml | 16 ++++++++++++---- 7 files changed, 74 insertions(+), 5 deletions(-) diff --git a/core/ports.ts b/core/ports.ts index 44bca93..45f40dc 100644 --- a/core/ports.ts +++ b/core/ports.ts @@ -28,4 +28,10 @@ export interface FeedRepository { searchQuery: string, ): Promise; countTotalItems(): Promise; + /** + * Deletes all but the `maxPerSource` most recent items per source (same + * effective ordering as the feed itself), so the table can't grow + * unbounded. Returns the number of rows deleted. 
+ */ + pruneOldItems(maxPerSource: number): Promise<number>; } diff --git a/docs/RUNBOOK.md b/docs/RUNBOOK.md index dbf187d..779b49f 100644 --- a/docs/RUNBOOK.md +++ b/docs/RUNBOOK.md @@ -35,6 +35,12 @@ npm run db:migrate:remote # deployed D1 database New migrations go in `platforms/cloudflare/migrations/` as `NNNN_description.sql`, following on from `0001_init.sql`. +## Weekly item-retention prune + +A second cron entry (`0 16 * * 0`, Sunday 23:00 ICT) fires `scheduled()` with `event.cron` set to that string, which routes to `D1Repository.pruneOldItems` instead of the hourly refresh fan-out. It deletes all but the `FEEDREADER_MAX_ITEMS_PER_SOURCE` (default 1000) most recent items per source, ordered the same way the feed itself sorts (`coalesce(published_at, first_seen_at) DESC, ...`). + +1000/source was sized against rows-read cost, not disk: `listFeedItems` reads every row matching its WHERE clause with no SQL `LIMIT` (sorting/pagination happens in memory — see `core/sources/listInMemory.ts`), so an unfiltered home-page hit reads `sources × cap` rows from D1 every time. At 4 sources × 1000 that's 4,000 rows/request — cheap in isolation, but worth keeping in mind against D1 Free's 5M-rows-read/day budget if traffic grows or source count grows well past 4. Raising the cap or adding more sources should come with either moving sort/pagination into SQL (the existing `idx_items_feed_order` index already matches the sort order but is unused by the current query shape) or checking D1 Free's rows-read budget isn't at risk. + ## Reading cron execution history Use Cloudflare dashboard → Workers & Pages → `feedreader` → Triggers → Cron Triggers, or `wrangler tail` while a scheduled run is expected, to confirm `sync_state.last_attempt_at` is updating without user-triggered traffic. 
diff --git a/platforms/cloudflare/src/config.ts b/platforms/cloudflare/src/config.ts index 041b518..d30613b 100644 --- a/platforms/cloudflare/src/config.ts +++ b/platforms/cloudflare/src/config.ts @@ -6,6 +6,7 @@ import type { Env } from "./env.d.ts"; export interface Config { itemsPerSource: number; + maxItemsPerSource: number; userAgent: string; } @@ -14,11 +15,19 @@ export function loadConfig(env: Env): Config { env.FEEDREADER_ITEMS_PER_SOURCE ?? "", 10, ); + const maxItemsPerSource = Number.parseInt( + env.FEEDREADER_MAX_ITEMS_PER_SOURCE ?? "", + 10, + ); return { itemsPerSource: Number.isFinite(itemsPerSource) && itemsPerSource > 0 ? itemsPerSource : 20, + maxItemsPerSource: + Number.isFinite(maxItemsPerSource) && maxItemsPerSource > 0 + ? maxItemsPerSource + : 1000, userAgent: env.FEEDREADER_USER_AGENT?.trim() || "feedreader/0.1", }; } diff --git a/platforms/cloudflare/src/env.d.ts b/platforms/cloudflare/src/env.d.ts index 919a4f8..0e25dd6 100644 --- a/platforms/cloudflare/src/env.d.ts +++ b/platforms/cloudflare/src/env.d.ts @@ -9,4 +9,5 @@ export interface Env { APP_VERSION?: string; FEEDREADER_ITEMS_PER_SOURCE?: string; FEEDREADER_USER_AGENT?: string; + FEEDREADER_MAX_ITEMS_PER_SOURCE?: string; } diff --git a/platforms/cloudflare/src/index.ts b/platforms/cloudflare/src/index.ts index 3420ea9..c41907f 100644 --- a/platforms/cloudflare/src/index.ts +++ b/platforms/cloudflare/src/index.ts @@ -15,6 +15,7 @@ import { import { build, type Source } from "../../../core/sources/index.ts"; import { renderIndexPage } from "../../../core/render.ts"; import { D1Repository } from "./repository.ts"; +import { loadConfig } from "./config.ts"; import type { Env } from "./env.d.ts"; const PAGE_SIZE = 12; @@ -25,6 +26,11 @@ const KNOWN_SOURCES = new Set([ "alphaxiv", ]); +// Second [triggers] cron in wrangler.toml — Sunday 23:00 ICT (Asia/Ho_Chi_Minh, +// UTC+7, no DST) = Sunday 16:00 UTC. 
Fires alongside (not instead of) the +// hourly refresh cron; event.cron tells scheduled() which one triggered. +const WEEKLY_PRUNE_CRON = "0 16 * * 0"; + // Backstop only — the cache key already changes whenever the underlying // source data refreshes (see latestSuccessAt), so this just bounds // staleness if a source somehow stops refreshing. @@ -55,7 +61,17 @@ export default { return new Response("not found", { status: 404 }); }, - async scheduled(_event: ScheduledController, env: Env): Promise<void> { + async scheduled(event: ScheduledController, env: Env): Promise<void> { + if (event.cron === WEEKLY_PRUNE_CRON) { + const { maxItemsPerSource } = loadConfig(env); + const deleted = await new D1Repository(env.DB).pruneOldItems( + maxItemsPerSource, + ); + console.log( + `pruneOldItems: deleted ${deleted} item(s) beyond ${maxItemsPerSource} per source`, + ); + return; + } await fanOutRefresh(env, build()); }, }; diff --git a/platforms/cloudflare/src/repository.ts b/platforms/cloudflare/src/repository.ts index 6da4ef2..32f4d89 100644 --- a/platforms/cloudflare/src/repository.ts +++ b/platforms/cloudflare/src/repository.ts @@ -199,6 +199,29 @@ export class D1Repository implements FeedRepository { .first<{ count: number }>(); return row?.count ?? 0; } + + async pruneOldItems(maxPerSource: number): Promise<number> { + const { meta } = await this.db + .prepare( + ` + DELETE FROM items + WHERE id IN ( + SELECT id FROM ( + SELECT id, ROW_NUMBER() OVER ( + PARTITION BY source + ORDER BY coalesce(published_at, first_seen_at) DESC, first_seen_at DESC, + source_rank ASC, source ASC, external_id ASC + ) AS rn + FROM items + ) + WHERE rn > ? + ) + `, + ) + .bind(maxPerSource) + .run(); + return meta.changes ??
0; + } } function rowToFeedItem(row: ItemRow): FeedItem { diff --git a/platforms/cloudflare/wrangler.toml b/platforms/cloudflare/wrangler.toml index a8951c7..86ef480 100644 --- a/platforms/cloudflare/wrangler.toml +++ b/platforms/cloudflare/wrangler.toml @@ -17,11 +17,15 @@ enabled = true ip = "127.0.0.1" port = 8788 -# Hourly, UTC. Asia/Ho_Chi_Minh (UTC+7, no DST) hourly wall-clock boundaries -# are the same instants as UTC hourly wall-clock boundaries, so no offset -# math is needed (see docs/RUNBOOK.md). +# Hourly refresh, UTC. Asia/Ho_Chi_Minh (UTC+7, no DST) hourly wall-clock +# boundaries are the same instants as UTC hourly wall-clock boundaries, so no +# offset math is needed for the hourly cron (see docs/RUNBOOK.md). +# +# Second entry: weekly item-retention prune, Sunday 23:00 ICT = Sunday 16:00 +# UTC (fixed +7h offset, no DST). Routed in src/index.ts's scheduled() by +# matching event.cron against this exact string. [triggers] -crons = ["0 * * * *"] +crons = ["0 * * * *", "0 16 * * 0"] [assets] directory = "../../web-static" @@ -44,6 +48,10 @@ service = "feedreader" APP_VERSION = "dev" FEEDREADER_ITEMS_PER_SOURCE = "20" FEEDREADER_USER_AGENT = "feedreader/0.1" +# Per-source row cap enforced by the weekly prune cron (see [triggers] above +# and docs/RUNBOOK.md) — keeps the items table, and every full-table-scan +# query against it, bounded as more sources are added. +FEEDREADER_MAX_ITEMS_PER_SOURCE = "1000" # REFRESH_SECRET is a secret, not a var — set it with: # wrangler secret put REFRESH_SECRET --config platforms/cloudflare/wrangler.toml