Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions core/ports.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,10 @@ export interface FeedRepository {
searchQuery: string,
): Promise<FeedItem[]>;
countTotalItems(): Promise<number>;
/**
* Deletes all but the `maxPerSource` most recent items per source (same
* effective ordering as the feed itself), so the table can't grow
* unbounded.
*
* @param maxPerSource - How many of the most recent items to keep for each
*   source; everything ranked after that within a source is deleted.
* @returns The number of rows deleted.
*/
pruneOldItems(maxPerSource: number): Promise<number>;
}
6 changes: 6 additions & 0 deletions docs/RUNBOOK.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ npm run db:migrate:remote # deployed D1 database

New migrations go in `platforms/cloudflare/migrations/` as `NNNN_description.sql`, following on from `0001_init.sql`.

## Weekly item-retention prune

A second cron entry (`0 16 * * 0`, Sunday 16:00 UTC = Sunday 23:00 ICT) fires `scheduled()` with `event.cron` set to that string, which routes that invocation to `D1Repository.pruneOldItems` instead of the hourly refresh fan-out. The hourly cron still fires separately at the same instant, so the refresh is not skipped for that hour. The prune deletes all but the `FEEDREADER_MAX_ITEMS_PER_SOURCE` (default 1000) most recent items per source, ordered the same way the feed itself sorts (`coalesce(published_at, first_seen_at) DESC, ...`).

1000/source was sized against rows-read cost, not disk: `listFeedItems` reads every row matching its WHERE clause with no SQL `LIMIT` (sorting/pagination happens in memory — see `core/sources/listInMemory.ts`), so an unfiltered home-page hit reads `sources × cap` rows from D1 every time. At 4 sources × 1000 that's 4,000 rows/request — cheap in isolation, but worth keeping in mind against D1 Free's 5M-rows-read/day budget if traffic grows or source count grows well past 4. Raising the cap or adding more sources should come with either moving sort/pagination into SQL (the existing `idx_items_feed_order` index already matches the sort order but is unused by the current query shape) or checking D1 Free's rows-read budget isn't at risk.

## Reading cron execution history

Use Cloudflare dashboard → Workers & Pages → `feedreader` → Triggers → Cron Triggers, or `wrangler tail` while a scheduled run is expected, to confirm `sync_state.last_attempt_at` is updating without user-triggered traffic.
Expand Down
9 changes: 9 additions & 0 deletions platforms/cloudflare/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import type { Env } from "./env.d.ts";

/** Runtime settings that `loadConfig` resolves from the Worker's `Env`. */
export interface Config {
// Parsed from FEEDREADER_ITEMS_PER_SOURCE; `loadConfig` substitutes 20 when
// the variable is missing or does not parse to a positive number.
itemsPerSource: number;
// Parsed from FEEDREADER_MAX_ITEMS_PER_SOURCE; `loadConfig` substitutes 1000
// when the variable is missing or does not parse to a positive number. The
// weekly prune passes this value to `pruneOldItems` as the per-source cap.
maxItemsPerSource: number;
// Trimmed FEEDREADER_USER_AGENT, or "feedreader/0.1" when it is unset/blank.
userAgent: string;
}

Expand All @@ -14,11 +15,19 @@ export function loadConfig(env: Env): Config {
env.FEEDREADER_ITEMS_PER_SOURCE ?? "",
10,
);
const maxItemsPerSource = Number.parseInt(
env.FEEDREADER_MAX_ITEMS_PER_SOURCE ?? "",
10,
);
return {
itemsPerSource:
Number.isFinite(itemsPerSource) && itemsPerSource > 0
? itemsPerSource
: 20,
maxItemsPerSource:
Number.isFinite(maxItemsPerSource) && maxItemsPerSource > 0
? maxItemsPerSource
: 1000,
userAgent: env.FEEDREADER_USER_AGENT?.trim() || "feedreader/0.1",
};
}
1 change: 1 addition & 0 deletions platforms/cloudflare/src/env.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ export interface Env {
APP_VERSION?: string;
FEEDREADER_ITEMS_PER_SOURCE?: string;
FEEDREADER_USER_AGENT?: string;
FEEDREADER_MAX_ITEMS_PER_SOURCE?: string;
}
18 changes: 17 additions & 1 deletion platforms/cloudflare/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
import { build, type Source } from "../../../core/sources/index.ts";
import { renderIndexPage } from "../../../core/render.ts";
import { D1Repository } from "./repository.ts";
import { loadConfig } from "./config.ts";
import type { Env } from "./env.d.ts";

const PAGE_SIZE = 12;
Expand All @@ -25,6 +26,11 @@ const KNOWN_SOURCES = new Set([
"alphaxiv",
]);

// Second [triggers] cron in wrangler.toml — Sunday 23:00 ICT (Asia/Ho_Chi_Minh,
// UTC+7, no DST) = Sunday 16:00 UTC. Fires alongside (not instead of) the
// hourly refresh cron; event.cron tells scheduled() which one triggered.
// This string must stay character-for-character identical to the
// wrangler.toml entry, because scheduled() compares it to event.cron with ===.
// NOTE(review): Cloudflare's cron-trigger documentation lists day-of-week
// values as 1-7 with 1 = Sunday (or SUN-SAT) — confirm that `0` is accepted
// and treated as Sunday; if not, change this string and wrangler.toml together.
const WEEKLY_PRUNE_CRON = "0 16 * * 0";

// Backstop only — the cache key already changes whenever the underlying
// source data refreshes (see latestSuccessAt), so this just bounds
// staleness if a source somehow stops refreshing.
Expand Down Expand Up @@ -55,7 +61,17 @@ export default {
return new Response("not found", { status: 404 });
},

async scheduled(_event: ScheduledController, env: Env): Promise<void> {
/**
 * Cron entry point. The trigger that fired is identified by `event.cron`:
 * the weekly prune expression runs the item-retention prune and logs how many
 * rows it removed; any other trigger runs the refresh fan-out.
 */
async scheduled(event: ScheduledController, env: Env): Promise<void> {
  // Anything that is not the weekly prune trigger is a regular refresh run.
  if (event.cron !== WEEKLY_PRUNE_CRON) {
    await fanOutRefresh(env, build());
    return;
  }

  // Weekly prune: resolve the per-source cap first, then trim the table.
  const cap = loadConfig(env).maxItemsPerSource;
  const repository = new D1Repository(env.DB);
  const removed = await repository.pruneOldItems(cap);
  console.log(
    `pruneOldItems: deleted ${removed} item(s) beyond ${cap} per source`,
  );
},
};
Expand Down
23 changes: 23 additions & 0 deletions platforms/cloudflare/src/repository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,29 @@ export class D1Repository implements FeedRepository {
.first<{ count: number }>();
return row?.count ?? 0;
}

/**
 * Trims each source's history down to its `maxPerSource` top-ranked items.
 *
 * Rows are numbered per source with `ROW_NUMBER()` ordered by
 * `coalesce(published_at, first_seen_at) DESC, first_seen_at DESC,
 * source_rank ASC, source ASC, external_id ASC`; every row whose number is
 * greater than `maxPerSource` is deleted in a single statement.
 *
 * @param maxPerSource - Number of top-ranked rows to keep per source. Must be
 *   a non-negative integer.
 * @returns The number of rows deleted (`meta.changes`, or 0 when absent).
 * @throws RangeError when `maxPerSource` is not a non-negative integer.
 */
async pruneOldItems(maxPerSource: number): Promise<number> {
  // Row numbers start at 1, so a negative cap is meaningless as a count yet
  // would make `rn > ?` match every row and empty the table; NaN and
  // fractional values are not row counts either. Refuse them before issuing
  // a destructive statement.
  if (!Number.isInteger(maxPerSource) || maxPerSource < 0) {
    throw new RangeError(
      `pruneOldItems: maxPerSource must be a non-negative integer, got ${maxPerSource}`,
    );
  }

  const { meta } = await this.db
    .prepare(
      `
DELETE FROM items
WHERE id IN (
SELECT id FROM (
SELECT id, ROW_NUMBER() OVER (
PARTITION BY source
ORDER BY coalesce(published_at, first_seen_at) DESC, first_seen_at DESC,
source_rank ASC, source ASC, external_id ASC
) AS rn
FROM items
)
WHERE rn > ?
)
`,
    )
    .bind(maxPerSource)
    .run();
  return meta.changes ?? 0;
}
}

function rowToFeedItem(row: ItemRow): FeedItem {
Expand Down
16 changes: 12 additions & 4 deletions platforms/cloudflare/wrangler.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,15 @@ enabled = true
ip = "127.0.0.1"
port = 8788

# Hourly, UTC. Asia/Ho_Chi_Minh (UTC+7, no DST) hourly wall-clock boundaries
# are the same instants as UTC hourly wall-clock boundaries, so no offset
# math is needed (see docs/RUNBOOK.md).
# Hourly refresh, UTC. Asia/Ho_Chi_Minh (UTC+7, no DST) hourly wall-clock
# boundaries are the same instants as UTC hourly wall-clock boundaries, so no
# offset math is needed for the hourly cron (see docs/RUNBOOK.md).
#
# Second entry: weekly item-retention prune, Sunday 23:00 ICT = Sunday 16:00
# UTC (fixed +7h offset, no DST). Routed in src/index.ts's scheduled() by
# matching event.cron against this exact string.
[triggers]
crons = ["0 * * * *"]
crons = ["0 * * * *", "0 16 * * 0"]

[assets]
directory = "../../web-static"
Expand All @@ -44,6 +48,10 @@ service = "feedreader"
APP_VERSION = "dev"
FEEDREADER_ITEMS_PER_SOURCE = "20"
FEEDREADER_USER_AGENT = "feedreader/0.1"
# Per-source row cap enforced by the weekly prune cron (see [triggers] above
# and docs/RUNBOOK.md) — keeps the items table, and every full-table-scan
# query against it, bounded at roughly (number of sources × this cap) rows.
FEEDREADER_MAX_ITEMS_PER_SOURCE = "1000"

# REFRESH_SECRET is a secret, not a var — set it with:
# wrangler secret put REFRESH_SECRET --config platforms/cloudflare/wrangler.toml