// core/personalize/rank.ts
//
// Pure helpers for the "For You" LLM ranking feature. No I/O — adapters
// (e.g. platforms/cloudflare/src/llmRanker.ts) own the actual model call and
// use these to build the prompt and interpret the response.

import type { FeedItem } from "../domain.ts";

/** Maximum number of characters of the reader's interests copied into the prompt. */
export const MAX_INTERESTS_LENGTH = 300;
/** Maximum number of characters kept from an item's title and, separately, from its summary. */
const MAX_ITEM_TEXT_LENGTH = 160;

/**
 * Normalizes free text for embedding in the line-oriented prompt: every
 * whitespace run (including newlines) becomes one space, the ends are
 * trimmed, and the result is capped at `maxLength` characters.
 */
function toPromptText(text: string, maxLength: number): string {
  return text.replace(/\s+/g, " ").trim().slice(0, maxLength);
}

/**
 * Builds the ranking prompt sent to the model.
 *
 * Each item becomes exactly one line of the form `index: title — summary`
 * (the summary part is omitted when empty), where `index` is the item's
 * position in `items`. Titles, summaries and the interests text are
 * collapsed to a single line first: feed content is untrusted, and an
 * embedded newline would otherwise let one item forge additional
 * `index: …` lines.
 *
 * @param items     Candidate items; their array positions are the indices the model is asked to return.
 * @param interests Free-text reader interests; truncated to MAX_INTERESTS_LENGTH characters.
 * @returns The complete prompt, lines joined with "\n".
 */
export function buildRankingPrompt(
  items: FeedItem[],
  interests: string,
): string {
  const lines = items.map((item, i) => {
    const title = toPromptText(item.title, MAX_ITEM_TEXT_LENGTH);
    const summary = toPromptText(item.summary ?? "", MAX_ITEM_TEXT_LENGTH);
    return `${i}: ${title}${summary ? ` — ${summary}` : ""}`;
  });
  return [
    "You are ranking a list of feed items by relevance to a reader's stated interests.",
    `Reader interests: ${toPromptText(interests, MAX_INTERESTS_LENGTH)}`,
    "Items (index: title — summary):",
    ...lines,
    "",
    "Respond with ONLY a JSON array of the item indices above, ordered from most to least relevant to the interests. Include every index exactly once. No other text, no markdown.",
  ].join("\n");
}

/** Parses `text` as JSON and returns it only if it is an array; never throws. */
function tryParseArray(text: string): unknown[] | undefined {
  try {
    const parsed: unknown = JSON.parse(text);
    return Array.isArray(parsed) ? parsed : undefined;
  } catch {
    return undefined;
  }
}

/**
 * Finds the JSON array inside a raw model response. The widest
 * `[` … `]` span is tried first; if that is not valid JSON (e.g. the model
 * wrapped its answer in prose that itself contains brackets), each
 * bracket-free `[...]` segment is tried and the longest parseable one wins.
 */
function extractJsonArray(raw: string): unknown[] | undefined {
  const widest = /\[[\s\S]*\]/.exec(raw);
  if (!widest) return undefined;

  const whole = tryParseArray(widest[0]);
  if (whole) return whole;

  let best: unknown[] | undefined;
  for (const segment of widest[0].matchAll(/\[[^\[\]]*\]/g)) {
    const candidate = tryParseArray(segment[0]);
    if (candidate && (!best || candidate.length > best.length)) {
      best = candidate;
    }
  }
  return best;
}

/**
 * Interprets one array element as an index. Only non-negative integer
 * numbers and strings made solely of decimal digits qualify. Anything else
 * is rejected rather than coerced: `Number(null)`, `Number("")` and
 * `Number([])` are all 0, which would silently promote item 0.
 */
function toIndex(value: unknown): number | undefined {
  if (typeof value === "number") {
    return Number.isInteger(value) && value >= 0 ? value : undefined;
  }
  if (typeof value === "string" && /^\s*\d+\s*$/.test(value)) {
    return Number(value);
  }
  return undefined;
}

/**
 * Defensively extracts a ranked index list from a raw model response.
 * Never throws — a malformed or empty response yields []. Entries that are
 * not indices, fall outside [0, itemCount), or repeat an earlier entry are
 * dropped (first occurrence wins).
 *
 * @param raw       Raw text returned by the model.
 * @param itemCount Number of items that were offered; valid indices are 0 … itemCount - 1.
 * @returns The accepted indices in model order.
 */
export function parseRankedIndices(raw: string, itemCount: number): number[] {
  const parsed = extractJsonArray(raw);
  if (!parsed) return [];

  const seen = new Set<number>();
  const out: number[] = [];
  for (const value of parsed) {
    const index = toIndex(value);
    if (index === undefined || index >= itemCount || seen.has(index)) {
      continue;
    }
    seen.add(index);
    out.push(index);
  }
  return out;
}

/**
 * Places ranked items first (in model order), then appends any remaining
 * items in whatever order `items` arrived in — guarantees a full, valid
 * list even when the ranker returns a partial or empty result. Callers may
 * pass any base order, not just chronological — e.g. handlePersonalize
 * feeds this similarity-ranked input (core/personalize/similarity.ts) when
 * running the LLM as a polish pass over a pre-filtered candidate pool, and
 * the "remainder" preserves that similarity order rather than reverting to
 * chronological.
 *
 * @param items         Items in their base order.
 * @param rankedIndices Positions into `items`, most relevant first; unknown or repeated positions are ignored.
 * @returns A new array; `items` is not mutated.
 */
export function mergeRankedOrder(
  items: FeedItem[],
  rankedIndices: number[],
): FeedItem[] {
  const used = new Set<number>();
  const out: FeedItem[] = [];
  for (const index of rankedIndices) {
    const item = items[index];
    // `items[index]` is undefined for out-of-range, negative or fractional indices.
    if (!item || used.has(index)) continue;
    used.add(index);
    out.push(item);
  }
  items.forEach((item, i) => {
    if (!used.has(i)) out.push(item);
  });
  return out;
}

/**
 * Stable identifier for a FeedItem — the domain type has no numeric id,
 * so (source, externalId) is the natural key for referencing an item
 * across requests (e.g. in a cached ranking).
 *
 * The two parts are joined with a single space.
 * NOTE(review): the key is only unambiguous while `source` contains no
 * spaces — confirm against the set of source names.
 */
export function itemKey(item: FeedItem): string {
  return `${item.source} ${item.externalId}`;
}

/*
 * Projects a previously-computed ranked key order (from a cached ranking,
 * possibly over a different/smaller pool) onto a freshly-fetched item
 * list: ranked items appear first, in cached order; anything not in
 * rankedKeys — new items, or items the cache doesn't cover — keeps its
 * relative order from `items` at the end (which, like mergeRankedOrder,
 * need not be chronological — see that function's comment).
 */
/**
 * Reorders a freshly fetched `items` list according to a ranking that was
 * stored as `itemKey()` strings: items whose key appears in `rankedKeys`
 * come first, in `rankedKeys` order; every other item follows in the
 * relative order it has in `items`.
 *
 * Unlike mergeRankedOrder, this tolerates the two lists having different
 * lengths or contents, since `items` is re-fetched live while rankedKeys
 * may be stale or partial. Keys with no matching item and repeated keys
 * are ignored.
 *
 * NOTE: if two entries of `items` share a key, only the later one is
 * emitted — the lookup map keeps the last entry per key and the remainder
 * pass skips every item whose key was already used.
 *
 * @param items      Live items in their base order.
 * @param rankedKeys `itemKey()` values, most relevant first.
 * @returns A new array; `items` is not mutated.
 */
export function mergeRankedKeysOrder(
  items: FeedItem[],
  rankedKeys: string[],
): FeedItem[] {
  const byKey = new Map<string, FeedItem>(
    items.map((item): [string, FeedItem] => [itemKey(item), item]),
  );
  const used = new Set<string>();
  const out: FeedItem[] = [];
  for (const key of rankedKeys) {
    const item = byKey.get(key);
    if (!item || used.has(key)) continue;
    used.add(key);
    out.push(item);
  }
  for (const item of items) {
    if (!used.has(itemKey(item))) out.push(item);
  }
  return out;
}

// ---- core/personalize/similarity.ts ----
//
// Pure helpers for ranking by embedding similarity — the retrieval half of
// the "For You" feature's retrieve-then-rerank pipeline. No I/O; adapters
// (e.g. platforms/cloudflare/src/embedder.ts) own the actual model call and
// platforms/cloudflare/src/index.ts's handlePersonalize wires this together
// with the existing LLM polish step (core/personalize/rank.ts).

import type { FeedItem } from "../domain.ts";
import { itemKey } from "./rank.ts";

/**
 * Cosine similarity of two vectors, in [-1, 1].
 *
 * Returns 0 — never NaN — when either vector has zero magnitude or when
 * the result is not finite (NaN/Infinity components, or overflow while
 * accumulating), so callers can always use the value as a sort key.
 *
 * Only the first `min(a.length, b.length)` components are compared.
 * NOTE(review): vectors of different dimensionality are therefore scored
 * on their shared prefix rather than rejected — confirm that is intended.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  const length = Math.min(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < length; i++) {
    const x = a[i] ?? 0;
    const y = b[i] ?? 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  if (normA === 0 || normB === 0) return 0;
  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // A NaN score would make the comparator in rankBySimilarity inconsistent.
  return Number.isFinite(similarity) ? similarity : 0;
}

/**
 * Sorts `items` by cosine similarity to `interestsVector`, descending.
 * Items with no entry in `embeddings` (not yet embedded at ingestion time,
 * or embedding generation failed that cycle) sink to the end, keeping
 * their relative order from `items` — a stable partition, not a random
 * placement, so an all-unembedded pool degrades to a no-op rather than
 * reshuffling. mergeRankedOrder/mergeRankedKeysOrder in rank.ts then treat
 * this function's output as the new "base order" for the LLM polish step.
 *
 * @param items           Candidate items in their base order.
 * @param embeddings      Stored vectors keyed by `itemKey()`; may cover only some items.
 * @param interestsVector Embedding of the reader's interests.
 * @returns A new array; `items` is not mutated.
 */
export function rankBySimilarity(
  items: FeedItem[],
  embeddings: Map<string, number[]>,
  interestsVector: number[],
): FeedItem[] {
  const scored: { item: FeedItem; score: number; index: number }[] = [];
  const unscored: FeedItem[] = [];
  items.forEach((item, index) => {
    const vector = embeddings.get(itemKey(item));
    if (!vector) {
      unscored.push(item);
      return;
    }
    scored.push({
      item,
      score: cosineSimilarity(vector, interestsVector),
      index,
    });
  });
  // Equal scores fall back to the original position so ties keep base order.
  scored.sort((a, b) => b.score - a.score || a.index - b.index);
  return [...scored.map((entry) => entry.item), ...unscored];
}

// ---- core/personalize/test/fakeEmbedder.ts ----

import type { Embedder } from "../../ports.ts";

/* In-memory Embedder for tests — no network calls. `script` decides the
 * returned vectors (or throws, to simulate a transport/availability
 * failure), same pattern as FakeLlmRanker. */
*/ +export class FakeEmbedder implements Embedder { + readonly calls: string[][] = []; + + constructor(private readonly script: (texts: string[]) => number[][]) {} + + async embed(texts: string[]): Promise { + this.calls.push(texts); + return this.script(texts); + } +} diff --git a/core/personalize/test/fakeLlmRanker.ts b/core/personalize/test/fakeLlmRanker.ts new file mode 100644 index 0000000..9f2725f --- /dev/null +++ b/core/personalize/test/fakeLlmRanker.ts @@ -0,0 +1,17 @@ +import type { FeedItem } from "../../domain.ts"; +import type { LlmRanker } from "../../ports.ts"; + +/** In-memory LlmRanker for tests — no network calls. `script` decides the + * returned ranking (or throws, to simulate a transport/availability failure). */ +export class FakeLlmRanker implements LlmRanker { + constructor( + private readonly script: ( + items: FeedItem[], + interests: string, + ) => number[], + ) {} + + async rank(items: FeedItem[], interests: string): Promise { + return this.script(items, interests); + } +} diff --git a/core/personalize/test/rank.spec.ts b/core/personalize/test/rank.spec.ts new file mode 100644 index 0000000..75432e3 --- /dev/null +++ b/core/personalize/test/rank.spec.ts @@ -0,0 +1,166 @@ +import { describe, expect, it } from "vitest"; +import type { FeedItem } from "../../domain.ts"; +import { + buildRankingPrompt, + itemKey, + mergeRankedKeysOrder, + mergeRankedOrder, + parseRankedIndices, +} from "../rank.ts"; +import { FakeLlmRanker } from "./fakeLlmRanker.ts"; + +function item(overrides: Partial): FeedItem { + return { + source: "hackernews", + externalId: "1", + title: "title", + url: "https://example.com", + sourceRank: 1, + metadata: {}, + ...overrides, + }; +} + +describe("buildRankingPrompt", () => { + it("numbers items by index and includes title, summary, and interests", () => { + const items = [ + item({ title: "Rust async runtime", summary: "a new executor" }), + item({ title: "No summary item" }), + ]; + const prompt = 
buildRankingPrompt(items, "rust, distributed systems"); + expect(prompt).toContain("0: Rust async runtime — a new executor"); + expect(prompt).toContain("1: No summary item"); + expect(prompt).toContain("Reader interests: rust, distributed systems"); + }); + + it("truncates oversized interests text", () => { + const prompt = buildRankingPrompt([], "x".repeat(1000)); + const line = prompt.split("\n").find((l) => l.startsWith("Reader interests:"))!; + expect(line.length).toBeLessThan(320); + }); +}); + +describe("parseRankedIndices", () => { + it("parses a well-formed JSON array", () => { + expect(parseRankedIndices("[2, 0, 1]", 3)).toEqual([2, 0, 1]); + }); + + it("tolerates surrounding prose/markdown around the array", () => { + expect(parseRankedIndices("Sure! ```json\n[1, 0]\n```", 2)).toEqual([ + 1, 0, + ]); + }); + + it("drops out-of-range and duplicate indices", () => { + expect(parseRankedIndices("[1, 1, 5, -1, 0]", 2)).toEqual([1, 0]); + }); + + it("returns [] for garbage output instead of throwing", () => { + expect(parseRankedIndices("not even close to json", 3)).toEqual([]); + expect(parseRankedIndices("", 3)).toEqual([]); + }); +}); + +describe("mergeRankedOrder", () => { + const items = [item({ externalId: "a" }), item({ externalId: "b" }), item({ externalId: "c" })]; + + it("places ranked items first in model order, then appends the rest", () => { + const merged = mergeRankedOrder(items, [2, 0]); + expect(merged.map((i) => i.externalId)).toEqual(["c", "a", "b"]); + }); + + it("passes through original order when ranking is empty", () => { + expect(mergeRankedOrder(items, []).map((i) => i.externalId)).toEqual([ + "a", + "b", + "c", + ]); + }); + + it("ignores indices it doesn't recognize without throwing", () => { + const merged = mergeRankedOrder(items, [99, 1]); + expect(merged.map((i) => i.externalId)).toEqual(["b", "a", "c"]); + }); + + it("preserves a non-chronological base order for the remainder, e.g. 
a similarity-ranked pool", () => { + const shuffled = [ + item({ externalId: "c" }), + item({ externalId: "a" }), + item({ externalId: "b" }), + ]; + const merged = mergeRankedOrder(shuffled, [2]); + expect(merged.map((i) => i.externalId)).toEqual(["b", "c", "a"]); + }); +}); + +describe("mergeRankedKeysOrder", () => { + const items = [ + item({ source: "hackernews", externalId: "a" }), + item({ source: "github", externalId: "b" }), + item({ source: "hackernews", externalId: "c" }), + ]; + + it("places ranked items first by cached key order, then appends the rest", () => { + const merged = mergeRankedKeysOrder(items, [ + itemKey(items[2]!), + itemKey(items[0]!), + ]); + expect(merged.map((i) => i.externalId)).toEqual(["c", "a", "b"]); + }); + + it("tolerates a cached key for an item that no longer exists", () => { + const merged = mergeRankedKeysOrder(items, [ + "hackernews missing-id", + itemKey(items[1]!), + ]); + expect(merged.map((i) => i.externalId)).toEqual(["b", "a", "c"]); + }); + + it("tolerates duplicate cached keys", () => { + const key = itemKey(items[1]!); + const merged = mergeRankedKeysOrder(items, [key, key]); + expect(merged.map((i) => i.externalId)).toEqual(["b", "a", "c"]); + }); + + it("passes through original order when there is no cached ranking", () => { + expect(mergeRankedKeysOrder(items, []).map((i) => i.externalId)).toEqual([ + "a", + "b", + "c", + ]); + }); + + it("preserves a non-chronological base order for the remainder, e.g. 
a similarity-ranked pool", () => { + const shuffled = [ + item({ source: "hackernews", externalId: "c" }), + item({ source: "hackernews", externalId: "a" }), + item({ source: "github", externalId: "b" }), + ]; + const merged = mergeRankedKeysOrder(shuffled, [ + itemKey(shuffled[2]!), + ]); + expect(merged.map((i) => i.externalId)).toEqual(["b", "c", "a"]); + }); +}); + +describe("FakeLlmRanker + degrade pattern", () => { + const items = [item({ externalId: "a" }), item({ externalId: "b" })]; + + it("a successful rank reorders items", async () => { + const ranker = new FakeLlmRanker(() => [1, 0]); + const ranked = await ranker.rank(items, "anything"); + expect(mergeRankedOrder(items, ranked).map((i) => i.externalId)).toEqual([ + "b", + "a", + ]); + }); + + it("a thrown error is the caller's signal to degrade to chronological order", async () => { + const ranker = new FakeLlmRanker(() => { + throw new Error("model unavailable"); + }); + await expect(ranker.rank(items, "anything")).rejects.toThrow( + "model unavailable", + ); + }); +}); diff --git a/core/personalize/test/similarity.spec.ts b/core/personalize/test/similarity.spec.ts new file mode 100644 index 0000000..1357b3c --- /dev/null +++ b/core/personalize/test/similarity.spec.ts @@ -0,0 +1,80 @@ +import { describe, expect, it } from "vitest"; +import type { FeedItem } from "../../domain.ts"; +import { itemKey } from "../rank.ts"; +import { cosineSimilarity, rankBySimilarity } from "../similarity.ts"; + +function item(overrides: Partial): FeedItem { + return { + source: "hackernews", + externalId: "1", + title: "title", + url: "https://example.com", + sourceRank: 1, + metadata: {}, + ...overrides, + }; +} + +describe("cosineSimilarity", () => { + it("is 1 for identical vectors", () => { + expect(cosineSimilarity([1, 2, 3], [1, 2, 3])).toBeCloseTo(1); + }); + + it("is 0 for orthogonal vectors", () => { + expect(cosineSimilarity([1, 0], [0, 1])).toBeCloseTo(0); + }); + + it("is -1 for opposite vectors", () => { + 
expect(cosineSimilarity([1, 2], [-1, -2])).toBeCloseTo(-1); + }); + + it("is 0 rather than NaN for a zero vector", () => { + expect(cosineSimilarity([0, 0], [1, 2])).toBe(0); + }); +}); + +describe("rankBySimilarity", () => { + it("sorts items strictly by descending similarity when all have vectors", () => { + const a = item({ externalId: "a" }); + const b = item({ externalId: "b" }); + const c = item({ externalId: "c" }); + const interests = [1, 0]; + const embeddings = new Map([ + [itemKey(a), [0, 1]], // orthogonal -> 0 + [itemKey(b), [1, 0]], // identical -> 1 + [itemKey(c), [0.7, 0.3]], // partial match + ]); + const ranked = rankBySimilarity([a, b, c], embeddings, interests); + expect(ranked.map((i) => i.externalId)).toEqual(["b", "c", "a"]); + }); + + it("sinks un-embedded items to the end, preserving their relative order", () => { + const embedded = item({ externalId: "embedded" }); + const first = item({ externalId: "first" }); + const second = item({ externalId: "second" }); + const embeddings = new Map([[itemKey(embedded), [1, 0]]]); + const ranked = rankBySimilarity( + [first, embedded, second], + embeddings, + [1, 0], + ); + // embedded item sorts first; the two un-embedded items keep their + // original relative order ("first" before "second") rather than being + // reshuffled. + expect(ranked.map((i) => i.externalId)).toEqual([ + "embedded", + "first", + "second", + ]); + }); + + it("degrades to a no-op when no item has a stored embedding", () => { + const items = [ + item({ externalId: "a" }), + item({ externalId: "b" }), + item({ externalId: "c" }), + ]; + const ranked = rankBySimilarity(items, new Map(), [1, 0]); + expect(ranked.map((i) => i.externalId)).toEqual(["a", "b", "c"]); + }); +}); diff --git a/core/ports.ts b/core/ports.ts index 45f40dc..c78802f 100644 --- a/core/ports.ts +++ b/core/ports.ts @@ -8,10 +8,19 @@ import type { FeedItem, SyncState } from "./domain.ts"; export interface FeedRepository { + /** + * Upserts `items` for `source`. 
`embeddings` is keyed by `itemKey()` + * (see core/personalize/rank.ts) and may be empty or a partial subset of + * `items` — a key's absence means "leave that item's stored embedding + * untouched" (preserve whatever is already on the row), not "clear it". + * Embedding generation is best-effort at the call site, so this must + * accept an empty map without complaint. + */ saveSnapshot( source: string, fetchedAtIso: string, items: FeedItem[], + embeddings: Map, ): Promise; recordFailure( source: string, @@ -34,4 +43,53 @@ export interface FeedRepository { * unbounded. Returns the number of rows deleted. */ pruneOldItems(maxPerSource: number): Promise; + /** + * Of the given `externalIds` for `source`, returns the subset that + * already have a stored embedding — used at ingestion time to embed only + * items that don't have one yet, instead of re-embedding the whole batch + * every refresh cycle. + */ + listEmbeddedKeys( + source: string, + externalIds: string[], + ): Promise>; + /** + * Like `listFeedItems`, but for the /api/personalize candidate pool: + * always starts at offset 0, ignores search, and also returns each + * item's stored embedding (keyed by `itemKey()`) alongside the items + * themselves in one round trip. An item with no stored embedding yet is + * simply absent from the map — callers must treat that as "no vector", + * not an error. + */ + listFeedItemsForRanking( + limit: number, + source: string, + sources: string[], + ): Promise<{ items: FeedItem[]; embeddings: Map }>; +} + +/** + * Ranks `items` by relevance to a free-text `interests` description. + * Returns a best-effort ordering of 0-based indices into `items`, most + * relevant first — FeedItem has no stable numeric id, so position in the + * input array is the only identifier the ranker needs. 
The result may be a + * subset (the caller appends any indices the ranker omitted, in their + * original order) and may be empty if ranking failed entirely; it must + * never throw for a malformed model response, only for genuine + * transport/availability failures. + */ +export interface LlmRanker { + rank(items: FeedItem[], interests: string): Promise; +} + +/** + * Embeds free-text into vectors for similarity ranking. One vector per + * input string, same order as `texts`, on success. Must throw only for + * genuine transport/availability failures (never return a partial or + * malformed result silently) — callers treat a thrown error as "skip + * embedding for this batch/request", the same resilience posture as + * LlmRanker. + */ +export interface Embedder { + embed(texts: string[]): Promise; } diff --git a/core/render.ts b/core/render.ts index b0ca41d..7af1452 100644 --- a/core/render.ts +++ b/core/render.ts @@ -151,8 +151,8 @@ export function renderIndexPage(data: PageData): string { - - + +