Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions core/personalize/rank.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Pure helpers for the "For You" LLM ranking feature. No I/O — adapters
// (e.g. platforms/cloudflare/src/llmRanker.ts) own the actual model call and
// use these to build the prompt and interpret the response.

import type { FeedItem } from "../domain.ts";

/** Maximum number of UTF-16 code units of the reader's interests text embedded in the prompt. */
export const MAX_INTERESTS_LENGTH = 300;
/** Maximum number of UTF-16 code units kept from each item's title and, separately, its summary. */
const MAX_ITEM_TEXT_LENGTH = 160;

/**
 * Flattens free-form text to a single line and truncates it.
 * Every whitespace run (newlines and tabs included) becomes one space, so
 * text taken from a feed or typed by a reader cannot introduce extra lines
 * into the line-oriented prompt.
 */
function toSingleLine(text: string, maxLength: number): string {
  return text.replace(/\s+/g, " ").trim().slice(0, maxLength);
}

/**
 * Builds the ranking prompt sent to the model.
 *
 * Layout, one entry per line: a fixed instruction, the reader's interests,
 * an "Items" header, one `index: title — summary` line per item (the
 * summary part is omitted when the item has none), a blank line, and the
 * response-format instruction.
 *
 * @param items Items to rank; each is referred to by its position in this array.
 * @param interests Free-form reader interests; flattened to one line and capped
 *   at MAX_INTERESTS_LENGTH.
 * @returns The prompt as a single newline-joined string.
 */
export function buildRankingPrompt(
  items: FeedItem[],
  interests: string,
): string {
  const lines = items.map((item, i) => {
    const title = toSingleLine(item.title, MAX_ITEM_TEXT_LENGTH);
    const summary = toSingleLine(item.summary ?? "", MAX_ITEM_TEXT_LENGTH);
    return `${i}: ${title}${summary ? ` — ${summary}` : ""}`;
  });
  return [
    "You are ranking a list of feed items by relevance to a reader's stated interests.",
    `Reader interests: ${toSingleLine(interests, MAX_INTERESTS_LENGTH)}`,
    "Items (index: title — summary):",
    ...lines,
    "",
    "Respond with ONLY a JSON array of the item indices above, ordered from most to least relevant to the interests. Include every index exactly once. No other text, no markdown.",
  ].join("\n");
}

/**
 * Defensively extracts a ranked index list from a raw model response.
 * Never throws — a response with no parseable JSON array yields [].
 * Indices outside [0, itemCount) or repeated are dropped (first occurrence
 * wins).
 *
 * Only JSON numbers and non-blank numeric strings count as indices. Other
 * JSON values (null, booleans, "", nested arrays, objects) are dropped
 * rather than coerced, because `Number(null)`, `Number("")`, `Number(false)`
 * and `Number([])` all evaluate to 0 and would otherwise be mistaken for a
 * vote for item 0.
 *
 * @param raw The model's raw text output; may contain prose or markdown
 *   around the array.
 * @param itemCount Number of items that were offered to the model.
 * @returns Valid, de-duplicated indices in the order the model gave them.
 */
export function parseRankedIndices(raw: string, itemCount: number): number[] {
  const values = extractJsonArray(raw);
  if (!values) return [];

  const seen = new Set<number>();
  const out: number[] = [];
  for (const value of values) {
    const index = toIndex(value);
    if (
      !Number.isInteger(index) ||
      index < 0 ||
      index >= itemCount ||
      seen.has(index)
    ) {
      continue;
    }
    seen.add(index);
    out.push(index);
  }
  return out;
}

/** Parses `text` as JSON and returns it only if it is an array; never throws. */
function parseJsonArray(text: string): unknown[] | undefined {
  try {
    const parsed: unknown = JSON.parse(text);
    return Array.isArray(parsed) ? parsed : undefined;
  } catch {
    return undefined;
  }
}

/**
 * Finds a JSON array inside free-form text. The widest `[`…`]` span is tried
 * first. If that is not valid JSON (e.g. the reply has other bracketed text
 * such as "[1, 0] (see [notes])"), each innermost bracket pair is tried and
 * the longest array that parses is returned (earliest wins ties).
 */
function extractJsonArray(raw: string): unknown[] | undefined {
  const widest = /\[[\s\S]*\]/.exec(raw);
  if (!widest) return undefined;

  const whole = parseJsonArray(widest[0]);
  if (whole) return whole;

  let best: unknown[] | undefined;
  for (const candidate of widest[0].match(/\[[^[\]]*\]/g) ?? []) {
    const parsed = parseJsonArray(candidate);
    if (parsed && (!best || parsed.length > best.length)) best = parsed;
  }
  return best;
}

/** Converts one parsed JSON value to a number, or NaN when it is not index-like. */
function toIndex(value: unknown): number {
  if (typeof value === "number") return value;
  if (typeof value === "string" && value.trim() !== "") return Number(value);
  return Number.NaN;
}

/**
 * Produces a complete ordering from a possibly partial ranking: the items
 * named by `rankedIndices` come first, in the order given, followed by every
 * item not yet placed, in the order it has in `items`.
 *
 * Indices that do not resolve to an item, and repeats of an index already
 * placed, are ignored, so a partial or empty ranking still returns all items.
 *
 * The base order of `items` is the caller's choice and need not be
 * chronological. For example, handlePersonalize passes similarity-ranked
 * input (core/personalize/similarity.ts) when the LLM runs as a polish pass
 * over a pre-filtered candidate pool; the remainder then keeps that
 * similarity order rather than reverting to chronological.
 */
export function mergeRankedOrder(
  items: FeedItem[],
  rankedIndices: number[],
): FeedItem[] {
  const placed = new Set<number>();
  const ordered: FeedItem[] = [];

  rankedIndices.forEach((position) => {
    if (placed.has(position)) return;
    const candidate = items[position];
    if (!candidate) return;
    placed.add(position);
    ordered.push(candidate);
  });

  // Everything the ranking did not place keeps its incoming relative order.
  for (const [position, entry] of items.entries()) {
    if (placed.has(position)) continue;
    ordered.push(entry);
  }
  return ordered;
}

/**
 * Builds the string key used to refer to a FeedItem across requests (for
 * example in a cached ranking). The domain type has no numeric id, so the
 * key is the item's `source` and `externalId` joined by a single space.
 */
export function itemKey(item: FeedItem): string {
  const { source, externalId } = item;
  return `${source} ${externalId}`;
}

/**
 * Applies a previously computed key ranking (e.g. from a cached ranking,
 * possibly made over a different or smaller pool) to a freshly fetched
 * item list.
 *
 * Items whose key appears in `rankedKeys` come first, in `rankedKeys` order.
 * Keys that match no current item, and repeated keys, are ignored. Items
 * whose key is not in `rankedKeys` (new items, or items the cache does not
 * cover) follow, in their relative order from `items`. As with
 * mergeRankedOrder, that order need not be chronological.
 *
 * Unlike mergeRankedOrder, the two lists may differ in length or contents,
 * since `items` is re-fetched live while `rankedKeys` may be stale or
 * partial.
 */
export function mergeRankedKeysOrder(
  items: FeedItem[],
  rankedKeys: string[],
): FeedItem[] {
  // Key -> item lookup; when two items share a key the later one is kept.
  const lookup = new Map<string, FeedItem>();
  for (const entry of items) {
    lookup.set(itemKey(entry), entry);
  }

  const emitted = new Set<string>();
  const ranked: FeedItem[] = [];
  for (const key of rankedKeys) {
    if (emitted.has(key)) continue;
    const hit = lookup.get(key);
    if (!hit) continue;
    emitted.add(key);
    ranked.push(hit);
  }

  const remainder = items.filter((entry) => !emitted.has(itemKey(entry)));
  return ranked.concat(remainder);
}
57 changes: 57 additions & 0 deletions core/personalize/similarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Pure helpers for ranking by embedding similarity — the retrieval half of
// the "For You" feature's retrieve-then-rerank pipeline. No I/O; adapters
// (e.g. platforms/cloudflare/src/embedder.ts) own the actual model call and
// platforms/cloudflare/src/index.ts's handlePersonalize wires this together
// with the existing LLM polish step (core/personalize/rank.ts).

import type { FeedItem } from "../domain.ts";
import { itemKey } from "./rank.ts";

/**
 * Cosine of the angle between two vectors.
 *
 * Only the first min(a.length, b.length) components of each vector are
 * used, for both the dot product and the norms. Returns 0 when either
 * vector has zero magnitude over that range (including when either is
 * empty), so the function never divides by zero.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  const shared = a.length < b.length ? a.length : b.length;

  let dotProduct = 0;
  let squaredA = 0;
  let squaredB = 0;
  let k = 0;
  while (k < shared) {
    const x = a[k]!;
    const y = b[k]!;
    dotProduct += x * y;
    squaredA += x * x;
    squaredB += y * y;
    k += 1;
  }

  const degenerate = squaredA === 0 || squaredB === 0;
  if (degenerate) return 0;
  return dotProduct / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}

/**
 * Orders `items` by cosine similarity to `interestsVector`, highest first.
 * Equal scores keep their relative order from `items`.
 *
 * An item with no vector in `embeddings` (not yet embedded at ingestion
 * time, or embedding generation failed that cycle) goes after all scored
 * items, in its relative order from `items`. If no item has a vector the
 * result is therefore the input order unchanged, not a reshuffle.
 *
 * mergeRankedOrder / mergeRankedKeysOrder in rank.ts then treat this
 * function's output as the base order for the LLM polish step.
 *
 * @param embeddings Vectors keyed by itemKey(item).
 */
export function rankBySimilarity(
  items: FeedItem[],
  embeddings: Map<string, number[]>,
  interestsVector: number[],
): FeedItem[] {
  const ranked: { item: FeedItem; score: number; position: number }[] = [];
  const leftovers: FeedItem[] = [];

  items.forEach((item, position) => {
    const vector = embeddings.get(itemKey(item));
    if (!vector) {
      leftovers.push(item);
      return;
    }
    ranked.push({
      item,
      position,
      score: cosineSimilarity(vector, interestsVector),
    });
  });

  // Descending by score; ties fall back to the original position.
  ranked.sort((left, right) =>
    left.score === right.score
      ? left.position - right.position
      : right.score - left.score,
  );

  return ranked.map(({ item }) => item).concat(leftovers);
}
15 changes: 15 additions & 0 deletions core/personalize/test/fakeEmbedder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import type { Embedder } from "../../ports.ts";

/**
 * Test double for Embedder that makes no network calls. The `script`
 * callback supplied at construction produces the returned vectors; having
 * it throw simulates a transport/availability failure (the returned promise
 * rejects). Same pattern as FakeLlmRanker.
 */
export class FakeEmbedder implements Embedder {
  /** Every `texts` argument passed to embed(), in call order. */
  readonly calls: string[][] = [];

  private readonly script: (texts: string[]) => number[][];

  constructor(script: (texts: string[]) => number[][]) {
    this.script = script;
  }

  async embed(texts: string[]): Promise<number[][]> {
    // Record the call before running the script so failed calls are logged too.
    this.calls.push(texts);
    const vectors = this.script(texts);
    return vectors;
  }
}
17 changes: 17 additions & 0 deletions core/personalize/test/fakeLlmRanker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import type { FeedItem } from "../../domain.ts";
import type { LlmRanker } from "../../ports.ts";

/**
 * Test double for LlmRanker that makes no network calls. The `script`
 * callback supplied at construction produces the returned ranking; having
 * it throw simulates a transport/availability failure (the returned promise
 * rejects).
 */
export class FakeLlmRanker implements LlmRanker {
  private readonly script: (items: FeedItem[], interests: string) => number[];

  constructor(script: (items: FeedItem[], interests: string) => number[]) {
    this.script = script;
  }

  async rank(items: FeedItem[], interests: string): Promise<number[]> {
    const ranking = this.script(items, interests);
    return ranking;
  }
}
166 changes: 166 additions & 0 deletions core/personalize/test/rank.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import { describe, expect, it } from "vitest";
import type { FeedItem } from "../../domain.ts";
import {
buildRankingPrompt,
itemKey,
mergeRankedKeysOrder,
mergeRankedOrder,
parseRankedIndices,
} from "../rank.ts";
import { FakeLlmRanker } from "./fakeLlmRanker.ts";

/**
 * Fixture factory: returns a FeedItem populated with fixed placeholder
 * values, with any fields in `overrides` replacing the placeholders. A new
 * object (including a new `metadata` object) is built on every call.
 */
function item(overrides: Partial<FeedItem>): FeedItem {
  const defaults: FeedItem = {
    source: "hackernews",
    externalId: "1",
    title: "title",
    url: "https://example.com",
    sourceRank: 1,
    metadata: {},
  };
  return Object.assign(defaults, overrides);
}

// buildRankingPrompt: checks the per-item "index: title — summary" lines, the
// "Reader interests:" line, and that overlong interests text is cut down.
describe("buildRankingPrompt", () => {
it("numbers items by index and includes title, summary, and interests", () => {
const items = [
item({ title: "Rust async runtime", summary: "a new executor" }),
item({ title: "No summary item" }),
];
const prompt = buildRankingPrompt(items, "rust, distributed systems");
expect(prompt).toContain("0: Rust async runtime — a new executor");
// An item without a summary is listed with its title only.
expect(prompt).toContain("1: No summary item");
expect(prompt).toContain("Reader interests: rust, distributed systems");
});

it("truncates oversized interests text", () => {
const prompt = buildRankingPrompt([], "x".repeat(1000));
const line = prompt.split("\n").find((l) => l.startsWith("Reader interests:"))!;
// The 18-character "Reader interests: " prefix plus at most
// MAX_INTERESTS_LENGTH (300) characters of interests stays under 320.
expect(line.length).toBeLessThan(320);
});
});

// parseRankedIndices: the parser must accept a clean array, dig the array
// out of surrounding text, filter invalid entries, and never throw.
describe("parseRankedIndices", () => {
it("parses a well-formed JSON array", () => {
expect(parseRankedIndices("[2, 0, 1]", 3)).toEqual([2, 0, 1]);
});

it("tolerates surrounding prose/markdown around the array", () => {
expect(parseRankedIndices("Sure! ```json\n[1, 0]\n```", 2)).toEqual([
1, 0,
]);
});

it("drops out-of-range and duplicate indices", () => {
// With itemCount = 2 only 0 and 1 are valid; the second 1, 5 and -1 are dropped.
expect(parseRankedIndices("[1, 1, 5, -1, 0]", 2)).toEqual([1, 0]);
});

it("returns [] for garbage output instead of throwing", () => {
expect(parseRankedIndices("not even close to json", 3)).toEqual([]);
expect(parseRankedIndices("", 3)).toEqual([]);
});
});

// mergeRankedOrder: ranked indices come first, unranked items follow in the
// order they had in the input array. Items are identified by externalId.
describe("mergeRankedOrder", () => {
const items = [item({ externalId: "a" }), item({ externalId: "b" }), item({ externalId: "c" })];

it("places ranked items first in model order, then appends the rest", () => {
const merged = mergeRankedOrder(items, [2, 0]);
expect(merged.map((i) => i.externalId)).toEqual(["c", "a", "b"]);
});

it("passes through original order when ranking is empty", () => {
expect(mergeRankedOrder(items, []).map((i) => i.externalId)).toEqual([
"a",
"b",
"c",
]);
});

it("ignores indices it doesn't recognize without throwing", () => {
// Index 99 is past the end of the 3-item list and is skipped.
const merged = mergeRankedOrder(items, [99, 1]);
expect(merged.map((i) => i.externalId)).toEqual(["b", "a", "c"]);
});

it("preserves a non-chronological base order for the remainder, e.g. a similarity-ranked pool", () => {
const shuffled = [
item({ externalId: "c" }),
item({ externalId: "a" }),
item({ externalId: "b" }),
];
// Index 2 is "b"; the remainder keeps the input order "c", "a".
const merged = mergeRankedOrder(shuffled, [2]);
expect(merged.map((i) => i.externalId)).toEqual(["b", "c", "a"]);
});
});

// mergeRankedKeysOrder: same contract as mergeRankedOrder, but the ranking
// is a list of itemKey() strings, which may be stale or repeated.
describe("mergeRankedKeysOrder", () => {
const items = [
item({ source: "hackernews", externalId: "a" }),
item({ source: "github", externalId: "b" }),
item({ source: "hackernews", externalId: "c" }),
];

it("places ranked items first by cached key order, then appends the rest", () => {
const merged = mergeRankedKeysOrder(items, [
itemKey(items[2]!),
itemKey(items[0]!),
]);
expect(merged.map((i) => i.externalId)).toEqual(["c", "a", "b"]);
});

it("tolerates a cached key for an item that no longer exists", () => {
// The first key is written in itemKey format but matches none of the items.
const merged = mergeRankedKeysOrder(items, [
"hackernews missing-id",
itemKey(items[1]!),
]);
expect(merged.map((i) => i.externalId)).toEqual(["b", "a", "c"]);
});

it("tolerates duplicate cached keys", () => {
const key = itemKey(items[1]!);
const merged = mergeRankedKeysOrder(items, [key, key]);
expect(merged.map((i) => i.externalId)).toEqual(["b", "a", "c"]);
});

it("passes through original order when there is no cached ranking", () => {
expect(mergeRankedKeysOrder(items, []).map((i) => i.externalId)).toEqual([
"a",
"b",
"c",
]);
});

it("preserves a non-chronological base order for the remainder, e.g. a similarity-ranked pool", () => {
const shuffled = [
item({ source: "hackernews", externalId: "c" }),
item({ source: "hackernews", externalId: "a" }),
item({ source: "github", externalId: "b" }),
];
// Only "b" is ranked; the remainder keeps the input order "c", "a".
const merged = mergeRankedKeysOrder(shuffled, [
itemKey(shuffled[2]!),
]);
expect(merged.map((i) => i.externalId)).toEqual(["b", "c", "a"]);
});
});

// FakeLlmRanker: shows the two outcomes a caller of LlmRanker.rank() has to
// handle, a returned ranking and a rejected promise.
describe("FakeLlmRanker + degrade pattern", () => {
const items = [item({ externalId: "a" }), item({ externalId: "b" })];

it("a successful rank reorders items", async () => {
const ranker = new FakeLlmRanker(() => [1, 0]);
const ranked = await ranker.rank(items, "anything");
expect(mergeRankedOrder(items, ranked).map((i) => i.externalId)).toEqual([
"b",
"a",
]);
});

it("a thrown error is the caller's signal to degrade to chronological order", async () => {
// This only asserts that a throwing script surfaces as a rejection from
// rank(); the fallback itself is the caller's job and is not exercised here.
const ranker = new FakeLlmRanker(() => {
throw new Error("model unavailable");
});
await expect(ranker.rank(items, "anything")).rejects.toThrow(
"model unavailable",
);
});
});
Loading