From d1e9ea9f2dc99b3b15c427c06250315ff35ebe32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Tue, 19 May 2026 10:18:55 +0200 Subject: [PATCH 1/5] wip --- apps/api/src/controllers/track.controller.ts | 35 +- apps/api/src/utils/ids.ts | 51 +- apps/worker/src/boot-cron.ts | 5 + apps/worker/src/jobs/cron.session-reaper.ts | 219 +++++++ apps/worker/src/jobs/cron.ts | 4 + .../src/jobs/events.create-session-end.ts | 89 ++- apps/worker/src/jobs/events.incoming-event.ts | 114 ++-- .../src/jobs/events.incoming-events.test.ts | 577 +++++++----------- apps/worker/src/metrics.ts | 187 +++++- apps/worker/src/utils/session-handler.ts | 121 ++-- packages/db/scripts/drain-old-session-jobs.ts | 79 +++ packages/db/scripts/migrate-sessions.ts | 155 +++++ packages/db/src/buffers/index.ts | 5 + .../db/src/buffers/session-buffer.test.ts | 235 +++++-- packages/db/src/buffers/session-buffer.ts | 553 ++++++++++------- packages/db/src/services/event.service.ts | 16 +- packages/db/src/services/session.service.ts | 14 +- packages/queue/src/queues.ts | 7 +- 18 files changed, 1580 insertions(+), 886 deletions(-) create mode 100644 apps/worker/src/jobs/cron.session-reaper.ts create mode 100644 packages/db/scripts/drain-old-session-jobs.ts create mode 100644 packages/db/scripts/migrate-sessions.ts diff --git a/apps/api/src/controllers/track.controller.ts b/apps/api/src/controllers/track.controller.ts index 2cda17f2a..1c0c743c2 100644 --- a/apps/api/src/controllers/track.controller.ts +++ b/apps/api/src/controllers/track.controller.ts @@ -5,6 +5,7 @@ import { getSalts, groupBuffer, replayBuffer, + sessionBuffer, upsertProfile, } from '@openpanel/db'; import { type GeoLocation, getGeoLocation } from '@openpanel/geo'; @@ -14,7 +15,6 @@ import { produceIncomingEvent, shouldUseKafka, } from '@openpanel/queue'; -import { getRedisCache } from '@openpanel/redis'; import type { IAssignGroupPayload, IDecrementPayload, @@ -486,32 +486,29 @@ export async function fetchDeviceId( }); try { - const multi = getRedisCache().multi(); - multi.hget( - `bull:sessions:sessionEnd:${projectId}:${currentDeviceId}`, - 'data' - ); - multi.hget( - `bull:sessions:sessionEnd:${projectId}:${previousDeviceId}`, - 'data' - ); - const res = await multi.exec(); - if (res?.[0]?.[1]) { - const data = JSON.parse(res?.[0]?.[1] as string); - const sessionId = data.payload.sessionId; + const [current, previous] = await Promise.all([ + sessionBuffer.getExistingSession({ + projectId, + deviceId: currentDeviceId, + }), + sessionBuffer.getExistingSession({ + projectId, + deviceId: previousDeviceId, + }), + ]); + + if (current) { return reply.status(200).send({ deviceId: currentDeviceId, - sessionId, + sessionId: current.id, message: 'current session exists for this device id', }); } - if (res?.[1]?.[1]) { - const data = JSON.parse(res?.[1]?.[1] as string); - const sessionId = data.payload.sessionId; + if (previous) { return reply.status(200).send({ deviceId: previousDeviceId, - sessionId, + sessionId: previous.id, message: 'previous session exists for this device id', }); } diff --git a/apps/api/src/utils/ids.ts b/apps/api/src/utils/ids.ts index ec0cfc993..7ea035519 100644 --- a/apps/api/src/utils/ids.ts +++ b/apps/api/src/utils/ids.ts @@ -1,8 +1,6 @@ import crypto from 'node:crypto'; import { generateDeviceId } from '@openpanel/common/server'; -import { getSafeJson } from '@openpanel/json'; -import type { EventsQueuePayloadCreateSessionEnd } from '@openpanel/queue'; -import { getRedisCache } from '@openpanel/redis'; +import { sessionBuffer } from '@openpanel/db'; export async function getDeviceId({ projectId, @@ -66,40 +64,25 @@ async function getInfoFromSession({ previousDeviceId: string; }): Promise { try { - const multi = getRedisCache().multi(); - multi.hget( - `bull:sessions:sessionEnd:${projectId}:${currentDeviceId}`, - 'data' - ); - multi.hget( - `bull:sessions:sessionEnd:${projectId}:${previousDeviceId}`, - 'data' - ); - const res = await multi.exec(); - if (res?.[0]?.[1]) { - const data = getSafeJson( - (res?.[0]?.[1] as string) ?? '' - ); - if (data) { - return { - deviceId: currentDeviceId, - sessionId: data.payload.sessionId, - }; - } + const [current, previous] = await Promise.all([ + sessionBuffer.getExistingSession({ + projectId, + deviceId: currentDeviceId, + }), + sessionBuffer.getExistingSession({ + projectId, + deviceId: previousDeviceId, + }), + ]); + + if (current) { + return { deviceId: currentDeviceId, sessionId: current.id }; } - if (res?.[1]?.[1]) { - const data = getSafeJson( - (res?.[1]?.[1] as string) ?? '' - ); - if (data) { - return { - deviceId: previousDeviceId, - sessionId: data.payload.sessionId, - }; - } + if (previous) { + return { deviceId: previousDeviceId, sessionId: previous.id }; } } catch (error) { - console.error('Error getting session end GET /track/device-id', error); + console.error('Error getting session GET /track/device-id', error); } return { diff --git a/apps/worker/src/boot-cron.ts b/apps/worker/src/boot-cron.ts index d2458076e..fc9587386 100644 --- a/apps/worker/src/boot-cron.ts +++ b/apps/worker/src/boot-cron.ts @@ -93,6 +93,11 @@ export async function bootCron() { type: 'cohortRefresh', pattern: '*/30 * * * *', }, + { + name: 'sessionReaper', + type: 'sessionReaper', + pattern: 1000 * 60 * 5, // every 5 minutes + }, ]; if (process.env.SELF_HOSTED && process.env.NODE_ENV === 'production') { diff --git a/apps/worker/src/jobs/cron.session-reaper.ts b/apps/worker/src/jobs/cron.session-reaper.ts new file mode 100644 index 000000000..2cf9c4945 --- /dev/null +++ b/apps/worker/src/jobs/cron.session-reaper.ts @@ -0,0 +1,219 @@ +import { SESSION_TIMEOUT_MS, sessionBuffer } from '@openpanel/db'; +import { getRedisCache } from '@openpanel/redis'; +import { logger as baseLogger } from '@/utils/logger'; +import { sessionEndsEnqueued, sessionsReaped } from '@/metrics'; +import { enqueueSessionEndV2 } from '@/utils/session-handler'; + +const logger = baseLogger.child({ job: 'session-reaper' }); + +const PROJECTS_SET_KEY = 'session:projects'; +const REAPER_BATCH_SIZE = Number.parseInt( + process.env.SESSION_REAPER_BATCH_SIZE || '5000', + 10 +); +// 24h deadman: close sessions whose host project has gone silent in event-time +// AND whose last wall-clock-received is older than this. Bounds Redis memory +// for projects that fall idle. +const WALLCLOCK_DEADMAN_MS = Number.parseInt( + process.env.SESSION_REAPER_WALLCLOCK_DEADMAN_MS || + String(1000 * 60 * 60 * 24), + 10 +); +const LOCK_TTL_SECONDS = 60; + +const activeSetKey = (projectId: string) => `session:active:${projectId}`; +const wallclockSetKey = (projectId: string) => + `session:wallclock:${projectId}`; +const sessionKey = (projectId: string, deviceId: string) => + `session:${projectId}:${deviceId}`; +const hwmKey = (projectId: string) => `session:hwm:${projectId}`; +const lockKey = (projectId: string) => `session:reaper:lock:${projectId}`; + +/** + * Cron: scan every project with active sessions, close any whose last event + * is older than the timeout in event-time (HWM-based) or older than the + * 24h wall-clock deadman. + * + * Idempotent — uses BullMQ jobId-dedup via enqueueSessionEndV2. + */ +export async function sessionReaperCronJob() { + if (process.env.SESSION_REAPER === '0') { + return; + } + + const redis = getRedisCache(); + const projectIds = await redis.smembers(PROJECTS_SET_KEY); + + if (projectIds.length === 0) { + return; + } + + logger.debug({ projectCount: projectIds.length }, 'Reaper tick'); + + let totalReaped = 0; + let totalErrors = 0; + + for (const projectId of projectIds) { + try { + const reaped = await reapProject(projectId); + totalReaped += reaped; + } catch (error) { + totalErrors++; + logger.error({ err: error, projectId }, 'Failed to reap project'); + } + } + + if (totalReaped > 0 || totalErrors > 0) { + logger.info( + { reaped: totalReaped, errors: totalErrors, projects: projectIds.length }, + 'Reaper tick complete', + ); + } +} + +async function reapProject(projectId: string): Promise { + const redis = getRedisCache(); + + // Per-project advisory lock keeps multiple worker pods from reaping the + // same project simultaneously. Set NX with a short TTL; the lock is best + // effort — if it expires mid-tick, BullMQ jobId-dedup is the real guard. + const locked = await redis.set( + lockKey(projectId), + '1', + 'EX', + LOCK_TTL_SECONDS, + 'NX' + ); + if (locked === null) { + return 0; + } + + try { + const hwmRaw = await redis.get(hwmKey(projectId)); + const now = Date.now(); + let reaped = 0; + + // 1. Event-time reap: close sessions whose last event is more than + // SESSION_TIMEOUT_MS behind the project's high-water mark. + if (hwmRaw) { + const hwm = Number.parseInt(hwmRaw, 10); + if (Number.isFinite(hwm)) { + const threshold = hwm - SESSION_TIMEOUT_MS; + const candidates = await redis.zrangebyscore( + activeSetKey(projectId), + 0, + threshold, + 'LIMIT', + 0, + REAPER_BATCH_SIZE + ); + for (const deviceId of candidates) { + if (await closeSession(projectId, deviceId, 'event-time')) { + reaped++; + } + } + } + } + + // 2. Wall-clock deadman: catches sessions in projects whose HWM never + // advances (low/zero traffic). Bounds memory. + const deadmanCutoff = now - WALLCLOCK_DEADMAN_MS; + const stragglers = await redis.zrangebyscore( + wallclockSetKey(projectId), + 0, + deadmanCutoff, + 'LIMIT', + 0, + REAPER_BATCH_SIZE + ); + for (const deviceId of stragglers) { + if (await closeSession(projectId, deviceId, 'deadman')) { + reaped++; + } + } + + // 3. House-keeping: if the project has no active sessions anymore, + // remove it from the projects set so the reaper stops iterating it. + const remaining = await redis.zcard(activeSetKey(projectId)); + if (remaining === 0) { + await redis.srem(PROJECTS_SET_KEY, projectId); + } + + return reaped; + } finally { + await redis.del(lockKey(projectId)); + } +} + +async function closeSession( + projectId: string, + deviceId: string, + reason: 'event-time' | 'deadman' +): Promise { + const session = await sessionBuffer.getExistingSession({ + projectId, + deviceId, + }); + + if (!session) { + // Already cleaned up (e.g., session_end ran from another path). Drop + // from the sorted sets to keep them tidy. + const redis = getRedisCache(); + const multi = redis.multi(); + multi.zrem(activeSetKey(projectId), deviceId); + multi.zrem(wallclockSetKey(projectId), deviceId); + await multi.exec(); + return false; + } + + try { + await enqueueSessionEndV2({ + payload: { + projectId: session.project_id, + deviceId: session.device_id, + sessionId: session.id, + profileId: session.profile_id ?? '', + name: 'session_end', + properties: {}, + groups: session.groups ?? [], + createdAt: new Date(session.ended_at), + duration: session.duration ?? 0, + sdkName: '', + sdkVersion: '', + city: session.city ?? '', + country: session.country ?? '', + region: session.region ?? '', + longitude: session.longitude ?? undefined, + latitude: session.latitude ?? undefined, + path: session.exit_path ?? '', + origin: session.exit_origin ?? '', + referrer: session.referrer ?? '', + referrerName: session.referrer_name ?? '', + referrerType: session.referrer_type ?? '', + os: session.os ?? '', + osVersion: session.os_version ?? '', + browser: session.browser ?? '', + browserVersion: session.browser_version ?? '', + device: session.device ?? '', + brand: session.brand ?? '', + model: session.model ?? '', + }, + closedSession: session, + }); + + sessionsReaped.inc({ reason }); + sessionEndsEnqueued.inc({ source: 'reaper' }); + + logger.debug( + { sessionId: session.id, projectId, deviceId, reason }, + 'Enqueued session_end (reaped)', + ); + return true; + } catch (error) { + logger.error( + { err: error, sessionId: session.id, projectId, deviceId, reason }, + 'Failed to enqueue session_end during reap', + ); + return false; + } +} diff --git a/apps/worker/src/jobs/cron.ts b/apps/worker/src/jobs/cron.ts index 305c30471..214ce0bdf 100644 --- a/apps/worker/src/jobs/cron.ts +++ b/apps/worker/src/jobs/cron.ts @@ -9,6 +9,7 @@ import { gscSyncAllJob } from './gsc'; import { onboardingJob } from './cron.onboarding'; import { ping } from './cron.ping'; import { salt } from './cron.salt'; +import { sessionReaperCronJob } from './cron.session-reaper'; import { insightsDailyJob } from './insights'; export async function cronJob(job: Job) { @@ -52,5 +53,8 @@ export async function cronJob(job: Job) { case 'cohortRefresh': { return await cohortRefreshCronJob(); } + case 'sessionReaper': { + return await sessionReaperCronJob(); + } } } diff --git a/apps/worker/src/jobs/events.create-session-end.ts b/apps/worker/src/jobs/events.create-session-end.ts index 32e901ad6..972da066e 100644 --- a/apps/worker/src/jobs/events.create-session-end.ts +++ b/apps/worker/src/jobs/events.create-session-end.ts @@ -16,45 +16,40 @@ import { transformSessionToEvent, } from '@openpanel/db'; import type { EventsQueuePayloadCreateSessionEnd } from '@openpanel/queue'; +import { getRedisCache } from '@openpanel/redis'; import type { Job } from 'bullmq'; import { logger as baseLogger } from '@/utils/logger'; +import { + sessionDurationOnClose, + sessionEndsEmitted, + sessionEndsSkipped, + sessionEventsOnClose, +} from '@/metrics'; const MAX_SESSION_EVENTS = 500; +const SESSION_END_CLAIM_TTL = 60 * 60 * 2; // 2h, well past any plausible retry window -// Grabs session_start and the last occured event async function getSessionEvents({ - projectId, - sessionId, + session, startAt, endAt, }: { - projectId: string; - sessionId: string; + session: IClickhouseSession; startAt: Date; endAt: Date; }): Promise { const sql = ` SELECT * FROM ${TABLE_NAMES.events} WHERE - session_id = '${sessionId}' - AND project_id = '${projectId}' + session_id = '${session.id}' + AND project_id = '${session.project_id}' AND created_at BETWEEN '${formatClickhouseDate(new Date(startAt.getTime() - 1000))}' AND '${formatClickhouseDate(new Date(endAt.getTime() + 1000))}' ORDER BY created_at DESC LIMIT ${MAX_SESSION_EVENTS}; `; - const [lastScreenView, eventsInDb] = await Promise.all([ - sessionBuffer.getExistingSession({ - sessionId, - }), - getEvents(sql), - ]); - - // sort last inserted first - return [ - lastScreenView ? transformSessionToEvent(lastScreenView) : null, - ...eventsInDb, - ] - .flatMap((event) => (event ? [event] : [])) + const eventsInDb = await getEvents(sql); + + return [transformSessionToEvent(session), ...eventsInDb] .sort( (a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime() @@ -73,13 +68,42 @@ export async function createSessionEnd( logger.debug('Processing session end job'); const session = await sessionBuffer.getExistingSession({ - sessionId: payload.sessionId, + projectId: payload.projectId, + deviceId: payload.deviceId, }); if (!session) { - throw new Error('Session not found'); + // Session already cleaned up — likely already ended by a sibling job or + // reaped after Redis TTL. Treat as a no-op rather than a hard error so + // retries don't spam the queue. + sessionEndsSkipped.inc({ reason: 'not_found' }); + logger.warn( + { projectId: payload.projectId, deviceId: payload.deviceId }, + 'Session not found in Redis — skipping session_end', + ); + return null; } + // Idempotency claim: first writer wins. Subsequent retries / reaper + // double-fires are no-ops. + const redis = getRedisCache(); + const claimKey = `session:end:emitted:${session.project_id}:${session.device_id}:${session.id}`; + const claimed = await redis.set(claimKey, '1', 'EX', SESSION_END_CLAIM_TTL, 'NX'); + if (claimed === null) { + sessionEndsSkipped.inc({ reason: 'already_emitted' }); + logger.info( + { sessionId: session.id, projectId: session.project_id }, + 'session_end already emitted, skipping', + ); + return null; + } + + sessionEndsEmitted.inc(); + sessionDurationOnClose.observe(Math.max(0, session.duration ?? 0)); + sessionEventsOnClose.observe( + (session.event_count ?? 0) + (session.screen_view_count ?? 0) + ); + const profileId = session.profile_id || payload.profileId; if ( @@ -96,7 +120,7 @@ export async function createSessionEnd( ) { await profileBackfillBuffer.add({ projectId: payload.projectId, - sessionId: payload.sessionId, + sessionId: session.id, profileId, }); } @@ -105,6 +129,7 @@ export async function createSessionEnd( // Create session end event const { document: sessionEndEvent } = await createEvent({ ...payload, + sessionId: session.id, properties: { ...payload.properties, __bounce: session.is_bounce, @@ -131,6 +156,21 @@ export async function createSessionEnd( ); } + // Clean up Redis state for this session. The session blob, profile index, + // and active/wallclock sorted set entries are all removed atomically. + await sessionBuffer + .cleanup({ + projectId: session.project_id, + deviceId: session.device_id, + profileId: session.profile_id, + }) + .catch((error) => { + logger.error( + { err: error, sessionId: session.id }, + 'Failed to cleanup session state after session_end', + ); + }); + return sessionEndEvent; } @@ -152,8 +192,7 @@ async function handleSessionEndNotifications({ if (hasFunnelRules && isEventCountReasonable) { const events = await getSessionEvents({ - projectId: payload.projectId, - sessionId: payload.sessionId, + session, startAt: convertClickhouseDateToJs(session.created_at), endAt: convertClickhouseDateToJs(session.ended_at), }); diff --git a/apps/worker/src/jobs/events.incoming-event.ts b/apps/worker/src/jobs/events.incoming-event.ts index cf89391fe..0cac12864 100644 --- a/apps/worker/src/jobs/events.incoming-event.ts +++ b/apps/worker/src/jobs/events.incoming-event.ts @@ -11,22 +11,16 @@ import { import type { ILogger } from '@openpanel/logger'; import type { EventsQueuePayloadIncomingEvent } from '@openpanel/queue'; import { anyPass, isEmpty, isNil, mergeDeepRight, omit, reject } from 'ramda'; +import { sessionEndsEnqueued, sessionsStarted } from '@/metrics'; import { logger as baseLogger } from '@/utils/logger'; -import { - createSessionEndJob, - extendSessionEndJob, - getActiveSessionEndJob, -} from '@/utils/session-handler'; +import { enqueueSessionEndV2 } from '@/utils/session-handler'; const GLOBAL_PROPERTIES = ['__path', '__referrer', '__timestamp', '__revenue']; -// This function will merge two objects. -// First it will strip '' and undefined/null from B -// Then it will merge the two objects with a standard ramda merge function +// Strip empty/nullish from B, then deep-merge over A. const merge = (a: Partial, b: Partial): A & B => mergeDeepRight(a, reject(anyPass([isEmpty, isNil]))(b)) as A & B; -/** Check if payload matches project-level event exclude filters */ async function isEventExcludedByProjectFilter( payload: IServiceCreateEventPayload, projectId: string @@ -46,12 +40,11 @@ async function createEventAndNotify( logger: ILogger, projectId: string ) { - // Check project-level event exclude filters const isExcluded = await isEventExcludedByProjectFilter(payload, projectId); if (isExcluded) { logger.info( { event: payload.name, projectId }, - 'Event excluded by project filter', + 'Event excluded by project filter' ); return null; } @@ -61,7 +54,6 @@ async function createEventAndNotify( createEvent(payload), checkNotificationRulesForEvent(payload).catch(() => null), ]); - return event; } @@ -74,10 +66,7 @@ const parseRevenue = (revenue: unknown): number | undefined => { } if (typeof revenue === 'string') { const parsed = Number.parseFloat(revenue); - if (Number.isNaN(parsed)) { - return undefined; - } - return parsed; + return Number.isNaN(parsed) ? undefined : parsed; } return undefined; }; @@ -117,7 +106,6 @@ export async function incomingEvent( ); }; - // this will get the profileId from the alias table if it exists const profileId = body.profileId ? String(body.profileId) : ''; const createdAt = new Date(body.timestamp); const isTimestampFromThePast = body.isTimestampFromThePast; @@ -169,14 +157,13 @@ export async function incomingEvent( : undefined, }; - // if timestamp is from the past we dont want to create a new session + // Server-side and "timestamp from the past" events ride alongside an + // existing client session if there is one — they don't open / close + // sessions of their own. if (uaInfo.isServer || isTimestampFromThePast) { const session = profileId && !isTimestampFromThePast - ? await sessionBuffer.getExistingSession({ - profileId, - projectId, - }) + ? await sessionBuffer.getExistingSession({ profileId, projectId }) : null; const payload = { @@ -205,60 +192,67 @@ export async function incomingEvent( return createEventAndNotify(payload as IServiceEvent, logger, projectId); } - const activeSessionEndJob = await getActiveSessionEndJob( - projectId, - deviceId, - ); - const activeSessionPayload = activeSessionEndJob?.data.payload; - - const payload: IServiceCreateEventPayload = merge(baseEvent, { - referrer: activeSessionPayload?.referrer ?? baseEvent.referrer, - referrerName: activeSessionPayload?.referrerName ?? baseEvent.referrerName, - referrerType: activeSessionPayload?.referrerType ?? baseEvent.referrerType, - } as Partial) as IServiceCreateEventPayload; - - const isExcluded = await isEventExcludedByProjectFilter(payload, projectId); - if (isExcluded) { + if (await isEventExcludedByProjectFilter(baseEvent, projectId)) { logger.info( - { event: payload.name, projectId }, - 'Skipping session_start and event (excluded by project filter)', + { event: baseEvent.name, projectId }, + 'Skipping session_start and event (excluded by project filter)' ); return null; } - if (activeSessionEndJob) { - await extendSessionEndJob({ - projectId, - deviceId, - job: activeSessionEndJob, - }).catch((error) => { - logger.warn({ err: error }, 'Failed to extend session end job'); - }); - } else { + // The single source of truth for session lifecycle. Reads the current + // session, decides extend/new/boundary, writes back. The returned + // `current` is the canonical session — use its referrer fields for + // inheritance, just like the previous behavior. + const session = await sessionBuffer.ingest(baseEvent); + + if (session?.kind === 'boundary') { + // Close the old session in a separate job (one Redis-buffered insert + // for the session_end event + notification rule check). Idempotent via + // BullMQ jobId dedup. + await enqueueSessionEndV2({ + payload: baseEvent, + closedSession: session.closed, + }) + .then(() => sessionEndsEnqueued.inc({ source: 'boundary' })) + .catch((error) => { + logger.error( + { err: error, deviceId, sessionId: session.closed.id }, + 'Error enqueueing session_end on boundary' + ); + }); + } + + if (session?.kind === 'new' || session?.kind === 'boundary') { + sessionsStarted.inc({ kind: session.kind }); await createEventAndNotify( { - ...payload, + ...baseEvent, name: 'session_start', - createdAt: new Date(getTime(payload.createdAt) - 100), + createdAt: new Date(getTime(baseEvent.createdAt) - 100), }, logger, projectId ).catch((error) => { logger.error( - { err: error, event: payload }, - 'Error creating session start event', - ); - throw error; - }); - - await createSessionEndJob({ payload }).catch((error) => { - logger.error( - { err: error, event: payload }, - 'Error creating session end job', + { err: error, event: baseEvent }, + 'Error creating session start event' ); throw error; }); } - return createEventAndNotify(payload, logger, projectId); + // Inherit referrer fields from the canonical session for the actual event. + // For 'extend' this preserves the original session's referrer across + // mid-session events; for 'new' / 'boundary' it's the event's own referrer + // (which `ingest` just stored on the fresh session). + const finalPayload: IServiceCreateEventPayload = session + ? merge(baseEvent, { + referrer: session.current.referrer, + referrerName: session.current.referrer_name, + referrerType: session.current.referrer_type, + } as Partial) + : baseEvent; + + return createEventAndNotify(finalPayload, logger, projectId); } diff --git a/apps/worker/src/jobs/events.incoming-events.test.ts b/apps/worker/src/jobs/events.incoming-events.test.ts index 30547d7dc..d1a6e7af9 100644 --- a/apps/worker/src/jobs/events.incoming-events.test.ts +++ b/apps/worker/src/jobs/events.incoming-events.test.ts @@ -23,15 +23,13 @@ vi.mock('@openpanel/db', async () => { matchEvent: vi.fn().mockReturnValue(false), sessionBuffer: { getExistingSession: vi.fn(), + ingest: vi.fn(), }, }; }); -// 30 minutes -const SESSION_TIMEOUT = 30 * 60 * 1000; const projectId = 'test-project'; const deviceId = 'device-123'; -// Valid UUID used when creating a new session in tests const newSessionId = 'a1b2c3d4-e5f6-4789-a012-345678901234'; const geo = { country: 'US', @@ -63,194 +61,165 @@ const uaInfoServer: EventsQueuePayloadIncomingEvent['payload']['uaInfo'] = { model: '', }; +function makeSession( + overrides: Partial = {} +): IClickhouseSession { + const now = new Date(); + return { + id: 'session-existing', + project_id: projectId, + device_id: deviceId, + profile_id: '', + event_count: 1, + screen_view_count: 0, + entry_path: '/', + entry_origin: 'https://example.com', + exit_path: '/', + exit_origin: 'https://example.com', + created_at: formatClickhouseDate(now), + ended_at: formatClickhouseDate(now), + os: 'Windows', + os_version: '10', + browser: 'Chrome', + browser_version: '91.0.4472.124', + device: 'desktop', + brand: '', + model: '', + country: 'US', + region: 'NY', + city: 'New York', + longitude: 0, + latitude: 0, + duration: 0, + referrer: '', + referrer_name: '', + referrer_type: '', + is_bounce: true, + utm_term: '', + utm_source: '', + utm_campaign: '', + utm_content: '', + utm_medium: '', + revenue: 0, + sign: 1, + version: 1, + groups: [], + ...overrides, + } satisfies IClickhouseSession; +} + +function buildJobData( + overrides: Partial = {} +): EventsQueuePayloadIncomingEvent['payload'] { + return { + geo, + event: { + name: 'test_event', + timestamp: new Date().toISOString(), + isTimestampFromThePast: false, + properties: { __path: 'https://example.com/test' }, + }, + uaInfo, + headers: { + 'request-id': '123', + 'user-agent': 'Mozilla/5.0', + 'openpanel-sdk-name': 'web', + 'openpanel-sdk-version': '1.0.0', + }, + projectId, + deviceId, + sessionId: newSessionId, + ...overrides, + }; +} + describe('incomingEvent', () => { beforeEach(() => { vi.clearAllMocks(); + (createEvent as Mock).mockImplementation((event) => event); }); - it('should create a session start and an event', async () => { - const spySessionsQueueAdd = vi - .spyOn(sessionsQueue, 'add') - .mockResolvedValue({} as Job); - const timestamp = new Date(); - // Mock job data - const jobData: EventsQueuePayloadIncomingEvent['payload'] = { - geo, - event: { - name: 'test_event', - timestamp: timestamp.toISOString(), - isTimestampFromThePast: false, - properties: { __path: 'https://example.com/test' }, - }, - uaInfo, - headers: { - 'request-id': '123', - 'user-agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'openpanel-sdk-name': 'web', - 'openpanel-sdk-version': '1.0.0', - }, - projectId, - deviceId, - sessionId: newSessionId, - }; - const event = { - name: 'test_event', - deviceId, - profileId: '', - sessionId: expect.stringMatching( - // biome-ignore lint/performance/useTopLevelRegex: test - /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i - ), - projectId, - properties: { - __hash: undefined, - __query: undefined, - }, - createdAt: timestamp, - country: 'US', - city: 'New York', - region: 'NY', - revenue: undefined, - longitude: 0, - latitude: 0, - os: 'Windows', - osVersion: '10', - browser: 'Chrome', - browserVersion: '91.0.4472.124', - device: 'desktop', - brand: '', - model: '', - duration: 0, - path: '/test', - origin: 'https://example.com', - referrer: '', - referrerName: '', - referrerType: '', - sdkName: jobData.headers['openpanel-sdk-name'], - sdkVersion: jobData.headers['openpanel-sdk-version'], - groups: [], - }; + it('emits session_start when ingest returns kind="new"', async () => { + vi.mocked(sessionBuffer.ingest).mockResolvedValueOnce({ + kind: 'new', + current: makeSession({ id: newSessionId }), + }); - (createEvent as Mock).mockReturnValue(event); + await incomingEvent(buildJobData()); - // Execute the job - await incomingEvent(jobData); + expect(sessionsQueue.add).not.toHaveBeenCalled(); + const calls = (createEvent as Mock).mock.calls; + expect(calls).toHaveLength(2); + expect(calls[0]![0].name).toBe('session_start'); + expect(calls[1]![0].name).toBe('test_event'); + }); - expect(spySessionsQueueAdd).toHaveBeenCalledWith( - 'session', - { - type: 'createSessionEnd', - payload: expect.objectContaining(event), - }, - { - delay: SESSION_TIMEOUT, - jobId: `sessionEnd:${projectId}:${deviceId}`, - attempts: 3, - backoff: { - delay: 200, - type: 'exponential', - }, - } + it('skips session_start when ingest returns kind="extend"', async () => { + vi.mocked(sessionBuffer.ingest).mockResolvedValueOnce({ + kind: 'extend', + current: makeSession(), + }); + + await incomingEvent(buildJobData()); + + expect(sessionsQueue.add).not.toHaveBeenCalled(); + const sessionStartCalls = (createEvent as Mock).mock.calls.filter( + ([arg]) => arg?.name === 'session_start' ); + expect(sessionStartCalls).toHaveLength(0); + }); - expect((createEvent as Mock).mock.calls[0]![0]).toStrictEqual({ - ...event, - createdAt: new Date(timestamp.getTime() - 100), - name: 'session_start', + it('closes old session and emits session_start when ingest returns kind="boundary"', async () => { + const closed = makeSession({ id: 'old-session-id' }); + const current = makeSession({ id: newSessionId }); + vi.mocked(sessionBuffer.ingest).mockResolvedValueOnce({ + kind: 'boundary', + current, + closed, }); - expect((createEvent as Mock).mock.calls[1]).toMatchObject([event]); - }); - it('should reuse existing session', async () => { - const spySessionsQueueAdd = vi.spyOn(sessionsQueue, 'add'); - const spySessionsQueueGetJob = vi.spyOn(sessionsQueue, 'getJob'); + const spy = vi + .spyOn(sessionsQueue, 'add') + .mockResolvedValue({} as Job); - const timestamp = new Date(); - // Mock job data - const jobData: EventsQueuePayloadIncomingEvent['payload'] = { - geo, - event: { - name: 'test_event', - timestamp: timestamp.toISOString(), - properties: { __path: 'https://example.com/test' }, - isTimestampFromThePast: false, - }, - headers: { - 'request-id': '123', - 'user-agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'openpanel-sdk-name': 'web', - 'openpanel-sdk-version': '1.0.0', - }, - uaInfo, - projectId, - deviceId, - sessionId: 'session-123', - }; + await incomingEvent(buildJobData()); - const changeDelay = vi.fn(); - const updateData = vi.fn(); - spySessionsQueueGetJob.mockResolvedValueOnce({ - getState: vi.fn().mockResolvedValue('delayed'), - updateData, - changeDelay, - data: { - type: 'createSessionEnd', - payload: { - sessionId: 'session-123', - deviceId, - }, - }, - } as Partial as Job); - // Execute the job - await incomingEvent(jobData); + expect(spy).toHaveBeenCalledTimes(1); + const [, payload, opts] = spy.mock.calls[0]!; + expect((payload as any).type).toBe('createSessionEnd'); + expect((payload as any).payload.sessionId).toBe('old-session-id'); + expect(opts?.jobId).toBe( + `sessionEnd:v2:${projectId}:${deviceId}:old-session-id` + ); - const event = { - name: 'test_event', - deviceId, - profileId: '', - sessionId: 'session-123', - projectId, - properties: { - __hash: undefined, - __query: undefined, - }, - createdAt: timestamp, - country: 'US', - city: 'New York', - region: 'NY', - revenue: undefined, - longitude: 0, - latitude: 0, - os: 'Windows', - osVersion: '10', - browser: 'Chrome', - browserVersion: '91.0.4472.124', - device: 'desktop', - brand: '', - model: '', - duration: 0, - path: '/test', - origin: 'https://example.com', - referrer: '', - referrerName: '', - referrerType: '', - sdkName: jobData.headers['openpanel-sdk-name'], - sdkVersion: jobData.headers['openpanel-sdk-version'], - groups: [], - }; + const calls = (createEvent as Mock).mock.calls; + expect(calls.filter(([a]) => a?.name === 'session_start')).toHaveLength(1); + expect(calls.filter(([a]) => a?.name === 'test_event')).toHaveLength(1); + }); + + it('inherits referrer from current session on the actual event', async () => { + vi.mocked(sessionBuffer.ingest).mockResolvedValueOnce({ + kind: 'extend', + current: makeSession({ + referrer: 'https://google.com', + referrer_name: 'Google', + referrer_type: 'search', + }), + }); + + await incomingEvent(buildJobData()); - expect(spySessionsQueueAdd).toHaveBeenCalledTimes(0); - expect(changeDelay).toHaveBeenCalledWith(SESSION_TIMEOUT); - expect(createEvent as Mock).toBeCalledTimes(1); - expect((createEvent as Mock).mock.calls[0]![0]).toStrictEqual(event); + const testEventCall = (createEvent as Mock).mock.calls.find( + ([a]) => a?.name === 'test_event' + ); + expect(testEventCall![0].referrer).toBe('https://google.com'); + expect(testEventCall![0].referrerName).toBe('Google'); + expect(testEventCall![0].referrerType).toBe('search'); }); - it('should handle server events (with existing screen view)', async () => { + it('handles server events with existing profile session', async () => { const timestamp = new Date(); - const jobData: EventsQueuePayloadIncomingEvent['payload'] = { - geo, + const jobData = buildJobData({ event: { name: 'server_event', timestamp: timestamp.toISOString(), @@ -258,231 +227,109 @@ describe('incomingEvent', () => { profileId: 'profile-123', isTimestampFromThePast: false, }, + uaInfo: uaInfoServer, + deviceId: '', + sessionId: '', headers: { 'user-agent': 'OpenPanel Server/1.0', 'openpanel-sdk-name': 'server', 'openpanel-sdk-version': '1.0.0', 'request-id': '123', }, - projectId, - deviceId: '', - sessionId: '', - uaInfo: uaInfoServer, - }; + }); - vi.mocked(sessionBuffer.getExistingSession).mockResolvedValueOnce({ - id: 'last-session-456', - event_count: 0, - screen_view_count: 0, - entry_path: '/last-path', - entry_origin: 'https://example.com', - exit_path: '/last-path', - exit_origin: 'https://example.com', - created_at: formatClickhouseDate(timestamp), - ended_at: formatClickhouseDate(timestamp), - os: 'iOS', - os_version: '15.0', - browser: 'Safari', - browser_version: '15.0', - device: 'mobile', - brand: 'Apple', - model: 'iPhone', - country: 'CA', - region: 'ON', - city: 'Toronto', - longitude: 0, - latitude: 0, - duration: 0, - referrer: 'https://google.com', - referrer_name: 'Google', - referrer_type: 'search', - is_bounce: false, - utm_term: '', - utm_source: '', - utm_campaign: '', - utm_content: '', - utm_medium: '', - revenue: 0, - project_id: projectId, - device_id: 'last-device-123', - profile_id: 'profile-123', - sign: 1, - version: 1, - groups: [], - } satisfies IClickhouseSession); + vi.mocked(sessionBuffer.getExistingSession).mockResolvedValueOnce( + makeSession({ + id: 'last-session-456', + device_id: 'last-device-123', + profile_id: 'profile-123', + os: 'iOS', + os_version: '15.0', + browser: 'Safari', + browser_version: '15.0', + device: 'mobile', + brand: 'Apple', + model: 'iPhone', + country: 'CA', + region: 'ON', + city: 'Toronto', + entry_path: '/last-path', + entry_origin: 'https://example.com', + exit_path: '/last-path', + exit_origin: 'https://example.com', + referrer: 'https://google.com', + referrer_name: 'Google', + referrer_type: 'search', + is_bounce: false, + event_count: 0, + screen_view_count: 0, + }) + ); await incomingEvent(jobData); - expect((createEvent as Mock).mock.calls[0]![0]).toStrictEqual({ + expect(sessionBuffer.ingest).not.toHaveBeenCalled(); + expect(sessionsQueue.add).not.toHaveBeenCalled(); + expect((createEvent as Mock).mock.calls[0]![0]).toMatchObject({ name: 'server_event', deviceId: 'last-device-123', sessionId: 'last-session-456', profileId: 'profile-123', - projectId, - properties: { - custom_property: 'test_value', - __hash: undefined, - __query: undefined, - }, - createdAt: timestamp, - country: 'CA', city: 'Toronto', - region: 'ON', - longitude: 0, - latitude: 0, - os: 'iOS', - osVersion: '15.0', - browser: 'Safari', - browserVersion: '15.0', - device: 'mobile', - brand: 'Apple', - model: 'iPhone', - duration: 0, - path: '/last-path', - origin: 'https://example.com', + country: 'CA', referrer: 'https://google.com', - referrerName: 'Google', - referrerType: 'search', - sdkName: 'server', - sdkVersion: '1.0.0', - revenue: undefined, - groups: [], }); - - expect(sessionsQueue.add).not.toHaveBeenCalled(); }); - it('should handle server events (without existing screen view)', async () => { - const timestamp = new Date(); - const jobData: EventsQueuePayloadIncomingEvent['payload'] = { - geo, - event: { - name: 'server_event', - timestamp: timestamp.toISOString(), - properties: { custom_property: 'test_value' }, - profileId: 'profile-123', - isTimestampFromThePast: false, - }, - headers: { - 'user-agent': 'OpenPanel Server/1.0', - 'openpanel-sdk-name': 'server', - 'openpanel-sdk-version': '1.0.0', - 'request-id': '123', - }, - projectId, - deviceId: '', - sessionId: '', - uaInfo: uaInfoServer, - }; + it('handles server events without any active session', async () => { + vi.mocked(sessionBuffer.getExistingSession).mockResolvedValueOnce(null); - await incomingEvent(jobData); + await incomingEvent( + buildJobData({ + event: { + name: 'server_event', + timestamp: new Date().toISOString(), + properties: { custom_property: 'test_value' }, + profileId: 'profile-123', + isTimestampFromThePast: false, + }, + uaInfo: uaInfoServer, + deviceId: '', + sessionId: '', + headers: { + 'user-agent': 'OpenPanel Server/1.0', + 'openpanel-sdk-name': 'server', + 'openpanel-sdk-version': '1.0.0', + 'request-id': '123', + }, + }) + ); - expect((createEvent as Mock).mock.calls[0]![0]).toStrictEqual({ + expect(sessionBuffer.ingest).not.toHaveBeenCalled(); + expect(sessionsQueue.add).not.toHaveBeenCalled(); + expect((createEvent as Mock).mock.calls[0]![0]).toMatchObject({ name: 'server_event', deviceId: '', sessionId: '', profileId: 'profile-123', - projectId, - properties: { - custom_property: 'test_value', - __hash: undefined, - __query: undefined, - }, - createdAt: timestamp, - country: 'US', - city: 'New York', - region: 'NY', - revenue: undefined, - longitude: 0, - latitude: 0, - os: '', - osVersion: '', - browser: '', - browserVersion: '', - device: 'server', - brand: '', - model: '', - duration: 0, - path: '', - origin: '', - referrer: undefined, - referrerName: undefined, - referrerType: undefined, - sdkName: 'server', - sdkVersion: '1.0.0', - groups: [], }); - - expect(sessionsQueue.add).not.toHaveBeenCalled(); }); - it('should emit session_start only once when 3 events arrive in rapid succession', async () => { - // Regression test: previously the API baked `session: undefined` into every - // payload when no session-end job existed yet. Even with sequential - // per-device processing in the worker, the worker re-checks the BullMQ - // session-end job at processing time, so events 2 and 3 should extend - // rather than emit duplicate session_starts. - const spySessionsQueueAdd = vi - .spyOn(sessionsQueue, 'add') - .mockResolvedValue({} as Job); - const spySessionsQueueGetJob = vi.spyOn(sessionsQueue, 'getJob'); + it('emits session_start only once across 3 rapid events (new → extend → extend)', async () => { + const session = makeSession({ id: newSessionId }); + vi.mocked(sessionBuffer.ingest) + .mockResolvedValueOnce({ kind: 'new', current: session }) + .mockResolvedValueOnce({ kind: 'extend', current: session }) + .mockResolvedValueOnce({ kind: 'extend', current: session }); - const buildJobData = ( - eventName: string, - ): EventsQueuePayloadIncomingEvent['payload'] => ({ - geo, - event: { - name: eventName, - timestamp: new Date().toISOString(), - isTimestampFromThePast: false, - properties: { __path: 'https://example.com/test' }, - }, - uaInfo, - headers: { - 'request-id': '123', - 'user-agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'openpanel-sdk-name': 'web', - 'openpanel-sdk-version': '1.0.0', - }, - projectId, - deviceId, - sessionId: newSessionId, - }); - - // Event 1: no session-end job exists yet → emit session_start. - spySessionsQueueGetJob.mockResolvedValueOnce(undefined); - // Events 2 and 3: session-end job is now present (delayed) → extend only. - const liveJob = { - id: `sessionEnd:${projectId}:${deviceId}`, - getState: vi.fn().mockResolvedValue('delayed'), - changeDelay: vi.fn(), - data: { - type: 'createSessionEnd', - payload: { - sessionId: newSessionId, - deviceId, - referrer: '', - referrerName: '', - referrerType: '', - }, - }, - } as Partial as Job; - spySessionsQueueGetJob.mockResolvedValue(liveJob); - - (createEvent as Mock).mockImplementation((event) => event); - - await incomingEvent(buildJobData('event_a')); - await incomingEvent(buildJobData('event_b')); - await incomingEvent(buildJobData('event_c')); + await incomingEvent(buildJobData({ event: { name: 'e1', timestamp: new Date().toISOString(), isTimestampFromThePast: false, properties: { __path: 'https://example.com/test' } } })); + await incomingEvent(buildJobData({ event: { name: 'e2', timestamp: new Date().toISOString(), isTimestampFromThePast: false, properties: { __path: 'https://example.com/test' } } })); + await incomingEvent(buildJobData({ event: { name: 'e3', timestamp: new Date().toISOString(), isTimestampFromThePast: false, properties: { __path: 'https://example.com/test' } } })); const sessionStartCalls = (createEvent as Mock).mock.calls.filter( - ([arg]) => arg?.name === 'session_start', + ([a]) => a?.name === 'session_start' ); expect(sessionStartCalls).toHaveLength(1); - - // Only the first event should have queued a session-end job; subsequent - // events extend the existing one via changeDelay. - expect(spySessionsQueueAdd).toHaveBeenCalledTimes(1); + expect(sessionsQueue.add).not.toHaveBeenCalled(); }); }); diff --git a/apps/worker/src/metrics.ts b/apps/worker/src/metrics.ts index a197d9865..7506f428a 100644 --- a/apps/worker/src/metrics.ts +++ b/apps/worker/src/metrics.ts @@ -8,6 +8,7 @@ import { sessionBuffer, } from '@openpanel/db'; import { cronQueue, eventsGroupQueues, sessionsQueue } from '@openpanel/queue'; +import { getRedisCache } from '@openpanel/redis'; import client from 'prom-client'; const Registry = client.Registry; @@ -124,7 +125,7 @@ for (const buf of allBuffers) { // ignore — scrape continues } }, - }), + }) ); } @@ -141,7 +142,7 @@ register.registerMetric( // ignore } }, - }), + }) ); // ---- Flush metrics (populated via flushObserver hooks) ---- @@ -150,7 +151,9 @@ const flushDuration = new client.Histogram({ name: 'buffer_flush_duration_ms', help: 'Wall time of a tryFlush call, including lock acquisition', labelNames: ['buffer', 'result', 'trigger'], - buckets: [5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10_000, 30_000, 60_000], + buckets: [ + 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10_000, 30_000, 60_000, + ], }); register.registerMetric(flushDuration); @@ -181,7 +184,9 @@ const chInsertDurationMs = new client.Histogram({ name: 'buffer_ch_insert_duration_ms', help: 'Duration of the ClickHouse insert(s) inside a single flush', labelNames: ['buffer'], - buckets: [10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10_000, 30_000, 60_000], + buckets: [ + 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10_000, 30_000, 60_000, + ], }); register.registerMetric(chInsertDurationMs); @@ -193,7 +198,9 @@ const chFetchDurationMs = new client.Histogram({ name: 'buffer_ch_fetch_duration_ms', help: 'Duration of CH SELECT(s) inside a single flush (e.g. profile merge fetch)', labelNames: ['buffer'], - buckets: [10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10_000, 30_000, 60_000], + buckets: [ + 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10_000, 30_000, 60_000, + ], }); register.registerMetric(chFetchDurationMs); @@ -201,7 +208,10 @@ const flushLlenAtStart = new client.Histogram({ name: 'buffer_flush_llen_at_start', help: 'LLEN observed at the start of a flush attempt', labelNames: ['buffer'], - buckets: [0, 10, 100, 500, 1000, 5000, 10_000, 50_000, 100_000, 500_000, 1_000_000, 5_000_000], + buckets: [ + 0, 10, 100, 500, 1000, 5000, 10_000, 50_000, 100_000, 500_000, 1_000_000, + 5_000_000, + ], }); register.registerMetric(flushLlenAtStart); @@ -238,7 +248,7 @@ for (const buf of allBuffers) { }); flushDuration.observe( { buffer: obs.buffer, result: obs.result, trigger: obs.trigger }, - obs.totalMs, + obs.totalMs ); if (obs.llenAtStart != null) { @@ -252,26 +262,20 @@ for (const buf of allBuffers) { if (obs.phases?.lrangeMs != null) { redisOpDurationMs.observe( { buffer: obs.buffer, op: 'lrange' }, - obs.phases.lrangeMs, + obs.phases.lrangeMs ); } if (obs.phases?.trimMs != null) { redisOpDurationMs.observe( { buffer: obs.buffer, op: 'trim' }, - obs.phases.trimMs, + obs.phases.trimMs ); } if (obs.phases?.chFetchMs != null) { - chFetchDurationMs.observe( - { buffer: obs.buffer }, - obs.phases.chFetchMs, - ); + chFetchDurationMs.observe({ buffer: obs.buffer }, obs.phases.chFetchMs); } if (obs.phases?.chInsertMs != null) { - chInsertDurationMs.observe( - { buffer: obs.buffer }, - obs.phases.chInsertMs, - ); + chInsertDurationMs.observe({ buffer: obs.buffer }, obs.phases.chInsertMs); } }; @@ -288,3 +292,152 @@ for (const buf of allBuffers) { addDurationMs.observe({ buffer: obs.buffer }, obs.durationMs); }; } + +register.registerMetric( + new client.Gauge({ + name: `buffer_${replayBuffer.name}_count`, + help: 'Number of unprocessed replay chunks', + async collect() { + const metric = await replayBuffer.getBufferSize(); + this.set(metric); + }, + }) +); + +// ----------------------------------------------------------------------- +// Session lifecycle metrics (new session-buffer + reaper world) +// ----------------------------------------------------------------------- + +// Counters incremented at runtime by the session lifecycle code paths. + +export const sessionsStarted = new client.Counter({ + name: 'sessions_started_total', + help: 'session_start events emitted, by lifecycle kind', + labelNames: ['kind'], // 'new' | 'boundary' +}); +register.registerMetric(sessionsStarted); + +export const sessionEndsEnqueued = new client.Counter({ + name: 'session_ends_enqueued_total', + help: 'session_end jobs pushed onto the sessions queue, by trigger source', + labelNames: ['source'], // 'boundary' | 'reaper' +}); +register.registerMetric(sessionEndsEnqueued); + +export const sessionEndsEmitted = new client.Counter({ + name: 'session_ends_emitted_total', + help: 'session_end events actually written (post-idempotency claim)', +}); +register.registerMetric(sessionEndsEmitted); + +export const sessionEndsSkipped = new client.Counter({ + name: 'session_ends_skipped_total', + help: 'session_end jobs that ran but did not emit, by reason', + labelNames: ['reason'], // 'not_found' | 'already_emitted' +}); +register.registerMetric(sessionEndsSkipped); + +export const sessionsReaped = new client.Counter({ + name: 'sessions_reaped_total', + help: 'Sessions closed by the reaper, by trigger condition', + labelNames: ['reason'], // 'event-time' | 'deadman' +}); +register.registerMetric(sessionsReaped); + +export const sessionDurationOnClose = new client.Histogram({ + name: 'session_duration_ms_on_close', + help: 'Duration of closed sessions (ms)', + buckets: [ + 1000, // 1s + 10_000, // 10s + 60_000, // 1m + 5 * 60_000, // 5m + 15 * 60_000, // 15m + 30 * 60_000, // 30m + 60 * 60_000, // 1h + 24 * 60 * 60_000, // 24h + ], +}); +register.registerMetric(sessionDurationOnClose); + +export const sessionEventsOnClose = new client.Histogram({ + name: 'session_events_on_close', + help: 'Total events (event_count + screen_view_count) on session close', + buckets: [1, 2, 5, 10, 25, 50, 100, 250, 500, 1000], +}); +register.registerMetric(sessionEventsOnClose); + +// Gauges polled at scrape time. The active-sessions gauge does one ZCARD +// per project — cheap individually, fine for hundreds of projects. + +register.registerMetric( + new client.Gauge({ + name: 'sessions_active_total', + help: 'Active sessions in Redis across all projects', + async collect() { + const redis = getRedisCache(); + const projectIds = await redis.smembers('session:projects'); + if (projectIds.length === 0) { + this.set(0); + return; + } + const multi = redis.multi(); + for (const pid of projectIds) { + multi.zcard(`session:active:${pid}`); + } + const results = await multi.exec(); + let total = 0; + for (const entry of results ?? []) { + const count = Number(entry?.[1] ?? 0); + if (Number.isFinite(count)) { + total += count; + } + } + this.set(total); + }, + }) +); + +register.registerMetric( + new client.Gauge({ + name: 'sessions_projects_active', + help: 'Projects with at least one active session', + async collect() { + const redis = getRedisCache(); + this.set(await redis.scard('session:projects')); + }, + }) +); + +register.registerMetric( + new client.Gauge({ + name: 'sessions_hwm_lag_ms', + help: 'Max lag (ms) between wall-clock now and project event-time HWM, across all projects. Big number → queue lag or imports.', + async collect() { + const redis = getRedisCache(); + const projectIds = await redis.smembers('session:projects'); + if (projectIds.length === 0) { + this.set(0); + return; + } + const multi = redis.multi(); + for (const pid of projectIds) { + multi.get(`session:hwm:${pid}`); + } + const results = await multi.exec(); + const now = Date.now(); + let maxLag = 0; + for (const entry of results ?? []) { + const hwm = Number(entry?.[1] ?? 0); + if (!Number.isFinite(hwm) || hwm <= 0) { + continue; + } + const lag = now - hwm; + if (lag > maxLag) { + maxLag = lag; + } + } + this.set(maxLag); + }, + }) +); diff --git a/apps/worker/src/utils/session-handler.ts b/apps/worker/src/utils/session-handler.ts index 0ca4c11ed..aea312709 100644 --- a/apps/worker/src/utils/session-handler.ts +++ b/apps/worker/src/utils/session-handler.ts @@ -1,98 +1,59 @@ -import type { IServiceCreateEventPayload } from '@openpanel/db'; -import { - type EventsQueuePayloadCreateSessionEnd, - sessionsQueue, -} from '@openpanel/queue'; -import { LRUCache } from '@openpanel/redis'; -import type { Job } from 'bullmq'; -import { logger as baseLogger } from '@/utils/logger'; +import type { + IClickhouseSession, + IServiceCreateEventPayload, +} from '@openpanel/db'; +import { SESSION_TIMEOUT_MS } from '@openpanel/db'; +import { sessionsQueue } from '@openpanel/queue'; -export const SESSION_TIMEOUT = 1000 * 60 * 30; +export { SESSION_TIMEOUT_MS }; +export const SESSION_TIMEOUT = SESSION_TIMEOUT_MS; -const CHANGE_DELAY_THROTTLE_MS = process.env.CHANGE_DELAY_THROTTLE_MS - ? Number.parseInt(process.env.CHANGE_DELAY_THROTTLE_MS, 10) - : 60_000; // 1 minute - -const CHANGE_DELAY_THROTTLE_MAP = new LRUCache({ - max: 100_000, - ttl: SESSION_TIMEOUT, -}); - -export type SessionEndJob = Job; - -// Treats any existing session-end job as "session in progress" regardless of -// state — avoids duplicate session_starts when the close job is mid-flight. -export async function getActiveSessionEndJob( +/** + * Deterministic v2 jobId for a closed session. + * + * Includes the session's stable `id` so concurrent / retried closes for the + * same logical session dedupe in BullMQ. The `v2:` prefix lets the legacy + * drain script differentiate from pre-migration delayed jobs. + */ +export const getSessionEndJobIdV2 = ( projectId: string, deviceId: string, -): Promise { - const jobId = getSessionEndJobId(projectId, deviceId); - const job = await sessionsQueue.getJob(jobId); - return (job as SessionEndJob | undefined) ?? null; -} - -export async function extendSessionEndJob({ - projectId, - deviceId, - job, -}: { - projectId: string; - deviceId: string; - job?: SessionEndJob | null; -}) { - const last = CHANGE_DELAY_THROTTLE_MAP.get(`${projectId}:${deviceId}`) ?? 0; - const isThrottled = Date.now() - last < CHANGE_DELAY_THROTTLE_MS; - - if (isThrottled) { - return; - } - - const resolvedJob = - job ?? (await getActiveSessionEndJob(projectId, deviceId)); + sessionInternalId: string +) => `sessionEnd:v2:${projectId}:${deviceId}:${sessionInternalId}`; - if (!resolvedJob) { - return; - } - - const state = await resolvedJob.getState(); - if (state !== 'delayed') { - baseLogger.warn( - { jobId: resolvedJob.id, state }, - 'Session end job is not in delayed state, skipping extend', - ); - return; - } - - // Narrow TOCTOU window: state could flip from delayed → waiting between - // the getState() check above and this call. - try { - await resolvedJob.changeDelay(SESSION_TIMEOUT); - CHANGE_DELAY_THROTTLE_MAP.set(`${projectId}:${deviceId}`, Date.now()); - } catch (error) { - baseLogger.warn( - { err: error, jobId: resolvedJob.id }, - 'Session end job moved out of delayed state during extend', - ); - } -} - -const getSessionEndJobId = (projectId: string, deviceId: string) => - `sessionEnd:${projectId}:${deviceId}`; - -export function createSessionEndJob({ +/** + * Enqueue a session_end job. Idempotent via jobId. + * + * Called when a boundary is detected during ingest (old session must close) + * or by the reaper when a session has been idle past the timeout. + */ +export async function enqueueSessionEndV2({ payload, + closedSession, }: { payload: IServiceCreateEventPayload; + closedSession: IClickhouseSession; }) { + const jobId = getSessionEndJobIdV2( + closedSession.project_id, + closedSession.device_id, + closedSession.id + ); + return sessionsQueue.add( 'session', { type: 'createSessionEnd', - payload, + payload: { + ...payload, + projectId: closedSession.project_id, + deviceId: closedSession.device_id, + sessionId: closedSession.id, + profileId: closedSession.profile_id || payload.profileId, + }, }, { - delay: SESSION_TIMEOUT, - jobId: getSessionEndJobId(payload.projectId, payload.deviceId), + jobId, attempts: 3, backoff: { type: 'exponential', diff --git a/packages/db/scripts/drain-old-session-jobs.ts b/packages/db/scripts/drain-old-session-jobs.ts new file mode 100644 index 000000000..3fd539170 --- /dev/null +++ b/packages/db/scripts/drain-old-session-jobs.ts @@ -0,0 +1,79 @@ +// One-off cleanup: drain legacy `sessionEnd:{projectId}:{deviceId}` delayed +// jobs from the `sessions` queue. Run AFTER: +// +// 1. The new code is deployed (worker emits reaper-enqueued jobs with the +// `sessionEnd:v2:*` jobId prefix). +// 2. The `events.incoming-event` queue has been resumed and you've verified +// session metrics look healthy under the new flow. +// +// This removes only jobs whose jobId is `sessionEnd:*` and NOT `sessionEnd:v2:*`, +// leaving any reaper-enqueued v2 jobs intact. After this script runs you can +// safely resume the `sessions` queue. +// +// Idempotent: re-running on an already-drained queue is a no-op. +// +// Usage (from packages/db): +// pnpm with-env tsx scripts/drain-old-session-jobs.ts +// pnpm with-env tsx scripts/drain-old-session-jobs.ts --dry-run + +import { sessionsQueue } from '@openpanel/queue'; + +async function main() { + const dryRun = process.argv.includes('--dry-run'); + + console.log( + `[drain-old-session-jobs] starting${dryRun ? ' (DRY RUN — no removes)' : ''}` + ); + + const jobs = await sessionsQueue.getJobs(['delayed', 'waiting']); + + const legacy: typeof jobs = []; + const v2: typeof jobs = []; + + for (const job of jobs) { + if (!job.id) continue; + if (job.id.startsWith('sessionEnd:v2:')) { + v2.push(job); + } else if (job.id.startsWith('sessionEnd:')) { + legacy.push(job); + } + } + + console.log( + `[drain-old-session-jobs] found ${jobs.length} jobs total ` + + `(${legacy.length} legacy, ${v2.length} v2, will keep v2)` + ); + + if (dryRun) { + console.log('[drain-old-session-jobs] dry run — exiting'); + process.exit(0); + } + + let removed = 0; + let failed = 0; + + for (const job of legacy) { + try { + await job.remove(); + removed++; + if (removed % 1000 === 0) { + console.log(`[drain-old-session-jobs] removed ${removed}/${legacy.length}`); + } + } catch (error) { + failed++; + console.error('[drain-old-session-jobs] failed to remove job', job.id, error); + } + } + + console.log('[drain-old-session-jobs] done', { + removed, + failed, + kept_v2: v2.length, + }); + process.exit(0); +} + +main().catch((error) => { + console.error('[drain-old-session-jobs] fatal error:', error); + process.exit(1); +}); diff --git a/packages/db/scripts/migrate-sessions.ts b/packages/db/scripts/migrate-sessions.ts new file mode 100644 index 000000000..64fadbb70 --- /dev/null +++ b/packages/db/scripts/migrate-sessions.ts @@ -0,0 +1,155 @@ +// One-off migration: convert legacy `session:{sessionId}` Redis blobs + +// delayed BullMQ `sessionEnd:{projectId}:{deviceId}` jobs into the new +// session-buffer key scheme: +// +// session:{projectId}:{deviceId} -> full session JSON blob +// session:profile:{projectId}:{profileId} -> deviceId pointer +// session:active:{projectId} -> ZSET scored by last event_time (ms) +// session:wallclock:{projectId} -> ZSET scored by wall-clock (ms) +// session:hwm:{projectId} -> monotonic event-time high-water mark +// session:projects -> SET of project_ids with active sessions +// +// Source of truth: the delayed `sessionsQueue` jobs — one per (projectId, +// deviceId) pair currently considered "active" by the legacy system. For +// each, we read the corresponding `session:{sessionId}` blob (still alive +// under its 1h TTL) and rewrite it into the new structure. +// +// Idempotent: re-running the script is safe. Existing new-scheme keys are +// preserved via SET NX semantics inside the multi pipeline. +// +// Usage (from packages/db, env loaded from repo root .env): +// pnpm with-env tsx scripts/migrate-sessions.ts +// pnpm with-env tsx scripts/migrate-sessions.ts --dry-run +// +// Runbook step 2: pause `events.incoming-event` AND `sessions` queues first, +// wait for in-flight jobs to drain (~30s), THEN run this script. Resume +// `events.incoming-event` after deploy; keep `sessions` paused until step 7 +// (drain-old-session-jobs.ts) finishes. + +import { getSafeJson } from '@openpanel/json'; +import { + type EventsQueuePayloadCreateSessionEnd, + sessionsQueue, +} from '@openpanel/queue'; +import { getRedisCache } from '@openpanel/redis'; +import type { IClickhouseSession } from '../src/services/session.service'; + +const SESSION_BLOB_TTL_SECONDS = 60 * 60; +const SESSION_WALLCLOCK_TTL_SECONDS = 60 * 60 * 25; + +const MONOTONIC_HWM_LUA = ` +local cur = redis.call('GET', KEYS[1]) +if not cur or tonumber(ARGV[1]) > tonumber(cur) then + redis.call('SET', KEYS[1], ARGV[1]) + return 1 +end +return 0 +`; + +const sessionKey = (projectId: string, deviceId: string) => + `session:${projectId}:${deviceId}`; +const profileIndexKey = (projectId: string, profileId: string) => + `session:profile:${projectId}:${profileId}`; +const activeSetKey = (projectId: string) => `session:active:${projectId}`; +const wallclockSetKey = (projectId: string) => + `session:wallclock:${projectId}`; +const hwmKey = (projectId: string) => `session:hwm:${projectId}`; +const PROJECTS_SET_KEY = 'session:projects'; + +async function main() { + const dryRun = process.argv.includes('--dry-run'); + const redis = getRedisCache(); + + console.log( + `[migrate-sessions] starting${dryRun ? ' (DRY RUN — no writes)' : ''}` + ); + + // Pull every delayed + waiting `sessionEnd` job from the legacy queue. + // These represent all currently-active sessions in the old system. + const jobs = await sessionsQueue.getJobs(['delayed', 'waiting']); + console.log(`[migrate-sessions] found ${jobs.length} legacy session jobs`); + + const counts = { + migrated: 0, + blobMissing: 0, + alreadyMigrated: 0, + parseFailed: 0, + }; + + for (const job of jobs) { + const data = job.data as EventsQueuePayloadCreateSessionEnd | undefined; + const payload = data?.payload; + if (!payload?.projectId || !payload.deviceId || !payload.sessionId) { + counts.parseFailed++; + continue; + } + const { projectId, deviceId, sessionId } = payload; + + // Skip if already migrated (re-runnable). + const existsAlready = await redis.exists(sessionKey(projectId, deviceId)); + if (existsAlready) { + counts.alreadyMigrated++; + continue; + } + + // Legacy: blob lived at session:{sessionId}. + const blob = await redis.get(`session:${sessionId}`); + if (!blob) { + counts.blobMissing++; + continue; + } + + const session = getSafeJson(blob); + if (!session) { + counts.parseFailed++; + continue; + } + + const lastEventMs = new Date(session.ended_at).getTime(); + const nowMs = Date.now(); + + if (dryRun) { + counts.migrated++; + continue; + } + + const multi = redis.multi(); + multi.set( + sessionKey(projectId, deviceId), + blob, + 'EX', + SESSION_BLOB_TTL_SECONDS + ); + multi.zadd(activeSetKey(projectId), lastEventMs.toString(), deviceId); + multi.zadd(wallclockSetKey(projectId), nowMs.toString(), deviceId); + multi.expire(wallclockSetKey(projectId), SESSION_WALLCLOCK_TTL_SECONDS); + multi.sadd(PROJECTS_SET_KEY, projectId); + multi.eval(MONOTONIC_HWM_LUA, 1, hwmKey(projectId), lastEventMs.toString()); + if (session.profile_id && session.profile_id !== deviceId) { + multi.set( + profileIndexKey(projectId, session.profile_id), + deviceId, + 'EX', + SESSION_BLOB_TTL_SECONDS + ); + } + await multi.exec(); + + counts.migrated++; + + if (counts.migrated % 1000 === 0) { + console.log( + `[migrate-sessions] progress: ${counts.migrated} migrated` + ); + } + } + + console.log('[migrate-sessions] done', counts); + await redis.quit(); + process.exit(0); +} + +main().catch((error) => { + console.error('[migrate-sessions] fatal error:', error); + process.exit(1); +}); diff --git a/packages/db/src/buffers/index.ts b/packages/db/src/buffers/index.ts index 86741b540..5f33fe8d5 100644 --- a/packages/db/src/buffers/index.ts +++ b/packages/db/src/buffers/index.ts @@ -16,3 +16,8 @@ export const groupBuffer = new GroupBuffer(); export type { ProfileBackfillEntry } from './profile-backfill-buffer'; export type { IClickhouseSessionReplayChunk } from './replay-buffer'; +export { + SESSION_TIMEOUT_MS, + SESSION_BLOB_TTL_SECONDS, + type SessionIngestResult, +} from './session-buffer'; diff --git a/packages/db/src/buffers/session-buffer.test.ts b/packages/db/src/buffers/session-buffer.test.ts index f27f0170d..215912896 100644 --- a/packages/db/src/buffers/session-buffer.test.ts +++ b/packages/db/src/buffers/session-buffer.test.ts @@ -11,38 +11,59 @@ vi.mock('../clickhouse/client', () => ({ }, })); +import type { IServiceCreateEventPayload } from '../services/event.service'; import { SessionBuffer } from './session-buffer'; -import type { IClickhouseEvent } from '../services/event.service'; const redis = getRedisCache(); -function makeEvent(overrides: Partial): IClickhouseEvent { +const projectId = 'project-1'; +const deviceId = 'device-1'; +const sessionId = 'session-1'; + +function makePayload( + overrides: Partial = {} +): IServiceCreateEventPayload { return { - id: 'event-1', - project_id: 'project-1', - profile_id: 'profile-1', - device_id: 'device-1', - session_id: 'session-1', name: 'screen_view', + projectId, + deviceId, + sessionId, + profileId: 'profile-1', + properties: {}, + groups: [], + createdAt: new Date(), + duration: 0, + sdkName: 'web', + sdkVersion: '1.0.0', + city: '', + country: '', + region: '', + longitude: 0, + latitude: 0, path: '/home', origin: '', referrer: '', - referrer_name: '', - referrer_type: '', - duration: 0, - properties: {}, - created_at: new Date().toISOString(), - groups: [], + referrerName: '', + referrerType: '', + os: '', + osVersion: '', + browser: '', + browserVersion: '', + device: '', + brand: '', + model: '', ...overrides, - } as IClickhouseEvent; + }; } beforeEach(async () => { const keys = [ - ...await redis.keys('session*'), - ...await redis.keys('lock:session'), + ...(await redis.keys('session*')), + ...(await redis.keys('lock:session')), ]; - if (keys.length > 0) await redis.del(...keys); + if (keys.length > 0) { + await redis.del(...keys); + } vi.mocked(ch.insert).mockResolvedValue(undefined as any); }); @@ -59,37 +80,171 @@ describe('SessionBuffer', () => { sessionBuffer = new SessionBuffer(); }); - it('adds a new session to the buffer', async () => { - const sizeBefore = await sessionBuffer.getBufferSize(); - await sessionBuffer.add(makeEvent({})); - const sizeAfter = await sessionBuffer.getBufferSize(); + it('opens a new session and stores it at session:{pid}:{did}', async () => { + const result = await sessionBuffer.ingest(makePayload()); + + expect(result?.kind).toBe('new'); + + const blob = await redis.get(`session:${projectId}:${deviceId}`); + expect(blob).not.toBeNull(); + const parsed = JSON.parse(blob ?? ''); + expect(parsed.id).toBe(sessionId); + expect(parsed.project_id).toBe(projectId); + expect(parsed.device_id).toBe(deviceId); + }); + + it('registers the device in active+wallclock sorted sets and the projects set', async () => { + await sessionBuffer.ingest(makePayload()); - expect(sizeAfter).toBe(sizeBefore + 1); + expect( + await redis.zscore(`session:active:${projectId}`, deviceId) + ).not.toBeNull(); + expect( + await redis.zscore(`session:wallclock:${projectId}`, deviceId) + ).not.toBeNull(); + expect(await redis.sismember('session:projects', projectId)).toBe(1); + }); + + it('advances session:hwm:{pid} monotonically — never regresses', async () => { + const t0 = new Date('2026-01-01T00:00:00.000Z'); + await sessionBuffer.ingest(makePayload({ createdAt: t0 })); + expect(Number(await redis.get(`session:hwm:${projectId}`))).toBe( + t0.getTime() + ); + + // A later event in event-time advances HWM. + await sessionBuffer.ingest( + makePayload({ + createdAt: new Date(t0.getTime() + 5000), + deviceId: 'device-2', + }) + ); + expect(Number(await redis.get(`session:hwm:${projectId}`))).toBe( + t0.getTime() + 5000 + ); + + // An older event does NOT regress HWM. + await sessionBuffer.ingest( + makePayload({ + createdAt: new Date(t0.getTime() - 5000), + deviceId: 'device-3', + }) + ); + expect(Number(await redis.get(`session:hwm:${projectId}`))).toBe( + t0.getTime() + 5000 + ); + }); + + it('writes profile→device pointer when profile_id differs from device_id', async () => { + await sessionBuffer.ingest( + makePayload({ profileId: 'profile-X', deviceId: 'device-Y' }) + ); + expect(await redis.get(`session:profile:${projectId}:profile-X`)).toBe( + 'device-Y' + ); }); it('skips session_start and session_end events', async () => { const sizeBefore = await sessionBuffer.getBufferSize(); - await sessionBuffer.add(makeEvent({ name: 'session_start' })); - await sessionBuffer.add(makeEvent({ name: 'session_end' })); - const sizeAfter = await sessionBuffer.getBufferSize(); + expect( + await sessionBuffer.ingest(makePayload({ name: 'session_start' })) + ).toBeNull(); + expect( + await sessionBuffer.ingest(makePayload({ name: 'session_end' })) + ).toBeNull(); + expect(await sessionBuffer.getBufferSize()).toBe(sizeBefore); + }); + + it('emits sign=-1 + sign=+1 CH rows when extending within timeout', async () => { + const t0 = new Date(); + await sessionBuffer.ingest(makePayload({ createdAt: t0 })); + const sizeBefore = await sessionBuffer.getBufferSize(); + + const result = await sessionBuffer.ingest( + makePayload({ createdAt: new Date(t0.getTime() + 5000) }) + ); - expect(sizeAfter).toBe(sizeBefore); + expect(result?.kind).toBe('extend'); + expect(await sessionBuffer.getBufferSize()).toBe(sizeBefore + 2); }); - it('updates existing session on subsequent events', async () => { - const t0 = Date.now(); - await sessionBuffer.add(makeEvent({ created_at: new Date(t0).toISOString() })); + it('returns kind=boundary when gap > 30min, with closed + current populated', async () => { + const t0 = new Date(); + const first = await sessionBuffer.ingest(makePayload({ createdAt: t0 })); + expect(first?.kind).toBe('new'); - // Second event updates the same session — emits old (sign=-1) + new (sign=1) - const sizeBefore = await sessionBuffer.getBufferSize(); - await sessionBuffer.add(makeEvent({ created_at: new Date(t0 + 5000).toISOString() })); - const sizeAfter = await sessionBuffer.getBufferSize(); + const second = await sessionBuffer.ingest( + makePayload({ + createdAt: new Date(t0.getTime() + 35 * 60 * 1000), + sessionId: 'session-2', + }) + ); + + expect(second?.kind).toBe('boundary'); + if (second?.kind === 'boundary') { + expect(second.closed.id).toBe(sessionId); + expect(second.current.id).toBe('session-2'); + } + }); + + it('inherits utm_* fields from event.properties.__query', async () => { + await sessionBuffer.ingest( + makePayload({ + properties: { + __query: { + utm_medium: 'cpc', + utm_source: 'google', + utm_campaign: 'spring', + }, + }, + }) + ); + const blob = JSON.parse( + (await redis.get(`session:${projectId}:${deviceId}`)) ?? '' + ); + expect(blob.utm_medium).toBe('cpc'); + expect(blob.utm_source).toBe('google'); + expect(blob.utm_campaign).toBe('spring'); + }); + + it('out-of-order events: older event extends backwards, never produces negative duration', async () => { + const t0 = new Date(); + await sessionBuffer.ingest(makePayload({ createdAt: t0 })); + // Late-arriving event with an older timestamp. + await sessionBuffer.ingest( + makePayload({ createdAt: new Date(t0.getTime() - 30_000) }) + ); + const blob = JSON.parse( + (await redis.get(`session:${projectId}:${deviceId}`)) ?? '' + ); + expect(blob.duration).toBeGreaterThanOrEqual(0); + }); + + it('cleanup() removes blob, sorted set entries, and profile pointer', async () => { + await sessionBuffer.ingest( + makePayload({ profileId: 'profile-X', deviceId: 'device-Y' }) + ); + + await sessionBuffer.cleanup({ + projectId, + deviceId: 'device-Y', + profileId: 'profile-X', + }); - expect(sizeAfter).toBe(sizeBefore + 2); + expect(await redis.get(`session:${projectId}:device-Y`)).toBeNull(); + expect( + await redis.zscore(`session:active:${projectId}`, 'device-Y') + ).toBeNull(); + expect( + await redis.zscore(`session:wallclock:${projectId}`, 'device-Y') + ).toBeNull(); + expect( + await redis.get(`session:profile:${projectId}:profile-X`) + ).toBeNull(); }); it('processes buffer and inserts sessions into ClickHouse', async () => { - await sessionBuffer.add(makeEvent({})); + await sessionBuffer.ingest(makePayload()); const insertSpy = vi .spyOn(ch, 'insert') @@ -107,15 +262,15 @@ describe('SessionBuffer', () => { it('squash does not orphan the creation row when create + updates share a batch', async () => { // Regression: a brand-new session whose creation event and subsequent - // updates all land in one flush window. getSession emits (+1,v1) for the + // updates all land in one flush window. ingest emits (+1,v1) for the // creation then (-1,vN)/(+1,vN+1) pairs for each update. The squash must // net per version so nothing is left un-collapsible: a (-1,V) row only // collapses against a (+1,V) of the SAME version in CH. const t0 = Date.now(); const EVENTS = 8; for (let i = 0; i < EVENTS; i++) { - await sessionBuffer.add( - makeEvent({ created_at: new Date(t0 + i * 1000).toISOString() }), + await sessionBuffer.ingest( + makePayload({ createdAt: new Date(t0 + i * 1000) }) ); } @@ -148,7 +303,7 @@ describe('SessionBuffer', () => { }); it('retains sessions in queue when ClickHouse insert fails', async () => { - await sessionBuffer.add(makeEvent({})); + await sessionBuffer.ingest(makePayload()); const insertSpy = vi .spyOn(ch, 'insert') @@ -158,7 +313,7 @@ describe('SessionBuffer', () => { // counter). processBuffer no longer swallows — we still verify the // safety property: the queue is preserved. await expect(sessionBuffer.processBuffer()).rejects.toThrow( - 'ClickHouse unavailable', + 'ClickHouse unavailable' ); expect(await sessionBuffer.getBufferSize()).toBe(1); diff --git a/packages/db/src/buffers/session-buffer.ts b/packages/db/src/buffers/session-buffer.ts index 01f745c0c..c6652a32d 100644 --- a/packages/db/src/buffers/session-buffer.ts +++ b/packages/db/src/buffers/session-buffer.ts @@ -1,11 +1,62 @@ +import { DateTime } from '@openpanel/common'; import { getSafeJson } from '@openpanel/json'; import { getRedisCache, type Redis } from '@openpanel/redis'; -import { assocPath, clone } from 'ramda'; import { ch, TABLE_NAMES } from '../clickhouse/client'; -import type { IClickhouseEvent } from '../services/event.service'; +import type { IServiceCreateEventPayload } from '../services/event.service'; import type { IClickhouseSession } from '../services/session.service'; import { BaseBuffer } from './base-buffer'; +export const SESSION_TIMEOUT_MS = 1000 * 60 * 30; // 30 minutes +export const SESSION_BLOB_TTL_SECONDS = 60 * 60; // 1 hour +export const SESSION_WALLCLOCK_TTL_SECONDS = 60 * 60 * 25; // 25h, > 24h deadman + +const sessionKey = (projectId: string, deviceId: string) => + `session:${projectId}:${deviceId}`; +const profileIndexKey = (projectId: string, profileId: string) => + `session:profile:${projectId}:${profileId}`; +const activeSetKey = (projectId: string) => `session:active:${projectId}`; +const wallclockSetKey = (projectId: string) => + `session:wallclock:${projectId}`; +const hwmKey = (projectId: string) => `session:hwm:${projectId}`; +const PROJECTS_SET_KEY = 'session:projects'; + +// Monotonic-set Lua: only update if new value is greater than current. +const MONOTONIC_HWM_LUA = ` +local cur = redis.call('GET', KEYS[1]) +if not cur or tonumber(ARGV[1]) > tonumber(cur) then + redis.call('SET', KEYS[1], ARGV[1]) + return 1 +end +return 0 +`; + +export type SessionIngestResult = + | { kind: 'new'; current: IClickhouseSession } + | { kind: 'extend'; current: IClickhouseSession } + | { kind: 'boundary'; current: IClickhouseSession; closed: IClickhouseSession }; + +function toClickhouseDate(date: Date): string { + return DateTime.fromJSDate(date) + .setZone('UTC') + .toFormat('yyyy-MM-dd HH:mm:ss.SSS'); +} + +function fromClickhouseDate(s: string): Date { + return DateTime.fromFormat(s, 'yyyy-MM-dd HH:mm:ss.SSS', { + zone: 'UTC', + }).toJSDate(); +} + +function pickUtm( + payload: IServiceCreateEventPayload, + key: 'utm_medium' | 'utm_source' | 'utm_campaign' | 'utm_content' | 'utm_term' +): string { + const query = (payload.properties as { __query?: Record } | undefined) + ?.__query; + const v = query?.[key]; + return v ? String(v) : ''; +} + export class SessionBuffer extends BaseBuffer { private batchSize = process.env.SESSION_BUFFER_BATCH_SIZE ? Number.parseInt(process.env.SESSION_BUFFER_BATCH_SIZE, 10) @@ -30,226 +81,275 @@ export class SessionBuffer extends BaseBuffer { this.redis = getRedisCache(); } + /** + * Look up the active session for a device, either directly by deviceId or + * via the profile index (`session:profile:{pid}:{profileId}` → deviceId). + */ public async getExistingSession( options: - | { - sessionId: string; - } - | { - projectId: string; - profileId: string; - } - ) { - let hit: string | null = null; - if ('sessionId' in options) { - hit = await this.redis.get(`session:${options.sessionId}`); - } else { - const value = await this.redis.get( - `session:${options.projectId}:${options.profileId}` + | { projectId: string; deviceId: string } + | { projectId: string; profileId: string } + ): Promise { + if ('deviceId' in options) { + const hit = await this.redis.get( + sessionKey(options.projectId, options.deviceId) ); - if (!value) { - return null; - } - - // Backward compat: old keys stored full JSON, new keys store just the sessionId - if (value.startsWith('{')) { - return getSafeJson(value); - } - - hit = await this.redis.get(`session:${value}`); + return hit ? getSafeJson(hit) : null; } + const deviceId = await this.redis.get( + profileIndexKey(options.projectId, options.profileId) + ); + if (!deviceId) return null; + const hit = await this.redis.get(sessionKey(options.projectId, deviceId)); + return hit ? getSafeJson(hit) : null; + } - if (hit) { - return getSafeJson(hit); + /** + * Ingest one event into the session lifecycle. + * + * Reads the device's current session (if any), decides whether the event + * extends it, opens a brand-new one, or boundaries (gap > 30min) → close + * the old session and start a fresh one. Writes the updated session blob, + * advances HWM, updates sorted sets, and queues CollapsingMergeTree rows + * for ClickHouse. + * + * Returns the action taken so the caller can drive session_start / + * session_end event emission. `session_start` / `session_end` events + * themselves are skipped — they are derived signals, not session activity. + */ + async ingest( + payload: IServiceCreateEventPayload + ): Promise { + if (!payload.projectId || !payload.deviceId) return null; + if (payload.name === 'session_start' || payload.name === 'session_end') { + return null; } - return null; - } - - async getSession( - event: IClickhouseEvent - ): Promise<[IClickhouseSession] | [IClickhouseSession, IClickhouseSession]> { - const existingSession = await this.getExistingSession({ - sessionId: event.session_id, - }); + try { + const existing = await this.getExistingSession({ + projectId: payload.projectId, + deviceId: payload.deviceId, + }); - if (existingSession) { - const oldSession = assocPath(['sign'], -1, clone(existingSession)); - const newSession = assocPath(['sign'], 1, clone(existingSession)); + const eventTimeMs = payload.createdAt.getTime(); + const isBoundary = + existing && + eventTimeMs - fromClickhouseDate(existing.ended_at).getTime() > + SESSION_TIMEOUT_MS; - newSession.version = existingSession.version + 1; + if (existing && !isBoundary) { + const { current, chRows } = this.extendSession(existing, payload); + await this.persist(current, chRows); + return { kind: 'extend', current }; + } - // Events can arrive out of order (client-side batching, retries, offline - // queueing). Treat the session window as [min(event ts), max(event ts)] - // so duration stays non-negative and entry/exit reflect actual order. - const eventTime = new Date(event.created_at).getTime(); - const startTime = new Date(newSession.created_at).getTime(); - const endTime = new Date(newSession.ended_at).getTime(); + const current = this.newSession(payload); + await this.persist(current, [current]); - if (eventTime >= endTime) { - newSession.ended_at = event.created_at; - if (event.path) { - newSession.exit_path = event.path; - } - if (event.origin) { - newSession.exit_origin = event.origin; - } + if (existing && isBoundary) { + return { kind: 'boundary', current, closed: existing }; } + return { kind: 'new', current }; + } catch (error) { + this.logger.error({ err: error }, 'Failed to ingest session'); + return null; + } + } - if (eventTime < startTime) { - newSession.created_at = event.created_at; - if (event.path) { - newSession.entry_path = event.path; - } - if (event.origin) { - newSession.entry_origin = event.origin; - } - } else { - if (!newSession.entry_path && event.path) { - newSession.entry_path = event.path; - } - if (!newSession.entry_origin && event.origin) { - newSession.entry_origin = event.origin; - } - } + /** + * Remove a closed session from Redis. Called by `createSessionEnd` after + * the session_end event has been emitted, or by the reaper when it + * sweeps stale sessions. + */ + async cleanup({ + projectId, + deviceId, + profileId, + }: { + projectId: string; + deviceId: string; + profileId?: string | null; + }): Promise { + const multi = this.redis.multi(); + multi.del(sessionKey(projectId, deviceId)); + multi.zrem(activeSetKey(projectId), deviceId); + multi.zrem(wallclockSetKey(projectId), deviceId); + if (profileId && profileId !== deviceId) { + multi.del(profileIndexKey(projectId, profileId)); + } + await multi.exec(); + } - newSession.duration = - new Date(newSession.ended_at).getTime() - - new Date(newSession.created_at).getTime(); + /** + * Extend an in-flight session with a new event. Returns the updated + * session AND the pair of CollapsingMergeTree rows that ClickHouse needs + * to replace the previous row (sign=-1 cancels, sign=+1 inserts). + */ + private extendSession( + existing: IClickhouseSession, + payload: IServiceCreateEventPayload + ): { + current: IClickhouseSession; + chRows: [IClickhouseSession, IClickhouseSession]; + } { + const oldRow: IClickhouseSession = { ...existing, sign: -1 }; + const current: IClickhouseSession = { + ...existing, + sign: 1, + version: existing.version + 1, + }; + + // Out-of-order safety: treat the session window as [min(event ts), + // max(event ts)]. Late-arriving events (offline mobile flush, retries) + // can't drag ended_at backwards or push duration negative. + const eventTimeMs = payload.createdAt.getTime(); + const startMs = fromClickhouseDate(current.created_at).getTime(); + const endMs = fromClickhouseDate(current.ended_at).getTime(); + const eventCh = toClickhouseDate(payload.createdAt); + + if (eventTimeMs >= endMs) { + current.ended_at = eventCh; + if (payload.path) current.exit_path = payload.path; + if (payload.origin) current.exit_origin = payload.origin; + } - const addedRevenue = event.name === 'revenue' ? (event.revenue ?? 0) : 0; - newSession.revenue = (newSession.revenue ?? 0) + addedRevenue; + if (eventTimeMs < startMs) { + current.created_at = eventCh; + if (payload.path) current.entry_path = payload.path; + if (payload.origin) current.entry_origin = payload.origin; + } else { + if (!current.entry_path && payload.path) current.entry_path = payload.path; + if (!current.entry_origin && payload.origin) + current.entry_origin = payload.origin; + } - if (event.name === 'screen_view' && event.path) { - newSession.screen_view_count += 1; - } else { - newSession.event_count += 1; - } + current.duration = + fromClickhouseDate(current.ended_at).getTime() - + fromClickhouseDate(current.created_at).getTime(); - if (newSession.screen_view_count > 1) { - newSession.is_bounce = false; - } + if (payload.name === 'revenue') { + current.revenue = (current.revenue ?? 0) + (payload.revenue ?? 0); + } - // If the profile_id is set and it's different from the device_id, we need to update the profile_id - if (event.profile_id && event.profile_id !== event.device_id) { - newSession.profile_id = event.profile_id; - } + if (payload.name === 'screen_view' && payload.path) { + current.screen_view_count += 1; + } else { + current.event_count += 1; + } - if (event.groups) { - newSession.groups = [ - ...new Set([...(newSession.groups ?? []), ...event.groups]), - ]; - } + if (current.screen_view_count > 1) { + current.is_bounce = false; + } - return [newSession, oldSession]; + if (payload.profileId && payload.profileId !== payload.deviceId) { + current.profile_id = payload.profileId; } - return [ - { - id: event.session_id, - is_bounce: true, - profile_id: event.profile_id, - project_id: event.project_id, - device_id: event.device_id, - groups: event.groups, - created_at: event.created_at, - ended_at: event.created_at, - event_count: event.name === 'screen_view' ? 0 : 1, - screen_view_count: event.name === 'screen_view' ? 1 : 0, - entry_path: event.path, - entry_origin: event.origin, - exit_path: event.path, - exit_origin: event.origin, - revenue: event.name === 'revenue' ? (event.revenue ?? 0) : 0, - referrer: event.referrer, - referrer_name: event.referrer_name, - referrer_type: event.referrer_type, - os: event.os, - os_version: event.os_version, - browser: event.browser, - browser_version: event.browser_version, - device: event.device, - brand: event.brand, - model: event.model, - country: event.country, - region: event.region, - city: event.city, - longitude: event.longitude ?? null, - latitude: event.latitude ?? null, - duration: event.duration, - utm_medium: event.properties?.['__query.utm_medium'] - ? String(event.properties?.['__query.utm_medium']) - : '', - utm_source: event.properties?.['__query.utm_source'] - ? String(event.properties?.['__query.utm_source']) - : '', - utm_campaign: event.properties?.['__query.utm_campaign'] - ? String(event.properties?.['__query.utm_campaign']) - : '', - utm_content: event.properties?.['__query.utm_content'] - ? String(event.properties?.['__query.utm_content']) - : '', - utm_term: event.properties?.['__query.utm_term'] - ? String(event.properties?.['__query.utm_term']) - : '', - sign: 1, - version: 1, - }, - ]; + if (payload.groups?.length) { + current.groups = [ + ...new Set([...(current.groups ?? []), ...payload.groups]), + ]; + } + + return { current, chRows: [current, oldRow] }; } - protected getRedisListKey(): string { - return this.redisKey; + /** + * Open a brand-new session from this event's payload. Called for the first + * event of a device AND on boundary (gap > 30min closing the old session). + */ + private newSession(payload: IServiceCreateEventPayload): IClickhouseSession { + const createdAt = toClickhouseDate(payload.createdAt); + return { + id: payload.sessionId, + project_id: payload.projectId, + device_id: payload.deviceId, + profile_id: payload.profileId ?? '', + is_bounce: true, + created_at: createdAt, + ended_at: createdAt, + event_count: payload.name === 'screen_view' ? 0 : 1, + screen_view_count: payload.name === 'screen_view' ? 1 : 0, + entry_path: payload.path ?? '', + entry_origin: payload.origin ?? '', + exit_path: payload.path ?? '', + exit_origin: payload.origin ?? '', + revenue: payload.name === 'revenue' ? (payload.revenue ?? 0) : 0, + referrer: payload.referrer ?? '', + referrer_name: payload.referrerName ?? '', + referrer_type: payload.referrerType ?? '', + os: payload.os ?? '', + os_version: payload.osVersion ?? '', + browser: payload.browser ?? '', + browser_version: payload.browserVersion ?? '', + device: payload.device ?? '', + brand: payload.brand ?? '', + model: payload.model ?? '', + country: payload.country ?? '', + region: payload.region ?? '', + city: payload.city ?? '', + longitude: payload.longitude ?? null, + latitude: payload.latitude ?? null, + duration: payload.duration ?? 0, + utm_medium: pickUtm(payload, 'utm_medium'), + utm_source: pickUtm(payload, 'utm_source'), + utm_campaign: pickUtm(payload, 'utm_campaign'), + utm_content: pickUtm(payload, 'utm_content'), + utm_term: pickUtm(payload, 'utm_term'), + groups: payload.groups ?? [], + sign: 1, + version: 1, + }; } - async add(event: IClickhouseEvent) { - if (!event.session_id) { - return; + /** + * Atomic write-back: session blob + active/wallclock ZSETs + projects SET + + * HWM advance + profile index + ClickHouse buffer rows + counter. + */ + private async persist( + current: IClickhouseSession, + chRows: IClickhouseSession[] + ) { + const projectId = current.project_id; + const deviceId = current.device_id; + const eventTimeMs = fromClickhouseDate(current.ended_at).getTime(); + const wallClockMs = Date.now(); + + const multi = this.redis.multi(); + multi.set( + sessionKey(projectId, deviceId), + JSON.stringify(current), + 'EX', + SESSION_BLOB_TTL_SECONDS + ); + multi.zadd(activeSetKey(projectId), eventTimeMs.toString(), deviceId); + multi.zadd(wallclockSetKey(projectId), wallClockMs.toString(), deviceId); + multi.expire(wallclockSetKey(projectId), SESSION_WALLCLOCK_TTL_SECONDS); + multi.sadd(PROJECTS_SET_KEY, projectId); + multi.eval(MONOTONIC_HWM_LUA, 1, hwmKey(projectId), eventTimeMs.toString()); + + if (current.profile_id && current.profile_id !== current.device_id) { + multi.set( + profileIndexKey(projectId, current.profile_id), + deviceId, + 'EX', + SESSION_BLOB_TTL_SECONDS + ); } - if (['session_start', 'session_end'].includes(event.name)) { - return; + for (const row of chRows) { + multi.rpush(this.redisKey, JSON.stringify(row)); + } + multi.llen(this.redisKey); + const result = await multi.exec(); + + // Read ground-truth queue length from the appended LLEN result so we + // avoid an extra round-trip to decide whether to flush. + const llenIndex = (result?.length ?? 1) - 1; + const bufferLength = (result?.[llenIndex]?.[1] as number) ?? 0; + if (bufferLength >= this.batchSize) { + await this.tryFlush(); } - - return this.timeAdd(async () => { - try { - // Plural since we will delete the old session with sign column - const sessions = await this.getSession(event); - const [newSession] = sessions; - - const multi = this.redis.multi(); - multi.set( - `session:${newSession.id}`, - JSON.stringify(newSession), - 'EX', - 60 * 60 - ); - if (newSession.profile_id) { - multi.set( - `session:${newSession.project_id}:${newSession.profile_id}`, - newSession.id, - 'EX', - 60 * 60 - ); - } - for (const session of sessions) { - multi.rpush(this.redisKey, JSON.stringify(session)); - } - // Append LLEN at the end so we can read ground-truth queue length - // from the exec result without an extra round-trip. - multi.llen(this.redisKey); - const result = await multi.exec(); - - const llenIndex = (result?.length ?? 1) - 1; - const bufferLength = (result?.[llenIndex]?.[1] as number) ?? 0; - - if (bufferLength >= this.batchSize) { - await this.tryFlush({ trigger: 'add' }); - } - } catch (error) { - this.logger.error({ err: error }, 'Failed to add session'); - } - }); } /** @@ -349,65 +449,46 @@ export class SessionBuffer extends BaseBuffer { } async processBuffer() { - const lrangeStart = performance.now(); const events = await this.redis.lrange( this.redisKey, 0, this.batchSize - 1 ); - const lrangeMs = performance.now() - lrangeStart; - - if (events.length === 0) { - this.reportFlushStats({ rowsProcessed: 0, phases: { lrangeMs } }); - return; - } - - const parsed: IClickhouseSession[] = []; - for (const raw of events) { - const session = getSafeJson(raw); - if (!session) { - continue; - } - parsed.push({ - ...session, - duration: Math.max(0, session.duration || 0), - }); - } - - // A single high-activity session can produce many (sign=-1, sign=+1) - // pairs within one flush window — each event for that session adds a - // pair. The VersionedCollapsingMergeTree on `sessions` will collapse - // them at merge time, so the final state is correct either way. But - // inserting all the intermediate rows costs network bytes, gzip CPU, - // CH ingest work, and CH merge work. - // - // We can squash within the batch and only insert the boundary rows: - // the oldest sign=-1 (which cancels the previous CH state) and the - // newest sign=+1 (the final state). Same end result, fewer rows. + if (events.length === 0) return; + + const parsed = events + .map((e) => getSafeJson(e)) + .filter((s): s is IClickhouseSession => s !== null) + .map((s) => ({ ...s, duration: Math.max(0, s.duration || 0) })); + + // A single high-activity session produces many (sign=-1, sign=+1) + // pairs within one flush window — every extend appends a pair. The + // VersionedCollapsingMergeTree collapses them at merge time, so the + // final state is correct either way, but inserting all the + // intermediate rows costs network bytes, gzip CPU, and CH ingest + + // merge work. Squash per-version to insert only the boundary rows. // // Set SESSION_BUFFER_SQUASH=false to disable as a safety hatch. const sessions = this.squashEnabled ? this.squashSessionsByVersion(parsed) : parsed; - const chStart = performance.now(); - await this.parallelLimit(this.chunks(sessions, this.chunkSize), (chunk) => - ch.insert({ + // Insert before trimming so a failed CH insert leaves the queue intact — + // errors propagate to tryFlush, which resyncs and retries. Never swallow. + for (const chunk of this.chunks(sessions, this.chunkSize)) { + await ch.insert({ table: TABLE_NAMES.sessions, values: chunk, format: 'JSONEachRow', - clickhouse_settings: this.getClickhouseSettings(), - }) - ); - const chInsertMs = performance.now() - chStart; + }); + } - const trimStart = performance.now(); await this.redis.ltrim(this.redisKey, events.length, -1); - const trimMs = performance.now() - trimStart; - this.reportFlushStats({ - rowsProcessed: events.length, - phases: { lrangeMs, chInsertMs, trimMs }, - }); + this.logger.debug({ count: events.length }, 'Processed sessions'); + } + + protected getRedisListKey(): string { + return this.redisKey; } } diff --git a/packages/db/src/services/event.service.ts b/packages/db/src/services/event.service.ts index b31c5bafa..981c17f1d 100644 --- a/packages/db/src/services/event.service.ts +++ b/packages/db/src/services/event.service.ts @@ -4,7 +4,7 @@ import type { IChartEventFilter } from '@openpanel/validation'; import { assocPath, last, mergeDeepRight, path, uniq } from 'ramda'; import sqlstring from 'sqlstring'; import { v4 as uuid } from 'uuid'; -import { botBuffer, eventBuffer, sessionBuffer } from '../buffers'; +import { botBuffer, eventBuffer } from '../buffers'; import { ch, chQuery, @@ -355,6 +355,16 @@ export async function getEvents( return events.map(transformEvent); } +/** + * Persist an event to ClickHouse (via the buffer) and upsert the profile + * on session boundaries. + * + * Does NOT touch the session-row buffer. Callers producing non-session_start + * / session_end events are responsible for calling `sessionBuffer.ingest()` + * before this. `incoming-event.ts` is the only such caller today; everywhere + * else (session_start, session_end) the session-row update is correctly a + * no-op anyway. + */ export async function createEvent(payload: IServiceCreateEventPayload) { if (!payload.profileId && payload.deviceId) { payload.profileId = payload.deviceId; @@ -396,7 +406,9 @@ export async function createEvent(payload: IServiceCreateEventPayload) { groups: payload.groups ?? [], }; - const promises = [sessionBuffer.add(event), eventBuffer.add(event)]; + eventBuffer.add(event); + + const promises: Promise[] = []; if (payload.profileId) { const profile: IServiceUpsertProfile = { diff --git a/packages/db/src/services/session.service.ts b/packages/db/src/services/session.service.ts index 4c71c3f12..7a3321e47 100644 --- a/packages/db/src/services/session.service.ts +++ b/packages/db/src/services/session.service.ts @@ -50,9 +50,6 @@ export interface IClickhouseSession { utm_content: string; utm_term: string; revenue: number; - // CollapsingMergeTree marker: +1 = current state, -1 = cancel a prior +1. - // The session-buffer emits both +1 (new) and -1 (old) rows per update so - // CH can collapse intermediate states. sign: 1 | -1; version: number; // Dynamically added @@ -213,7 +210,7 @@ export async function getSessionList(options: GetSessionListOptions) { groupsExpr: 'groups', startDate, endDate, - }), + }) ); } @@ -338,7 +335,7 @@ export async function getSessionsCount({ groupsExpr: 'groups', startDate, endDate, - }), + }) ); } @@ -481,7 +478,7 @@ export interface QuerySessionsInput { } export async function querySessionsCore( - input: QuerySessionsInput, + input: QuerySessionsInput ): Promise { const builder = clix(ch) .select([]) @@ -525,7 +522,10 @@ export async function querySessionsCore( builder.where('browser', '=', input.browser); } - const { startDate: start, endDate: end } = resolveDateRange(input.startDate, input.endDate); + const { startDate: start, endDate: end } = resolveDateRange( + input.startDate, + input.endDate + ); builder.where('created_at', 'BETWEEN', [ clix.datetime(start), diff --git a/packages/queue/src/queues.ts b/packages/queue/src/queues.ts index 88398b9e7..2a78c0a3e 100644 --- a/packages/queue/src/queues.ts +++ b/packages/queue/src/queues.ts @@ -138,6 +138,10 @@ export type CronQueuePayloadCohortRefresh = { type: 'cohortRefresh'; payload: undefined; }; +export type CronQueuePayloadSessionReaper = { + type: 'sessionReaper'; + payload: undefined; +}; export type CronQueuePayload = | CronQueuePayloadSalt | CronQueuePayloadFlushEvents @@ -151,7 +155,8 @@ export type CronQueuePayload = | CronQueuePayloadInsightsDaily | CronQueuePayloadOnboarding | CronQueuePayloadGscSync - | CronQueuePayloadCohortRefresh; + | CronQueuePayloadCohortRefresh + | CronQueuePayloadSessionReaper; export type MiscQueuePayloadTrialEndingSoon = { type: 'trialEndingSoon'; From e763b95c10a9ec990954a70e68d549cb40354fd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Tue, 19 May 2026 22:39:07 +0200 Subject: [PATCH 2/5] wip --- apps/worker/src/boot-cron.ts | 5 + apps/worker/src/jobs/cron.session-reaper.ts | 99 ++++++++--------- apps/worker/src/jobs/cron.session-vacuum.ts | 101 ++++++++++++++++++ apps/worker/src/jobs/cron.ts | 21 +++- .../src/jobs/events.create-session-end.ts | 40 +++++-- .../src/jobs/events.incoming-events.test.ts | 5 +- apps/worker/src/metrics.ts | 16 ++- apps/worker/src/utils/session-handler.ts | 21 ++-- packages/db/scripts/migrate-sessions.ts | 42 +------- packages/db/src/buffers/index.ts | 1 - .../db/src/buffers/session-buffer.test.ts | 72 +++++++------ packages/db/src/buffers/session-buffer.ts | 96 +++++++++-------- packages/queue/src/queues.ts | 13 ++- 13 files changed, 332 insertions(+), 200 deletions(-) create mode 100644 apps/worker/src/jobs/cron.session-vacuum.ts diff --git a/apps/worker/src/boot-cron.ts b/apps/worker/src/boot-cron.ts index fc9587386..4150308dc 100644 --- a/apps/worker/src/boot-cron.ts +++ b/apps/worker/src/boot-cron.ts @@ -98,6 +98,11 @@ export async function bootCron() { type: 'sessionReaper', pattern: 1000 * 60 * 5, // every 5 minutes }, + { + name: 'sessionVacuum', + type: 'sessionVacuum', + pattern: '0 4 * * *', // daily at 04:00 UTC — backstop for cleanup leaks + }, ]; if (process.env.SELF_HOSTED && process.env.NODE_ENV === 'production') { diff --git a/apps/worker/src/jobs/cron.session-reaper.ts b/apps/worker/src/jobs/cron.session-reaper.ts index 2cf9c4945..867b07db9 100644 --- a/apps/worker/src/jobs/cron.session-reaper.ts +++ b/apps/worker/src/jobs/cron.session-reaper.ts @@ -1,7 +1,11 @@ import { SESSION_TIMEOUT_MS, sessionBuffer } from '@openpanel/db'; import { getRedisCache } from '@openpanel/redis'; import { logger as baseLogger } from '@/utils/logger'; -import { sessionEndsEnqueued, sessionsReaped } from '@/metrics'; +import { + sessionEndsEnqueued, + sessionsReaped, + sessionsReaperOrphans, +} from '@/metrics'; import { enqueueSessionEndV2 } from '@/utils/session-handler'; const logger = baseLogger.child({ job: 'session-reaper' }); @@ -11,28 +15,31 @@ const REAPER_BATCH_SIZE = Number.parseInt( process.env.SESSION_REAPER_BATCH_SIZE || '5000', 10 ); -// 24h deadman: close sessions whose host project has gone silent in event-time -// AND whose last wall-clock-received is older than this. Bounds Redis memory -// for projects that fall idle. +// Wall-clock deadman: close sessions whose last *received* event is older +// than this. Single source of truth for "this session has ended", regardless +// of project traffic. +// +// Default 30min, matching SESSION_TIMEOUT_MS. With the 5-min reaper interval, +// max session_end latency is 30-35min after the last event. +// +// During a worker pause longer than this, sessions in flight will be split +// when the worker resumes. Bump via env if you expect long pauses, but +// remember that BullMQ jobId-dedup + the extension check in createSessionEnd +// already cover normal queue lag. const WALLCLOCK_DEADMAN_MS = Number.parseInt( process.env.SESSION_REAPER_WALLCLOCK_DEADMAN_MS || - String(1000 * 60 * 60 * 24), + String(SESSION_TIMEOUT_MS), 10 ); const LOCK_TTL_SECONDS = 60; -const activeSetKey = (projectId: string) => `session:active:${projectId}`; const wallclockSetKey = (projectId: string) => `session:wallclock:${projectId}`; -const sessionKey = (projectId: string, deviceId: string) => - `session:${projectId}:${deviceId}`; -const hwmKey = (projectId: string) => `session:hwm:${projectId}`; const lockKey = (projectId: string) => `session:reaper:lock:${projectId}`; /** * Cron: scan every project with active sessions, close any whose last event - * is older than the timeout in event-time (HWM-based) or older than the - * 24h wall-clock deadman. + * was received more than the deadman ago in wall-clock time. * * Idempotent — uses BullMQ jobId-dedup via enqueueSessionEndV2. */ @@ -89,52 +96,28 @@ async function reapProject(projectId: string): Promise { } try { - const hwmRaw = await redis.get(hwmKey(projectId)); const now = Date.now(); - let reaped = 0; - - // 1. Event-time reap: close sessions whose last event is more than - // SESSION_TIMEOUT_MS behind the project's high-water mark. - if (hwmRaw) { - const hwm = Number.parseInt(hwmRaw, 10); - if (Number.isFinite(hwm)) { - const threshold = hwm - SESSION_TIMEOUT_MS; - const candidates = await redis.zrangebyscore( - activeSetKey(projectId), - 0, - threshold, - 'LIMIT', - 0, - REAPER_BATCH_SIZE - ); - for (const deviceId of candidates) { - if (await closeSession(projectId, deviceId, 'event-time')) { - reaped++; - } - } - } - } + const cutoff = now - WALLCLOCK_DEADMAN_MS; - // 2. Wall-clock deadman: catches sessions in projects whose HWM never - // advances (low/zero traffic). Bounds memory. - const deadmanCutoff = now - WALLCLOCK_DEADMAN_MS; - const stragglers = await redis.zrangebyscore( + const candidates = await redis.zrangebyscore( wallclockSetKey(projectId), 0, - deadmanCutoff, + cutoff, 'LIMIT', 0, REAPER_BATCH_SIZE ); - for (const deviceId of stragglers) { - if (await closeSession(projectId, deviceId, 'deadman')) { + + let reaped = 0; + for (const deviceId of candidates) { + if (await closeSession(projectId, deviceId)) { reaped++; } } - // 3. House-keeping: if the project has no active sessions anymore, - // remove it from the projects set so the reaper stops iterating it. - const remaining = await redis.zcard(activeSetKey(projectId)); + // If the project has no remaining sessions in the wallclock index, + // remove it from the projects set so we stop iterating it. + const remaining = await redis.zcard(wallclockSetKey(projectId)); if (remaining === 0) { await redis.srem(PROJECTS_SET_KEY, projectId); } @@ -147,8 +130,7 @@ async function reapProject(projectId: string): Promise { async function closeSession( projectId: string, - deviceId: string, - reason: 'event-time' | 'deadman' + deviceId: string ): Promise { const session = await sessionBuffer.getExistingSession({ projectId, @@ -156,13 +138,16 @@ async function closeSession( }); if (!session) { - // Already cleaned up (e.g., session_end ran from another path). Drop - // from the sorted sets to keep them tidy. - const redis = getRedisCache(); - const multi = redis.multi(); - multi.zrem(activeSetKey(projectId), deviceId); - multi.zrem(wallclockSetKey(projectId), deviceId); - await multi.exec(); + // Sorted-set entry exists but the blob is gone. Without a TTL on the + // blob this should be rare — only happens if cleanup() partially + // failed (worker crash mid-MULTI, network partition). Log + count it + // and ZREM the orphan entry. + sessionsReaperOrphans.inc({ reason: 'deadman' }); + baseLogger.warn( + { projectId, deviceId }, + 'Reaper found wallclock entry without blob — likely a partial cleanup failure', + ); + await getRedisCache().zrem(wallclockSetKey(projectId), deviceId); return false; } @@ -201,17 +186,17 @@ async function closeSession( closedSession: session, }); - sessionsReaped.inc({ reason }); + sessionsReaped.inc({ reason: 'deadman' }); sessionEndsEnqueued.inc({ source: 'reaper' }); logger.debug( - { sessionId: session.id, projectId, deviceId, reason }, + { sessionId: session.id, projectId, deviceId }, 'Enqueued session_end (reaped)', ); return true; } catch (error) { logger.error( - { err: error, sessionId: session.id, projectId, deviceId, reason }, + { err: error, sessionId: session.id, projectId, deviceId }, 'Failed to enqueue session_end during reap', ); return false; diff --git a/apps/worker/src/jobs/cron.session-vacuum.ts b/apps/worker/src/jobs/cron.session-vacuum.ts new file mode 100644 index 000000000..ea1911395 --- /dev/null +++ b/apps/worker/src/jobs/cron.session-vacuum.ts @@ -0,0 +1,101 @@ +import { sessionBuffer } from '@openpanel/db'; +import { getRedisCache } from '@openpanel/redis'; +import { logger as baseLogger } from '@/utils/logger'; +import { sessionsVacuumed } from '@/metrics'; + +const logger = baseLogger.child({ job: 'session-vacuum' }); + +const PROJECTS_SET_KEY = 'session:projects'; +const VACUUM_BATCH_SIZE = Number.parseInt( + process.env.SESSION_VACUUM_BATCH_SIZE || '1000', + 10 +); +// Anything in the wallclock index that hasn't moved in this long is stale — +// either a leaked entry (cleanup() partially failed) or a session that's +// been forgotten by every code path. Much larger than the reaper deadman +// so it never races with normal reap timing. +const VACUUM_STALE_THRESHOLD_MS = Number.parseInt( + process.env.SESSION_VACUUM_STALE_THRESHOLD_MS || + String(1000 * 60 * 60 * 24 * 7), // 7 days + 10 +); + +const wallclockSetKey = (projectId: string) => + `session:wallclock:${projectId}`; + +/** + * Daily backstop for the rare case where `cleanup()` fails to fully delete + * a session (worker crash mid-MULTI, Redis partition). Scans each project's + * wallclock index for entries older than the stale threshold and: + * + * - If the blob still exists → run cleanup() (id-gated, atomic via Lua) + * - If the blob is gone → ZREM the orphan entry + * + * Both paths increment `sessions_vacuumed_total` with a reason label, so + * persistent non-zero values surface deeper issues. + */ +export async function sessionVacuumCronJob() { + if (process.env.SESSION_VACUUM === '0') { + return; + } + + const redis = getRedisCache(); + const projectIds = await redis.smembers(PROJECTS_SET_KEY); + + if (projectIds.length === 0) { + return; + } + + logger.info({ projectCount: projectIds.length }, 'Vacuum tick starting'); + + const cutoff = Date.now() - VACUUM_STALE_THRESHOLD_MS; + let total = 0; + let staleBlobs = 0; + let missingBlobs = 0; + + for (const projectId of projectIds) { + try { + const candidates = await redis.zrangebyscore( + wallclockSetKey(projectId), + 0, + cutoff, + 'LIMIT', + 0, + VACUUM_BATCH_SIZE + ); + + for (const deviceId of candidates) { + const session = await sessionBuffer.getExistingSession({ + projectId, + deviceId, + }); + + if (session) { + // Blob still exists but is ancient — cleanup leaked. Use the + // id-gated cleanup so we don't race with a fresh session that + // happens to occupy the same slot. + await sessionBuffer.cleanup({ + projectId, + deviceId, + sessionId: session.id, + profileId: session.profile_id, + }); + sessionsVacuumed.inc({ reason: 'stale_blob' }); + staleBlobs++; + } else { + // Orphan: blob is gone, wallclock entry left behind. + await redis.zrem(wallclockSetKey(projectId), deviceId); + sessionsVacuumed.inc({ reason: 'missing_blob' }); + missingBlobs++; + } + total++; + } + } catch (error) { + logger.error({ err: error, projectId }, 'Vacuum failed for project'); + } + } + + if (total > 0) { + logger.info({ total, staleBlobs, missingBlobs }, 'Vacuum tick complete'); + } +} diff --git a/apps/worker/src/jobs/cron.ts b/apps/worker/src/jobs/cron.ts index 214ce0bdf..b00a6ddc0 100644 --- a/apps/worker/src/jobs/cron.ts +++ b/apps/worker/src/jobs/cron.ts @@ -1,18 +1,26 @@ -import type { Job } from 'bullmq'; - -import { eventBuffer, groupBuffer, profileBackfillBuffer, profileBuffer, replayBuffer, sessionBuffer } from '@openpanel/db'; +import { + eventBuffer, + groupBuffer, + profileBackfillBuffer, + profileBuffer, + replayBuffer, + sessionBuffer, +} from '@openpanel/db'; import type { CronQueuePayload } from '@openpanel/queue'; - +import type { Job } from 'bullmq'; import { cohortRefreshCronJob } from './cron.cohort-refresh'; import { jobDelete } from './cron.delete'; -import { gscSyncAllJob } from './gsc'; import { onboardingJob } from './cron.onboarding'; import { ping } from './cron.ping'; import { salt } from './cron.salt'; import { sessionReaperCronJob } from './cron.session-reaper'; +import { sessionVacuumCronJob } from './cron.session-vacuum'; +import { gscSyncAllJob } from './gsc'; import { insightsDailyJob } from './insights'; +import { logger } from '@/utils/logger'; export async function cronJob(job: Job) { + logger.info(`Cron job started - ${job.data.type}`); switch (job.data.type) { case 'salt': { return await salt(); @@ -56,5 +64,8 @@ export async function cronJob(job: Job) { case 'sessionReaper': { return await sessionReaperCronJob(); } + case 'sessionVacuum': { + return await sessionVacuumCronJob(); + } } } diff --git a/apps/worker/src/jobs/events.create-session-end.ts b/apps/worker/src/jobs/events.create-session-end.ts index 972da066e..460a40d76 100644 --- a/apps/worker/src/jobs/events.create-session-end.ts +++ b/apps/worker/src/jobs/events.create-session-end.ts @@ -59,7 +59,7 @@ async function getSessionEvents({ export async function createSessionEnd( job: Job ) { - const { payload } = job.data; + const { payload, snapshot } = job.data; const logger = baseLogger.child({ payload, jobId: job.id, @@ -67,19 +67,41 @@ export async function createSessionEnd( logger.debug('Processing session end job'); - const session = await sessionBuffer.getExistingSession({ + // Prefer the live blob — it reflects any late extensions that arrived after + // enqueue. Fall back to the snapshot if the blob expired (very long queue + // lag, retries hours later, etc.). + const live = await sessionBuffer.getExistingSession({ projectId: payload.projectId, deviceId: payload.deviceId, }); + // Distinguish three cases: + // - Same session, extended after enqueue → bail. Reaper will retry later. + // - Same session, no change → use live (snapshot equivalent). + // - Different session in slot (boundary) → use snapshot for emission; + // cleanup will be a no-op since the slot is owned by the new session. + // - No live blob at all (TTL expired) → use snapshot. + const sameSession = live && live.id === snapshot.id; + if (sameSession && live.ended_at > snapshot.ended_at) { + sessionEndsSkipped.inc({ reason: 'extended_after_enqueue' }); + logger.info( + { + sessionId: live.id, + projectId: live.project_id, + snapshotEndedAt: snapshot.ended_at, + liveEndedAt: live.ended_at, + }, + 'session was extended after close was enqueued, skipping', + ); + return null; + } + + const session = sameSession ? live : snapshot; if (!session) { - // Session already cleaned up — likely already ended by a sibling job or - // reaped after Redis TTL. Treat as a no-op rather than a hard error so - // retries don't spam the queue. sessionEndsSkipped.inc({ reason: 'not_found' }); logger.warn( { projectId: payload.projectId, deviceId: payload.deviceId }, - 'Session not found in Redis — skipping session_end', + 'No live session and no snapshot — skipping session_end', ); return null; } @@ -156,12 +178,14 @@ export async function createSessionEnd( ); } - // Clean up Redis state for this session. The session blob, profile index, - // and active/wallclock sorted set entries are all removed atomically. + // Clean up Redis state for this session. cleanup() is id-gated — if a new + // session has already taken over this (projectId, deviceId) slot via a + // boundary, the call is a no-op and the new session is preserved. await sessionBuffer .cleanup({ projectId: session.project_id, deviceId: session.device_id, + sessionId: session.id, profileId: session.profile_id, }) .catch((error) => { diff --git a/apps/worker/src/jobs/events.incoming-events.test.ts b/apps/worker/src/jobs/events.incoming-events.test.ts index d1a6e7af9..a9edb9dc7 100644 --- a/apps/worker/src/jobs/events.incoming-events.test.ts +++ b/apps/worker/src/jobs/events.incoming-events.test.ts @@ -188,9 +188,8 @@ describe('incomingEvent', () => { const [, payload, opts] = spy.mock.calls[0]!; expect((payload as any).type).toBe('createSessionEnd'); expect((payload as any).payload.sessionId).toBe('old-session-id'); - expect(opts?.jobId).toBe( - `sessionEnd:v2:${projectId}:${deviceId}:old-session-id` - ); + expect((payload as any).snapshot.id).toBe('old-session-id'); + expect(opts?.jobId).toBe('sessionEnd:v2:old-session-id'); const calls = (createEvent as Mock).mock.calls; expect(calls.filter(([a]) => a?.name === 'session_start')).toHaveLength(1); diff --git a/apps/worker/src/metrics.ts b/apps/worker/src/metrics.ts index 7506f428a..d96308e2d 100644 --- a/apps/worker/src/metrics.ts +++ b/apps/worker/src/metrics.ts @@ -344,6 +344,13 @@ export const sessionsReaped = new client.Counter({ }); register.registerMetric(sessionsReaped); +export const sessionsReaperOrphans = new client.Counter({ + name: 'sessions_reaper_orphans_total', + help: 'Reaper found a sorted-set entry whose session blob is missing. Non-zero usually means TTL mismatch.', + labelNames: ['reason'], // 'event-time' | 'deadman' +}); +register.registerMetric(sessionsReaperOrphans); + export const sessionDurationOnClose = new client.Histogram({ name: 'session_duration_ms_on_close', help: 'Duration of closed sessions (ms)', @@ -383,7 +390,7 @@ register.registerMetric( } const multi = redis.multi(); for (const pid of projectIds) { - multi.zcard(`session:active:${pid}`); + multi.zcard(`session:wallclock:${pid}`); } const results = await multi.exec(); let total = 0; @@ -441,3 +448,10 @@ register.registerMetric( }, }) ); + +export const sessionsVacuumed = new client.Counter({ + name: 'sessions_vacuumed_total', + help: 'Sessions removed by the daily vacuum cron (catches blobs that cleanup() missed)', + labelNames: ['reason'], // 'stale_blob' | 'missing_blob' +}); +register.registerMetric(sessionsVacuumed); diff --git a/apps/worker/src/utils/session-handler.ts b/apps/worker/src/utils/session-handler.ts index aea312709..ecf43dd9f 100644 --- a/apps/worker/src/utils/session-handler.ts +++ b/apps/worker/src/utils/session-handler.ts @@ -11,15 +11,17 @@ export const SESSION_TIMEOUT = SESSION_TIMEOUT_MS; /** * Deterministic v2 jobId for a closed session. * - * Includes the session's stable `id` so concurrent / retried closes for the + * Keyed on the session's stable `id` so concurrent / retried closes for the * same logical session dedupe in BullMQ. The `v2:` prefix lets the legacy * drain script differentiate from pre-migration delayed jobs. + * + * Format constraint: BullMQ only accepts ':' in custom jobIds when splitting + * by ':' yields exactly 3 parts, so we keep the suffix as a single segment. + * Cross-project sessionInternalId collisions would be astronomical given the + * 128-bit hash used to generate them. */ -export const getSessionEndJobIdV2 = ( - projectId: string, - deviceId: string, - sessionInternalId: string -) => `sessionEnd:v2:${projectId}:${deviceId}:${sessionInternalId}`; +export const getSessionEndJobIdV2 = (sessionInternalId: string) => + `sessionEnd:v2:${sessionInternalId}`; /** * Enqueue a session_end job. Idempotent via jobId. @@ -34,11 +36,7 @@ export async function enqueueSessionEndV2({ payload: IServiceCreateEventPayload; closedSession: IClickhouseSession; }) { - const jobId = getSessionEndJobIdV2( - closedSession.project_id, - closedSession.device_id, - closedSession.id - ); + const jobId = getSessionEndJobIdV2(closedSession.id); return sessionsQueue.add( 'session', @@ -51,6 +49,7 @@ export async function enqueueSessionEndV2({ sessionId: closedSession.id, profileId: closedSession.profile_id || payload.profileId, }, + snapshot: closedSession, }, { jobId, diff --git a/packages/db/scripts/migrate-sessions.ts b/packages/db/scripts/migrate-sessions.ts index 64fadbb70..ca991a023 100644 --- a/packages/db/scripts/migrate-sessions.ts +++ b/packages/db/scripts/migrate-sessions.ts @@ -2,11 +2,9 @@ // delayed BullMQ `sessionEnd:{projectId}:{deviceId}` jobs into the new // session-buffer key scheme: // -// session:{projectId}:{deviceId} -> full session JSON blob -// session:profile:{projectId}:{profileId} -> deviceId pointer -// session:active:{projectId} -> ZSET scored by last event_time (ms) +// session:{projectId}:{deviceId} -> full session JSON blob (no TTL) +// session:profile:{projectId}:{profileId} -> deviceId pointer (no TTL) // session:wallclock:{projectId} -> ZSET scored by wall-clock (ms) -// session:hwm:{projectId} -> monotonic event-time high-water mark // session:projects -> SET of project_ids with active sessions // // Source of truth: the delayed `sessionsQueue` jobs — one per (projectId, @@ -34,26 +32,12 @@ import { import { getRedisCache } from '@openpanel/redis'; import type { IClickhouseSession } from '../src/services/session.service'; -const SESSION_BLOB_TTL_SECONDS = 60 * 60; -const SESSION_WALLCLOCK_TTL_SECONDS = 60 * 60 * 25; - -const MONOTONIC_HWM_LUA = ` -local cur = redis.call('GET', KEYS[1]) -if not cur or tonumber(ARGV[1]) > tonumber(cur) then - redis.call('SET', KEYS[1], ARGV[1]) - return 1 -end -return 0 -`; - const sessionKey = (projectId: string, deviceId: string) => `session:${projectId}:${deviceId}`; const profileIndexKey = (projectId: string, profileId: string) => `session:profile:${projectId}:${profileId}`; -const activeSetKey = (projectId: string) => `session:active:${projectId}`; const wallclockSetKey = (projectId: string) => `session:wallclock:${projectId}`; -const hwmKey = (projectId: string) => `session:hwm:${projectId}`; const PROJECTS_SET_KEY = 'session:projects'; async function main() { @@ -64,8 +48,6 @@ async function main() { `[migrate-sessions] starting${dryRun ? ' (DRY RUN — no writes)' : ''}` ); - // Pull every delayed + waiting `sessionEnd` job from the legacy queue. - // These represent all currently-active sessions in the old system. const jobs = await sessionsQueue.getJobs(['delayed', 'waiting']); console.log(`[migrate-sessions] found ${jobs.length} legacy session jobs`); @@ -85,14 +67,12 @@ async function main() { } const { projectId, deviceId, sessionId } = payload; - // Skip if already migrated (re-runnable). const existsAlready = await redis.exists(sessionKey(projectId, deviceId)); if (existsAlready) { counts.alreadyMigrated++; continue; } - // Legacy: blob lived at session:{sessionId}. const blob = await redis.get(`session:${sessionId}`); if (!blob) { counts.blobMissing++; @@ -105,7 +85,6 @@ async function main() { continue; } - const lastEventMs = new Date(session.ended_at).getTime(); const nowMs = Date.now(); if (dryRun) { @@ -114,24 +93,11 @@ async function main() { } const multi = redis.multi(); - multi.set( - sessionKey(projectId, deviceId), - blob, - 'EX', - SESSION_BLOB_TTL_SECONDS - ); - multi.zadd(activeSetKey(projectId), lastEventMs.toString(), deviceId); + multi.set(sessionKey(projectId, deviceId), blob); multi.zadd(wallclockSetKey(projectId), nowMs.toString(), deviceId); - multi.expire(wallclockSetKey(projectId), SESSION_WALLCLOCK_TTL_SECONDS); multi.sadd(PROJECTS_SET_KEY, projectId); - multi.eval(MONOTONIC_HWM_LUA, 1, hwmKey(projectId), lastEventMs.toString()); if (session.profile_id && session.profile_id !== deviceId) { - multi.set( - profileIndexKey(projectId, session.profile_id), - deviceId, - 'EX', - SESSION_BLOB_TTL_SECONDS - ); + multi.set(profileIndexKey(projectId, session.profile_id), deviceId); } await multi.exec(); diff --git a/packages/db/src/buffers/index.ts b/packages/db/src/buffers/index.ts index 5f33fe8d5..46b5920bc 100644 --- a/packages/db/src/buffers/index.ts +++ b/packages/db/src/buffers/index.ts @@ -18,6 +18,5 @@ export type { ProfileBackfillEntry } from './profile-backfill-buffer'; export type { IClickhouseSessionReplayChunk } from './replay-buffer'; export { SESSION_TIMEOUT_MS, - SESSION_BLOB_TTL_SECONDS, type SessionIngestResult, } from './session-buffer'; diff --git a/packages/db/src/buffers/session-buffer.test.ts b/packages/db/src/buffers/session-buffer.test.ts index 215912896..282c84f99 100644 --- a/packages/db/src/buffers/session-buffer.test.ts +++ b/packages/db/src/buffers/session-buffer.test.ts @@ -93,46 +93,26 @@ describe('SessionBuffer', () => { expect(parsed.device_id).toBe(deviceId); }); - it('registers the device in active+wallclock sorted sets and the projects set', async () => { + it('registers the device in the wallclock sorted set and the projects set', async () => { await sessionBuffer.ingest(makePayload()); - expect( - await redis.zscore(`session:active:${projectId}`, deviceId) - ).not.toBeNull(); expect( await redis.zscore(`session:wallclock:${projectId}`, deviceId) ).not.toBeNull(); expect(await redis.sismember('session:projects', projectId)).toBe(1); }); - it('advances session:hwm:{pid} monotonically — never regresses', async () => { - const t0 = new Date('2026-01-01T00:00:00.000Z'); - await sessionBuffer.ingest(makePayload({ createdAt: t0 })); - expect(Number(await redis.get(`session:hwm:${projectId}`))).toBe( - t0.getTime() - ); - - // A later event in event-time advances HWM. + it('writes no TTL on the session blob or profile pointer', async () => { await sessionBuffer.ingest( - makePayload({ - createdAt: new Date(t0.getTime() + 5000), - deviceId: 'device-2', - }) - ); - expect(Number(await redis.get(`session:hwm:${projectId}`))).toBe( - t0.getTime() + 5000 + makePayload({ profileId: 'profile-X', deviceId: 'device-Y' }) ); - // An older event does NOT regress HWM. - await sessionBuffer.ingest( - makePayload({ - createdAt: new Date(t0.getTime() - 5000), - deviceId: 'device-3', - }) - ); - expect(Number(await redis.get(`session:hwm:${projectId}`))).toBe( - t0.getTime() + 5000 - ); + // Blob and profile pointer must survive arbitrarily long reaper outages, + // so neither carries a Redis TTL. + expect(await redis.ttl(`session:${projectId}:device-Y`)).toBe(-1); + expect( + await redis.ttl(`session:profile:${projectId}:profile-X`) + ).toBe(-1); }); it('writes profile→device pointer when profile_id differs from device_id', async () => { @@ -220,7 +200,7 @@ describe('SessionBuffer', () => { expect(blob.duration).toBeGreaterThanOrEqual(0); }); - it('cleanup() removes blob, sorted set entries, and profile pointer', async () => { + it('cleanup() removes blob, wallclock entry, and profile pointer', async () => { await sessionBuffer.ingest( makePayload({ profileId: 'profile-X', deviceId: 'device-Y' }) ); @@ -228,13 +208,11 @@ describe('SessionBuffer', () => { await sessionBuffer.cleanup({ projectId, deviceId: 'device-Y', + sessionId, profileId: 'profile-X', }); expect(await redis.get(`session:${projectId}:device-Y`)).toBeNull(); - expect( - await redis.zscore(`session:active:${projectId}`, 'device-Y') - ).toBeNull(); expect( await redis.zscore(`session:wallclock:${projectId}`, 'device-Y') ).toBeNull(); @@ -243,6 +221,34 @@ describe('SessionBuffer', () => { ).toBeNull(); }); + it('cleanup() is a no-op when slot is owned by a different session (boundary)', async () => { + // Open session S1 + await sessionBuffer.ingest(makePayload({ sessionId: 's1' })); + // 35min later — boundary, opens S2 at same (pid, did) + await sessionBuffer.ingest( + makePayload({ + sessionId: 's2', + createdAt: new Date(Date.now() + 35 * 60 * 1000), + }) + ); + + // Try to cleanup S1 — should not touch S2's data. The Lua script in + // cleanup() detects the id mismatch and returns early. + await sessionBuffer.cleanup({ + projectId, + deviceId, + sessionId: 's1', + }); + + const blob = await redis.get(`session:${projectId}:${deviceId}`); + expect(blob).not.toBeNull(); + const parsed = JSON.parse(blob ?? ''); + expect(parsed.id).toBe('s2'); + expect( + await redis.zscore(`session:wallclock:${projectId}`, deviceId) + ).not.toBeNull(); + }); + it('processes buffer and inserts sessions into ClickHouse', async () => { await sessionBuffer.ingest(makePayload()); diff --git a/packages/db/src/buffers/session-buffer.ts b/packages/db/src/buffers/session-buffer.ts index c6652a32d..3eeea0b33 100644 --- a/packages/db/src/buffers/session-buffer.ts +++ b/packages/db/src/buffers/session-buffer.ts @@ -6,28 +6,35 @@ import type { IServiceCreateEventPayload } from '../services/event.service'; import type { IClickhouseSession } from '../services/session.service'; import { BaseBuffer } from './base-buffer'; -export const SESSION_TIMEOUT_MS = 1000 * 60 * 30; // 30 minutes -export const SESSION_BLOB_TTL_SECONDS = 60 * 60; // 1 hour -export const SESSION_WALLCLOCK_TTL_SECONDS = 60 * 60 * 25; // 25h, > 24h deadman +// 30min of idle in event-time → session ends. Matches industry default. +export const SESSION_TIMEOUT_MS = 1000 * 60 * 30; const sessionKey = (projectId: string, deviceId: string) => `session:${projectId}:${deviceId}`; const profileIndexKey = (projectId: string, profileId: string) => `session:profile:${projectId}:${profileId}`; -const activeSetKey = (projectId: string) => `session:active:${projectId}`; const wallclockSetKey = (projectId: string) => `session:wallclock:${projectId}`; -const hwmKey = (projectId: string) => `session:hwm:${projectId}`; const PROJECTS_SET_KEY = 'session:projects'; -// Monotonic-set Lua: only update if new value is greater than current. -const MONOTONIC_HWM_LUA = ` +// Atomic id-gated cleanup. Only deletes the blob + sorted-set entry + +// profile pointer if the live blob's session id matches the one we're +// closing. Prevents races where a boundary has already overwritten the slot +// with a new session. +const CLEANUP_LUA = ` local cur = redis.call('GET', KEYS[1]) -if not cur or tonumber(ARGV[1]) > tonumber(cur) then - redis.call('SET', KEYS[1], ARGV[1]) - return 1 +if cur then + local ok, parsed = pcall(cjson.decode, cur) + if ok and parsed and parsed.id ~= ARGV[1] then + return 0 + end end -return 0 +redis.call('DEL', KEYS[1]) +redis.call('ZREM', KEYS[2], ARGV[2]) +if KEYS[3] ~= '' then + redis.call('DEL', KEYS[3]) +end +return 1 `; export type SessionIngestResult = @@ -110,8 +117,8 @@ export class SessionBuffer extends BaseBuffer { * Reads the device's current session (if any), decides whether the event * extends it, opens a brand-new one, or boundaries (gap > 30min) → close * the old session and start a fresh one. Writes the updated session blob, - * advances HWM, updates sorted sets, and queues CollapsingMergeTree rows - * for ClickHouse. + * updates the wall-clock index, and queues CollapsingMergeTree rows for + * ClickHouse. * * Returns the action taken so the caller can drive session_start / * session_end event emission. `session_start` / `session_end` events @@ -157,27 +164,41 @@ export class SessionBuffer extends BaseBuffer { } /** - * Remove a closed session from Redis. Called by `createSessionEnd` after - * the session_end event has been emitted, or by the reaper when it - * sweeps stale sessions. + * Remove a closed session from Redis. Atomically gated on `sessionId` via + * Lua: if a different session now occupies the `(projectId, deviceId)` + * slot (because a boundary already overwrote it with a new session), the + * call is a no-op — the new session still needs its blob and sorted-set + * entry. + * + * Blobs no longer have a Redis TTL — this is the only path that removes + * them. The daily vacuum cron is a backstop for the rare case where this + * fails (worker crash, network partition). */ async cleanup({ projectId, deviceId, + sessionId, profileId, }: { projectId: string; deviceId: string; + sessionId: string; profileId?: string | null; }): Promise { - const multi = this.redis.multi(); - multi.del(sessionKey(projectId, deviceId)); - multi.zrem(activeSetKey(projectId), deviceId); - multi.zrem(wallclockSetKey(projectId), deviceId); - if (profileId && profileId !== deviceId) { - multi.del(profileIndexKey(projectId, profileId)); - } - await multi.exec(); + const pointerKey = + profileId && profileId !== deviceId + ? profileIndexKey(projectId, profileId) + : ''; + + await this.redis.eval( + CLEANUP_LUA, + 3, + sessionKey(projectId, deviceId), + wallclockSetKey(projectId), + pointerKey, + sessionId, + deviceId + ); } /** @@ -303,8 +324,13 @@ export class SessionBuffer extends BaseBuffer { } /** - * Atomic write-back: session blob + active/wallclock ZSETs + projects SET + - * HWM advance + profile index + ClickHouse buffer rows + counter. + * Atomic write-back: session blob + wallclock ZSET + projects SET + + * profile index + ClickHouse buffer rows + counter. + * + * No TTLs on the blob or profile index — they are removed exclusively by + * `cleanup()` after `session_end` emission. This guarantees the reaper + * can always find a session's state regardless of how long the worker + * has been down. */ private async persist( current: IClickhouseSession, @@ -312,29 +338,15 @@ export class SessionBuffer extends BaseBuffer { ) { const projectId = current.project_id; const deviceId = current.device_id; - const eventTimeMs = fromClickhouseDate(current.ended_at).getTime(); const wallClockMs = Date.now(); const multi = this.redis.multi(); - multi.set( - sessionKey(projectId, deviceId), - JSON.stringify(current), - 'EX', - SESSION_BLOB_TTL_SECONDS - ); - multi.zadd(activeSetKey(projectId), eventTimeMs.toString(), deviceId); + multi.set(sessionKey(projectId, deviceId), JSON.stringify(current)); multi.zadd(wallclockSetKey(projectId), wallClockMs.toString(), deviceId); - multi.expire(wallclockSetKey(projectId), SESSION_WALLCLOCK_TTL_SECONDS); multi.sadd(PROJECTS_SET_KEY, projectId); - multi.eval(MONOTONIC_HWM_LUA, 1, hwmKey(projectId), eventTimeMs.toString()); if (current.profile_id && current.profile_id !== current.device_id) { - multi.set( - profileIndexKey(projectId, current.profile_id), - deviceId, - 'EX', - SESSION_BLOB_TTL_SECONDS - ); + multi.set(profileIndexKey(projectId, current.profile_id), deviceId); } for (const row of chRows) { diff --git a/packages/queue/src/queues.ts b/packages/queue/src/queues.ts index 2a78c0a3e..9dfd096f4 100644 --- a/packages/queue/src/queues.ts +++ b/packages/queue/src/queues.ts @@ -1,5 +1,6 @@ import { createHash } from 'node:crypto'; import type { + IClickhouseSession, IServiceCreateEventPayload, IServiceEvent, Prisma, @@ -76,6 +77,11 @@ export interface EventsQueuePayloadCreateEvent { export interface EventsQueuePayloadCreateSessionEnd { type: 'createSessionEnd'; payload: IServiceCreateEventPayload; + // Snapshot of the session at the moment the close was decided. Used as a + // fallback when the live Redis blob has expired by the time the job runs, + // and to detect post-enqueue extensions (so we don't close a session that + // received more events in the meantime). + snapshot: IClickhouseSession; } // TODO: Rename `EventsQueuePayloadCreateSessionEnd` @@ -142,6 +148,10 @@ export type CronQueuePayloadSessionReaper = { type: 'sessionReaper'; payload: undefined; }; +export type CronQueuePayloadSessionVacuum = { + type: 'sessionVacuum'; + payload: undefined; +}; export type CronQueuePayload = | CronQueuePayloadSalt | CronQueuePayloadFlushEvents @@ -156,7 +166,8 @@ export type CronQueuePayload = | CronQueuePayloadOnboarding | CronQueuePayloadGscSync | CronQueuePayloadCohortRefresh - | CronQueuePayloadSessionReaper; + | CronQueuePayloadSessionReaper + | CronQueuePayloadSessionVacuum; export type MiscQueuePayloadTrialEndingSoon = { type: 'trialEndingSoon'; From 21626d0c9d00586f0b2da13cce843eef20e54501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Wed, 20 May 2026 15:46:37 +0200 Subject: [PATCH 3/5] wip --- apps/api/src/controllers/track.controller.ts | 16 +- apps/api/src/utils/ids.ts | 37 +- docker-compose.yml | 12 + packages/db/scripts/check-sessions.ts | 424 ++++++++++++++++++ .../db/src/buffers/session-buffer.test.ts | 10 +- packages/db/src/buffers/session-buffer.ts | 6 +- 6 files changed, 494 insertions(+), 11 deletions(-) create mode 100644 packages/db/scripts/check-sessions.ts diff --git a/apps/api/src/controllers/track.controller.ts b/apps/api/src/controllers/track.controller.ts index 1c0c743c2..3b2c14615 100644 --- a/apps/api/src/controllers/track.controller.ts +++ b/apps/api/src/controllers/track.controller.ts @@ -1,10 +1,12 @@ import { generateId } from '@openpanel/common'; import { generateDeviceId, parseUserAgent } from '@openpanel/common/server'; import { + convertClickhouseDateToJs, getProfileById, getSalts, groupBuffer, replayBuffer, + SESSION_TIMEOUT_MS, sessionBuffer, upsertProfile, } from '@openpanel/db'; @@ -182,6 +184,7 @@ async function buildContext( ua, salts, overrideDeviceId, + eventTimeMs: timestamp.timestamp, }); return { @@ -497,7 +500,16 @@ export async function fetchDeviceId( }), ]); - if (current) { + // Blob has no TTL — only treat the session as "current" if its last + // event is within the idle window. Otherwise the SDK should ask the + // server to start a fresh session id on the next event. + const now = Date.now(); + const isLive = (s: typeof current) => + !!s && + now - convertClickhouseDateToJs(s.ended_at).getTime() < + SESSION_TIMEOUT_MS; + + if (current && isLive(current)) { return reply.status(200).send({ deviceId: currentDeviceId, sessionId: current.id, @@ -505,7 +517,7 @@ export async function fetchDeviceId( }); } - if (previous) { + if (previous && isLive(previous)) { return reply.status(200).send({ deviceId: previousDeviceId, sessionId: previous.id, diff --git a/apps/api/src/utils/ids.ts b/apps/api/src/utils/ids.ts index 7ea035519..dfc45551b 100644 --- a/apps/api/src/utils/ids.ts +++ b/apps/api/src/utils/ids.ts @@ -1,6 +1,11 @@ import crypto from 'node:crypto'; import { generateDeviceId } from '@openpanel/common/server'; -import { sessionBuffer } from '@openpanel/db'; +import { + convertClickhouseDateToJs, + SESSION_TIMEOUT_MS, + sessionBuffer, +} from '@openpanel/db'; +import type { IClickhouseSession } from '@openpanel/db'; export async function getDeviceId({ projectId, @@ -8,12 +13,16 @@ export async function getDeviceId({ ua, salts, overrideDeviceId, + eventTimeMs, }: { projectId: string; ip: string; ua: string | undefined; salts: { current: string; previous: string }; overrideDeviceId?: string; + /** Event timestamp (ms). Used to decide whether an existing session is + * still within its idle window. Defaults to `Date.now()`. */ + eventTimeMs?: number; }) { if (overrideDeviceId) { // Resolve a caller-supplied device id through the same path as internal ones @@ -46,6 +55,7 @@ export async function getDeviceId({ projectId, currentDeviceId, previousDeviceId, + eventTimeMs: eventTimeMs ?? Date.now(), }); } @@ -54,14 +64,34 @@ interface DeviceIdResult { sessionId: string; } +/** + * Returns true when an existing session is recent enough that the incoming + * event should EXTEND it rather than start a new session. + * + * Critical: blobs no longer have a Redis TTL (so the reaper can always find + * them), which means an existing session blob may linger past its idle + * window. We must NOT blindly reuse `existing.id` — if we did, the worker's + * boundary detection would open a "new" session with the same id as the + * closed one, breaking the id-based extension check in createSessionEnd. + */ +function withinIdleWindow( + session: IClickhouseSession, + eventTimeMs: number +): boolean { + const lastEventMs = convertClickhouseDateToJs(session.ended_at).getTime(); + return eventTimeMs - lastEventMs < SESSION_TIMEOUT_MS; +} + async function getInfoFromSession({ projectId, currentDeviceId, previousDeviceId, + eventTimeMs, }: { projectId: string; currentDeviceId: string; previousDeviceId: string; + eventTimeMs: number; }): Promise { try { const [current, previous] = await Promise.all([ @@ -75,10 +105,10 @@ async function getInfoFromSession({ }), ]); - if (current) { + if (current && withinIdleWindow(current, eventTimeMs)) { return { deviceId: currentDeviceId, sessionId: current.id }; } - if (previous) { + if (previous && withinIdleWindow(previous, eventTimeMs)) { return { deviceId: previousDeviceId, sessionId: previous.id }; } } catch (error) { @@ -90,6 +120,7 @@ async function getInfoFromSession({ sessionId: getSessionId({ projectId, deviceId: currentDeviceId, + eventMs: eventTimeMs, graceMs: 5 * 1000, windowMs: 1000 * 60 * 30, }), diff --git a/docker-compose.yml b/docker-compose.yml index c3468227f..189574789 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -61,3 +61,15 @@ services: ports: - "19092:19092" # Kafka API (external) - "9644:9644" # Admin API + + op-rp-console: + image: redpandadata/console:v3.7.2 + restart: always + depends_on: + - op-rp + environment: + KAFKA_BROKERS: op-rp:9092 + REDPANDA_ADMINAPI_ENABLED: "true" + REDPANDA_ADMINAPI_URLS: http://op-rp:9644 + ports: + - "8091:8080" diff --git a/packages/db/scripts/check-sessions.ts b/packages/db/scripts/check-sessions.ts new file mode 100644 index 000000000..1a1cc8d7a --- /dev/null +++ b/packages/db/scripts/check-sessions.ts @@ -0,0 +1,424 @@ +// Read-only health check for the session lifecycle pipeline. +// +// Inspects Redis state, the sessions BullMQ queue, both internal buffers, +// and ClickHouse — then reconciles them to surface anything stuck or +// drifting. Safe to run anytime; never mutates state. +// +// Usage: +// pnpm with-env tsx scripts/check-sessions.ts +// pnpm with-env tsx scripts/check-sessions.ts --verbose +// pnpm with-env tsx scripts/check-sessions.ts --project beatchurn-website +// pnpm with-env tsx scripts/check-sessions.ts --hours 48 + +import { getSafeJson } from '@openpanel/json'; +import { sessionsQueue } from '@openpanel/queue'; +import { getRedisCache } from '@openpanel/redis'; +import { ch } from '../src/clickhouse/client'; +import type { IClickhouseSession } from '../src/services/session.service'; + +const DEADMAN_MS = Number.parseInt( + process.env.SESSION_REAPER_WALLCLOCK_DEADMAN_MS || String(30 * 60 * 1000), + 10 +); +const REAPER_INTERVAL_MS = 5 * 60 * 1000; +// "Eligible" = past the deadman, reaper is expected to close shortly. +// "Stuck" = past 3x reaper interval beyond the deadman — reaper is behind. +const STUCK_GRACE_MS = REAPER_INTERVAL_MS * 3; +const STUCK_CUTOFF_MS = DEADMAN_MS + STUCK_GRACE_MS; +const SAMPLE_SIZE_PER_PROJECT = 100; + +type ProjectStat = { + project: string; + active: number; + eligible: number; + stuck: number; + orphans: number; + oldestAgeMin: number; +}; + +type ChCounts = { + starts1h: number; + ends1h: number; + starts24h: number; + ends24h: number; +}; + +function fmt(n: number, w: number) { + return String(n).padStart(w); +} + +function header(title: string) { + console.log(''); + console.log('─'.repeat(70)); + console.log(title); + console.log('─'.repeat(70)); +} + +async function inspectRedis(projects: string[]): Promise<{ + total: number; + totalEligible: number; + totalStuck: number; + totalOrphans: number; + stats: ProjectStat[]; +}> { + const redis = getRedisCache(); + const now = Date.now(); + const eligibleCutoff = now - DEADMAN_MS; + const stuckCutoff = now - STUCK_CUTOFF_MS; + + const stats: ProjectStat[] = []; + let total = 0; + let totalEligible = 0; + let totalStuck = 0; + let totalOrphans = 0; + + for (const project of projects) { + const wallclockKey = `session:wallclock:${project}`; + + const [active, eligible, stuck, oldestEntry] = await Promise.all([ + redis.zcard(wallclockKey), + redis.zcount(wallclockKey, '-inf', eligibleCutoff), + redis.zcount(wallclockKey, '-inf', stuckCutoff), + redis.zrange(wallclockKey, 0, 0, 'WITHSCORES'), + ]); + + const oldestAgeMin = + oldestEntry.length === 2 ? (now - Number(oldestEntry[1])) / 60_000 : 0; + + // Sample N oldest entries and verify their blob exists. + const sampleSize = Math.min(SAMPLE_SIZE_PER_PROJECT, active); + let orphans = 0; + if (sampleSize > 0) { + const sample = await redis.zrange(wallclockKey, 0, sampleSize - 1); + const multi = redis.multi(); + for (const did of sample) { + multi.exists(`session:${project}:${did}`); + } + const results = await multi.exec(); + for (const entry of results ?? []) { + if (Number(entry?.[1] ?? 0) === 0) orphans++; + } + } + + stats.push({ project, active, eligible, stuck, orphans, oldestAgeMin }); + total += active; + totalEligible += eligible; + totalStuck += stuck; + totalOrphans += orphans; + } + + return { total, totalEligible, totalStuck, totalOrphans, stats }; +} + +async function inspectQueue() { + const [waiting, delayed, active, failed, completed] = await Promise.all([ + sessionsQueue.getWaitingCount(), + sessionsQueue.getDelayedCount(), + sessionsQueue.getActiveCount(), + sessionsQueue.getFailedCount(), + sessionsQueue.getCompletedCount(), + ]); + + const failedSample = failed > 0 ? await sessionsQueue.getFailed(0, 4) : []; + + return { waiting, delayed, active, failed, completed, failedSample }; +} + +async function inspectBuffers() { + const redis = getRedisCache(); + const [eventCounter, sessionCounter, eventListLen, sessionListLen] = + await Promise.all([ + redis.get('buffer-counter:event'), + redis.get('buffer-counter:session'), + redis.llen('event-buffer'), + redis.llen('session-buffer'), + ]); + + return { + event: { + counter: Number(eventCounter ?? 0), + listLen: Number(eventListLen ?? 0), + }, + session: { + counter: Number(sessionCounter ?? 0), + listLen: Number(sessionListLen ?? 0), + }, + }; +} + +async function inspectClickhouse( + projects: string[] | null, + hours: number +): Promise> { + const projectClause = projects?.length + ? `AND project_id IN (${projects.map((p) => `'${p.replace(/'/g, "''")}'`).join(',')})` + : ''; + const query = ` + SELECT + project_id, + name, + sum(if(created_at > now() - INTERVAL 1 HOUR, 1, 0)) AS h1, + sum(if(created_at > now() - INTERVAL ${hours} HOUR, 1, 0)) AS hN + FROM events + WHERE name IN ('session_start', 'session_end') + AND created_at > now() - INTERVAL ${hours} HOUR + ${projectClause} + GROUP BY project_id, name + `; + const res = await ch.query({ query, format: 'JSONEachRow' }); + const rows = await res.json<{ + project_id: string; + name: string; + h1: string; + hN: string; + }>(); + + const byProject = new Map(); + for (const r of rows) { + const cur = byProject.get(r.project_id) ?? { + starts1h: 0, + ends1h: 0, + starts24h: 0, + ends24h: 0, + }; + if (r.name === 'session_start') { + cur.starts1h = Number(r.h1); + cur.starts24h = Number(r.hN); + } else { + cur.ends1h = Number(r.h1); + cur.ends24h = Number(r.hN); + } + byProject.set(r.project_id, cur); + } + return byProject; +} + +async function inspectSessionEndClaims() { + const redis = getRedisCache(); + let cursor = '0'; + let count = 0; + do { + const [next, batch] = await redis.scan( + cursor, + 'MATCH', + 'session:end:emitted:*', + 'COUNT', + 1000 + ); + cursor = next; + count += batch.length; + } while (cursor !== '0' && count < 100_000); + return count; +} + +async function sampleOldestSession( + projectId: string, + deviceId: string +): Promise { + const blob = await getRedisCache().get(`session:${projectId}:${deviceId}`); + return blob ? getSafeJson(blob) : null; +} + +async function main() { + const args = process.argv.slice(2); + const verbose = args.includes('--verbose'); + const projectIdx = args.indexOf('--project'); + const projectFilter = projectIdx >= 0 ? args[projectIdx + 1] : null; + const hoursIdx = args.indexOf('--hours'); + const hours = hoursIdx >= 0 ? Number(args[hoursIdx + 1]) : 24; + + const redis = getRedisCache(); + + console.log('═'.repeat(70)); + console.log('SESSION LIFECYCLE HEALTH CHECK'); + console.log(`time: ${new Date().toISOString()}`); + console.log(`deadman: ${DEADMAN_MS / 60_000} min`); + console.log(`stuck cutoff: ${STUCK_CUTOFF_MS / 60_000} min (deadman + 3 reaper ticks)`); + console.log(`ch window: ${hours}h`); + if (projectFilter) console.log(`project: ${projectFilter}`); + console.log('═'.repeat(70)); + + // 1. Redis inventory + const allProjects = await redis.smembers('session:projects'); + const projects = projectFilter + ? [projectFilter] + : allProjects.slice().sort(); + + header('REDIS — active sessions'); + console.log(` ${allProjects.length} projects in session:projects`); + + const redisInfo = await inspectRedis(projects); + console.log(` ${redisInfo.total} total active sessions`); + console.log(` ${redisInfo.totalEligible} past deadman (expected to reap soon)`); + console.log( + ` ${redisInfo.totalStuck} past deadman + ${STUCK_GRACE_MS / 60_000}min (stuck — reaper is behind)` + ); + console.log( + ` ${redisInfo.totalOrphans} wallclock entries with no blob (in sampled ${SAMPLE_SIZE_PER_PROJECT}/project)` + ); + + if (verbose && redisInfo.stats.length > 0) { + console.log(''); + console.log( + ` ${'project'.padEnd(40)} ${'active'.padStart(8)} ${'eligible'.padStart(10)} ${'stuck'.padStart(8)} ${'orphans'.padStart(8)} ${'oldest'.padStart(10)}` + ); + for (const s of redisInfo.stats.sort((a, b) => b.active - a.active)) { + console.log( + ` ${s.project.padEnd(40)} ${fmt(s.active, 8)} ${fmt(s.eligible, 10)} ${fmt(s.stuck, 8)} ${fmt(s.orphans, 8)} ${s.oldestAgeMin.toFixed(1).padStart(8)}m` + ); + } + } + + // 2. Sessions queue + header('SESSIONS QUEUE — createSessionEnd jobs'); + const queue = await inspectQueue(); + console.log(` waiting: ${queue.waiting}`); + console.log(` delayed: ${queue.delayed}`); + console.log(` active: ${queue.active}`); + console.log(` failed: ${queue.failed}`); + console.log(` completed: ${queue.completed} (removed-on-complete)`); + if (queue.failedSample.length > 0) { + console.log(''); + console.log(' Latest failed jobs:'); + for (const job of queue.failedSample) { + const when = new Date(job.timestamp).toISOString(); + const reason = (job.failedReason ?? '').slice(0, 200); + console.log(` - [${when}] ${job.id}`); + console.log(` ${reason}`); + } + } + + // 3. Buffers + header('BUFFERS — pending writes to ClickHouse'); + const buffers = await inspectBuffers(); + console.log( + ` event buffer: counter=${buffers.event.counter} list=${buffers.event.listLen}` + ); + console.log( + ` session buffer: counter=${buffers.session.counter} list=${buffers.session.listLen}` + ); + + // 4. Dedup claims + header('IDEMPOTENCY — session:end:emitted:* claim keys'); + const claimCount = await inspectSessionEndClaims(); + console.log( + ` ${claimCount} active claim keys (TTL 2h; bounds at ~last 2h of closes)` + ); + + // 5. ClickHouse reconciliation + header(`CLICKHOUSE — session_start vs session_end (last ${hours}h)`); + const chCounts = await inspectClickhouse( + projectFilter ? [projectFilter] : null, + hours + ); + + let totalStarts1h = 0; + let totalEnds1h = 0; + let totalStartsN = 0; + let totalEndsN = 0; + + console.log(''); + console.log( + ` ${'project'.padEnd(40)} ${'start 1h'.padStart(9)} ${'end 1h'.padStart(8)} ${'start ' + hours + 'h'}`.padEnd(58) + + `${('end ' + hours + 'h').padStart(8)} ${'open'.padStart(8)}` + ); + for (const [project, c] of [...chCounts.entries()].sort()) { + const open = c.starts24h - c.ends24h; + console.log( + ` ${project.padEnd(40)} ${fmt(c.starts1h, 9)} ${fmt(c.ends1h, 8)} ${fmt(c.starts24h, 10)} ${fmt(c.ends24h, 8)} ${fmt(open, 8)}` + ); + totalStarts1h += c.starts1h; + totalEnds1h += c.ends1h; + totalStartsN += c.starts24h; + totalEndsN += c.ends24h; + } + const openInCh = totalStartsN - totalEndsN; + console.log(''); + console.log( + ` ${'TOTAL'.padEnd(40)} ${fmt(totalStarts1h, 9)} ${fmt(totalEnds1h, 8)} ${fmt(totalStartsN, 10)} ${fmt(totalEndsN, 8)} ${fmt(openInCh, 8)}` + ); + + // 6. Reconciliation + header('RECONCILIATION — Redis active vs CH open'); + const diff = redisInfo.total - openInCh; + const diffPct = + openInCh > 0 ? ((Math.abs(diff) / openInCh) * 100).toFixed(1) : '∞'; + console.log(` Redis active sessions: ${redisInfo.total}`); + console.log(` CH starts - ends in ${hours}h window: ${openInCh}`); + console.log(` Diff: ${diff} (${diffPct}%)`); + console.log( + ` Note: diff > 0 means some "open" sessions in Redis were started before the ${hours}h window.` + ); + + // 7. Sample the oldest session in Redis if anything looks suspicious + if (verbose && redisInfo.stats.some((s) => s.oldestAgeMin > 60 * 6)) { + header('SAMPLE — oldest session per project (any > 6h)'); + for (const s of redisInfo.stats.filter((x) => x.oldestAgeMin > 60 * 6)) { + const wallclockKey = `session:wallclock:${s.project}`; + const [oldest] = await redis.zrange(wallclockKey, 0, 0); + if (oldest) { + const blob = await sampleOldestSession(s.project, oldest); + if (blob) { + console.log(` ${s.project} / ${oldest}`); + console.log(` id: ${blob.id}`); + console.log(` created_at: ${blob.created_at}`); + console.log(` ended_at: ${blob.ended_at}`); + console.log(` event_count: ${blob.event_count}`); + } + } + } + } + + // 8. Summary + header('SUMMARY'); + const issues: string[] = []; + if (redisInfo.totalStuck > 0) { + issues.push( + `${redisInfo.totalStuck} sessions past the stuck cutoff — reaper may be unhealthy` + ); + } + if (redisInfo.totalOrphans > 0) { + issues.push( + `${redisInfo.totalOrphans} orphan wallclock entries — cleanup() raced or vacuum is overdue` + ); + } + if (queue.failed > 0) { + issues.push(`${queue.failed} failed session_end jobs`); + } + if (queue.waiting + queue.active > 5000) { + issues.push( + `session_end backlog (waiting=${queue.waiting} active=${queue.active})` + ); + } + if (buffers.event.listLen > 50_000) { + issues.push(`event buffer is backed up (${buffers.event.listLen} items)`); + } + if (buffers.session.listLen > 50_000) { + issues.push( + `session buffer is backed up (${buffers.session.listLen} items)` + ); + } + // The Redis-vs-CH diff is INFORMATIONAL, not an issue on its own: + // diff > 0: more in Redis than CH's window — older sessions still active + // diff < 0: more starts in CH than Redis active — legacy starts without ends + // (e.g. from before a bug fix) that will never close + // Only flag if both reaper is behind AND the diff is growing — which we can't + // know from a single run. So just print it; the reaper-behind and stuck + // counters above will catch live issues. + + if (issues.length === 0) { + console.log(' [OK] No issues detected.'); + } else { + console.log(' [WARN] Issues:'); + for (const i of issues) console.log(` - ${i}`); + } + + await redis.quit(); + await ch.close?.(); + process.exit(issues.length > 0 ? 1 : 0); +} + +main().catch((e) => { + console.error(e); + process.exit(1); +}); diff --git a/packages/db/src/buffers/session-buffer.test.ts b/packages/db/src/buffers/session-buffer.test.ts index 282c84f99..e72a26d17 100644 --- a/packages/db/src/buffers/session-buffer.test.ts +++ b/packages/db/src/buffers/session-buffer.test.ts @@ -61,8 +61,10 @@ beforeEach(async () => { ...(await redis.keys('session*')), ...(await redis.keys('lock:session')), ]; - if (keys.length > 0) { - await redis.del(...keys); + // Spread breaks at ~100k args — del in chunks so old test state can't kill us. + const BATCH = 1000; + for (let i = 0; i < keys.length; i += BATCH) { + await redis.del(...keys.slice(i, i + BATCH)); } vi.mocked(ch.insert).mockResolvedValue(undefined as any); }); @@ -110,9 +112,7 @@ describe('SessionBuffer', () => { // Blob and profile pointer must survive arbitrarily long reaper outages, // so neither carries a Redis TTL. expect(await redis.ttl(`session:${projectId}:device-Y`)).toBe(-1); - expect( - await redis.ttl(`session:profile:${projectId}:profile-X`) - ).toBe(-1); + expect(await redis.ttl(`session:profile:${projectId}:profile-X`)).toBe(-1); }); it('writes profile→device pointer when profile_id differs from device_id', async () => { diff --git a/packages/db/src/buffers/session-buffer.ts b/packages/db/src/buffers/session-buffer.ts index 3eeea0b33..54b0ffc4f 100644 --- a/packages/db/src/buffers/session-buffer.ts +++ b/packages/db/src/buffers/session-buffer.ts @@ -281,11 +281,15 @@ export class SessionBuffer extends BaseBuffer { */ private newSession(payload: IServiceCreateEventPayload): IClickhouseSession { const createdAt = toClickhouseDate(payload.createdAt); + // For anonymous traffic the SDK doesn't send a profileId — fall back to + // the deviceId so each anonymous device shows up as a unique visitor in + // the dashboard. Matches what `createEvent` does for the events table. + const profileId = payload.profileId || payload.deviceId; return { id: payload.sessionId, project_id: payload.projectId, device_id: payload.deviceId, - profile_id: payload.profileId ?? '', + profile_id: profileId, is_bounce: true, created_at: createdAt, ended_at: createdAt, From 9e9a135578b88bdf8643371e9b01184035673cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Tue, 9 Jun 2026 10:49:35 +0200 Subject: [PATCH 4/5] final things --- apps/api/e2e/README.md | 54 ++++ apps/api/e2e/lib.ts | 231 +++++++++++++++ apps/api/e2e/session-e2e.ts | 268 ++++++++++++++++++ apps/api/e2e/session-stress.ts | 235 +++++++++++++++ apps/api/package.json | 2 + apps/api/src/utils/ids.test.ts | 146 +++++----- apps/api/src/utils/ids.ts | 70 +++-- apps/worker/src/boot-debug.ts | 2 + .../src/jobs/cron.session-reaper.test.ts | 119 ++++++++ .../src/jobs/cron.session-vacuum.test.ts | 90 ++++++ apps/worker/src/jobs/cron.ts | 2 +- .../jobs/events.create-session-end.test.ts | 153 ++++++++++ .../src/jobs/events.incoming-events.test.ts | 4 + apps/worker/src/metrics.ts | 13 +- packages/db/src/buffers/base-buffer.ts | 6 +- .../db/src/buffers/session-buffer.test.ts | 42 +++ packages/db/src/buffers/session-buffer.ts | 139 +-------- 17 files changed, 1332 insertions(+), 244 deletions(-) create mode 100644 apps/api/e2e/README.md create mode 100644 apps/api/e2e/lib.ts create mode 100644 apps/api/e2e/session-e2e.ts create mode 100644 apps/api/e2e/session-stress.ts create mode 100644 apps/worker/src/jobs/cron.session-reaper.test.ts create mode 100644 apps/worker/src/jobs/cron.session-vacuum.test.ts create mode 100644 apps/worker/src/jobs/events.create-session-end.test.ts diff --git a/apps/api/e2e/README.md b/apps/api/e2e/README.md new file mode 100644 index 000000000..8487466e1 --- /dev/null +++ b/apps/api/e2e/README.md @@ -0,0 +1,54 @@ +# Session E2E + +Two harnesses over a shared foundation (`lib.ts`), both driving the **real stack** +over HTTP and asserting state in **both ClickHouse and Redis**: + +- `session-e2e.ts` (`e2e:sessions`) — **correctness**: the full lifecycle for one + session per scenario (open/extend/close via reaper + boundary, replay, identify), + including Redis cleanup. +- `session-stress.ts` (`e2e:sessions:stress`) — **volume + drain**: ramps out many + sessions, then drives reaper + buffer flushes until *everything* has drained + (every `session_end` emitted, Redis cleaned, session buffer empty) and reconciles + the ClickHouse counts. Exits only when nothing is left open. + +## What it covers + +| Scenario | Asserts | +|----------|---------| +| Single session → reaper close | session blob + wallclock + projects-set in Redis; one `session_start` + N events in CH; after the reaper closes it: one `session_end`, a collapsed `sessions` row, and Redis fully cleaned (blob + wallclock gone, idempotency claim present). | +| Boundary split | a >idle-window gap opens a NEW session id and emits a `session_end` for the first + a `session_start` for the second. | +| Replay | a replay chunk lands in `session_replay_chunks` under the echoed session id. | +| Identify | `session:profile:{pid}:{profileId}` pointer written; events carry the identified `profile_id`. | + +## Running + +Sessions idle out after 30 min by default. Shrink that and start the stack with +the **same** value the harness uses, then run the harness: + +```bash +# 1. Start the stack with a short idle window (Docker must be up: pnpm dock:up) +SESSION_TIMEOUT_MS=4000 pnpm dev + +# 2. In another terminal, run the harness with the SAME timeout +SESSION_TIMEOUT_MS=4000 pnpm --filter @openpanel/api e2e:sessions + +# …or the stress + drain test (500 sessions by default) +SESSION_TIMEOUT_MS=4000 pnpm --filter @openpanel/api e2e:sessions:stress +``` + +Stress tunables (env): `E2E_SESSIONS` (500), `E2E_CONCURRENCY` (25), +`E2E_EVENTS_PER_SESSION` (3), `E2E_DRAIN_TIMEOUT_MS` (120000). + +It exits non-zero if any check fails and prints a summary. Total run is ~30–60s +with a 4s window (each close waits roughly one idle window). + +The harness triggers the reaper on demand via the worker's `/debug/cron` +endpoint, so it never waits for the 5-minute reaper cron. + +### Notes +- Uses a dedicated, isolated project (`e2e-sessions`) and a throwaway client + (`ignoreCorsAndSecret`), created/upserted automatically under org `openpanel-dev`. +- Each run uses fresh device IPs, so reruns don't collide with prior state. +- Overridable: `E2E_API_URL` (default `:3333`), `E2E_WORKER_URL` (default `:9999`). +- The harness and the stack **must share the same `SESSION_TIMEOUT_MS`** — the + harness derives its idle waits from it. diff --git a/apps/api/e2e/lib.ts b/apps/api/e2e/lib.ts new file mode 100644 index 000000000..42378e5a0 --- /dev/null +++ b/apps/api/e2e/lib.ts @@ -0,0 +1,231 @@ +/** + * Shared building blocks for the session E2E + stress harnesses: + * config, an HTTP track client, the reaper trigger, Redis/ClickHouse helpers, + * fixtures, polling, and a tiny check/report framework. + */ + +import { ClientType, chQuery, db, getClientByIdCached } from '@openpanel/db'; +import { getRedisCache } from '@openpanel/redis'; + +// ── Config ────────────────────────────────────────────────────────────────── +export const API_URL = process.env.E2E_API_URL || 'http://localhost:3333'; +export const WORKER_URL = process.env.E2E_WORKER_URL || 'http://localhost:9999'; +export const ORG_ID = 'openpanel-dev'; +export const PROJECT_ID = 'e2e-sessions'; +export const CLIENT_ID = 'e2e1e2e1-0000-4000-8000-000000000001'; +export const UA = + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36'; + +export const SESSION_TIMEOUT_MS = Number.parseInt( + process.env.SESSION_TIMEOUT_MS || String(1000 * 60 * 30), + 10 +); +/** How long to wait for a session to fall outside its idle window before closing. */ +export const IDLE_WAIT_MS = SESSION_TIMEOUT_MS + 2000; + +export const redis = getRedisCache(); +export const runId = Date.now(); + +export { chQuery }; + +// ── Redis key helpers ─────────────────────────────────────────────────────── +export const sessionKey = (deviceId: string) => `session:${PROJECT_ID}:${deviceId}`; +export const wallclockKey = `session:wallclock:${PROJECT_ID}`; +export const profileKey = (profileId: string) => + `session:profile:${PROJECT_ID}:${profileId}`; +export const claimKey = (deviceId: string, sessionId: string) => + `session:end:emitted:${PROJECT_ID}:${deviceId}:${sessionId}`; +/** The session-buffer's Redis list (ground-truth pending CH rows). */ +export const SESSION_BUFFER_LIST = 'session-buffer'; + +export async function getBlob(deviceId: string) { + const raw = await redis.get(sessionKey(deviceId)); + return raw ? JSON.parse(raw) : null; +} + +// ── Timing ────────────────────────────────────────────────────────────────── +export const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +/** Poll `fn` until it returns a truthy value or the timeout elapses. */ +export async function pollUntil( + fn: () => Promise, + { timeoutMs = 30_000, intervalMs = 750 } = {} +): Promise { + const deadline = Date.now() + timeoutMs; + // biome-ignore lint/nursery/noConstantCondition: poll loop + while (true) { + const value = await fn(); + if (value) return value; + if (Date.now() >= deadline) return null; + await sleep(intervalMs); + } +} + +// ── Report framework ────────────────────────────────────────────────────── +type Result = { scenario: string; name: string; ok: boolean; detail?: string }; +const results: Result[] = []; +let currentScenario = 'setup'; + +export function scenario(name: string) { + currentScenario = name; + console.log(`\n▶ ${name}`); +} +export function check(name: string, ok: boolean, detail?: string) { + results.push({ scenario: currentScenario, name, ok, detail }); + console.log(` ${ok ? '✓' : '✗'} ${name}${detail && !ok ? ` — ${detail}` : ''}`); +} +/** Print the summary and return the number of failed checks. */ +export function summarize(): number { + const failed = results.filter((r) => !r.ok); + console.log(`\n${'─'.repeat(60)}`); + console.log(`${results.length - failed.length}/${results.length} checks passed`); + if (failed.length) { + console.log('\nFailures:'); + for (const f of failed) { + console.log(` ✗ [${f.scenario}] ${f.name}${f.detail ? ` — ${f.detail}` : ''}`); + } + } + return failed.length; +} + +// ── HTTP ──────────────────────────────────────────────────────────────────── +export type TrackResponse = { deviceId: string; sessionId: string }; + +export async function track(body: unknown, ip: string): Promise { + const res = await fetch(`${API_URL}/track`, { + method: 'POST', + headers: { + 'content-type': 'application/json', + 'openpanel-client-id': CLIENT_ID, + 'user-agent': UA, + 'x-client-ip': ip, + }, + body: JSON.stringify(body), + }); + if (!res.ok) { + throw new Error(`POST /track ${res.status}: ${await res.text()}`); + } + return (await res.json()) as TrackResponse; +} + +export const screenView = ( + ip: string, + path: string, + extra: Record = {} +) => + track( + { + type: 'track', + payload: { + name: 'screen_view', + properties: { __path: `https://e2e.test${path}`, __ip: ip, ...extra }, + }, + }, + ip + ); + +/** Run a worker cron on demand via the local /debug/cron endpoint. */ +export async function triggerCron(type: string) { + const res = await fetch(`${WORKER_URL}/debug/cron/${type}`, { + method: 'POST', + headers: { accept: 'application/json' }, + }); + if (!res.ok) { + throw new Error(`trigger cron ${type} ${res.status}: ${await res.text()}`); + } +} + +export const triggerReaper = () => triggerCron('sessionReaper'); + +// ── ClickHouse counting (scoped to a set of session ids for run isolation) ── +const quoteList = (ids: string[]) => + ids.map((id) => `'${id.replace(/'/g, "")}'`).join(','); + +export async function countByName( + sessionIds: string[], + name: string +): Promise { + if (sessionIds.length === 0) return 0; + const rows = await chQuery<{ c: string }>( + `SELECT count() AS c FROM events WHERE project_id = '${PROJECT_ID}' AND name = '${name}' AND session_id IN (${quoteList(sessionIds)})` + ); + return Number(rows[0]?.c ?? 0); +} + +// ── Fixtures ──────────────────────────────────────────────────────────────── +export async function ensureFixtures() { + scenario('setup: project + client'); + try { + await db.organization.upsert({ + where: { id: ORG_ID }, + create: { id: ORG_ID, name: 'OpenPanel Dev' }, + update: {}, + }); + await db.project.upsert({ + where: { id: PROJECT_ID }, + create: { id: PROJECT_ID, name: 'E2E Sessions', organizationId: ORG_ID }, + update: {}, + }); + await db.client.upsert({ + where: { id: CLIENT_ID }, + create: { + id: CLIENT_ID, + name: 'e2e', + organizationId: ORG_ID, + projectId: PROJECT_ID, + type: ClientType.write, + ignoreCorsAndSecret: true, + secret: null, + }, + update: { ignoreCorsAndSecret: true, projectId: PROJECT_ID }, + }); + await getClientByIdCached.clear(CLIENT_ID); + check('fixtures ready', true); + } catch (error) { + check('fixtures ready', false, (error as Error).message); + throw error; + } +} + +export async function preflight() { + scenario('preflight'); + const api = await fetch(`${API_URL}/`) + .then((r) => r.ok || r.status === 404) + .catch(() => false); + check(`api reachable at ${API_URL}`, !!api); + const worker = await fetch(`${WORKER_URL}/debug/cron`) + .then((r) => r.ok) + .catch(() => false); + check(`worker debug reachable at ${WORKER_URL}`, !!worker); + if (!api || !worker) { + throw new Error('Stack not reachable — start it with `pnpm dev` first.'); + } + if (SESSION_TIMEOUT_MS > 60_000) { + console.warn( + `\n⚠ SESSION_TIMEOUT_MS=${SESSION_TIMEOUT_MS}ms — this run will be slow.\n` + + ' Re-run the stack AND the harness with e.g. SESSION_TIMEOUT_MS=4000.' + ); + } +} + +/** Run `fn` over `items` with at most `concurrency` in flight. */ +export async function runPool( + items: T[], + concurrency: number, + fn: (item: T, index: number) => Promise +): Promise { + let next = 0; + const workers = Array.from({ length: Math.min(concurrency, items.length) }, async () => { + while (true) { + const i = next++; + if (i >= items.length) return; + await fn(items[i]!, i); + } + }); + await Promise.all(workers); +} + +export async function shutdown(failed: number): Promise { + await redis.quit().catch(() => {}); + process.exit(failed ? 1 : 0); +} diff --git a/apps/api/e2e/session-e2e.ts b/apps/api/e2e/session-e2e.ts new file mode 100644 index 000000000..204d8a40a --- /dev/null +++ b/apps/api/e2e/session-e2e.ts @@ -0,0 +1,268 @@ +/** + * Session correctness E2E. + * + * Drives the REAL stack over HTTP and asserts the resulting state in BOTH + * ClickHouse and Redis through the full lifecycle — open, extend, close (via + * reaper AND via boundary), replay, identify — including Redis cleanup. + * + * Run (shrink the idle window; start the stack with the SAME value): + * SESSION_TIMEOUT_MS=4000 pnpm dev + * SESSION_TIMEOUT_MS=4000 pnpm --filter @openpanel/api e2e:sessions + * + * For a high-volume drain test see session-stress.ts (`e2e:sessions:stress`). + */ + +import { + chQuery, + check, + countByName, + ensureFixtures, + getBlob, + IDLE_WAIT_MS, + PROJECT_ID, + pollUntil, + preflight, + redis, + runId, + scenario, + screenView, + SESSION_TIMEOUT_MS, + shutdown, + sleep, + summarize, + track, + triggerReaper, + wallclockKey, + claimKey, + sessionKey, + profileKey, + WORKER_URL, + API_URL, +} from './lib'; + +async function scenarioSingleSession() { + scenario('single session → reaper close → cleanup'); + const ip = `10.${(runId >> 16) & 255}.1.${runId & 255 || 1}`; + + const first = await screenView(ip, '/a'); + await screenView(ip, '/b'); + await screenView(ip, '/c'); + const { deviceId, sessionId } = first; + check('track returned a sessionId', !!sessionId, `got '${sessionId}'`); + + const blob = await pollUntil(async () => { + const b = await getBlob(deviceId); + return b?.id === sessionId ? b : null; + }); + check('redis: session blob exists with matching id', !!blob); + check( + 'redis: device in wallclock index', + (await redis.zscore(wallclockKey, deviceId)) !== null + ); + check( + 'redis: project in active set', + (await redis.sismember('session:projects', PROJECT_ID)) === 1 + ); + + const starts = await pollUntil(async () => { + const c = await countByName([sessionId], 'session_start'); + return c > 0 ? c : null; + }); + check('clickhouse: exactly one session_start', starts === 1, `got ${starts}`); + const views = await pollUntil(async () => { + const c = await countByName([sessionId], 'screen_view'); + return c >= 3 ? c : null; + }); + check('clickhouse: 3 screen_views ingested', !!views, `got ${views}`); + + console.log(` …waiting ${IDLE_WAIT_MS}ms for idle window, then reaping`); + await sleep(IDLE_WAIT_MS); + await triggerReaper(); + + const ends = await pollUntil(async () => { + const c = await countByName([sessionId], 'session_end'); + return c > 0 ? c : null; + }); + check('clickhouse: exactly one session_end', ends === 1, `got ${ends}`); + + const sessionRow = await pollUntil(async () => { + const rows = await chQuery<{ is_bounce: boolean }>( + `SELECT is_bounce FROM sessions FINAL WHERE project_id = '${PROJECT_ID}' AND id = '${sessionId}' LIMIT 1` + ); + return rows[0] ?? null; + }); + check('clickhouse: sessions row present (collapsed)', !!sessionRow); + check( + 'clickhouse: session not a bounce (3 screen_views)', + sessionRow?.is_bounce === false || Number(sessionRow?.is_bounce) === 0, + `is_bounce=${sessionRow?.is_bounce}` + ); + + const cleaned = await pollUntil(async () => + (await redis.get(sessionKey(deviceId))) === null ? true : null + ); + check('redis: session blob removed after close', !!cleaned); + check( + 'redis: wallclock entry removed after close', + (await redis.zscore(wallclockKey, deviceId)) === null + ); + check( + 'redis: session_end idempotency claim recorded', + (await redis.get(claimKey(deviceId, sessionId))) !== null + ); +} + +async function scenarioBoundary() { + scenario('boundary split → first session closes, second opens'); + const ip = `10.${(runId >> 16) & 255}.2.${runId & 255 || 1}`; + + const a = await screenView(ip, '/x'); + console.log(` …waiting ${IDLE_WAIT_MS}ms so the next event crosses the boundary`); + await sleep(IDLE_WAIT_MS); + const b = await screenView(ip, '/y'); + + check( + 'second event opened a NEW session id', + a.sessionId !== b.sessionId && !!b.sessionId, + `${a.sessionId} vs ${b.sessionId}` + ); + + const startsA = await pollUntil(async () => + (await countByName([a.sessionId], 'session_start')) > 0 ? true : null + ); + check('clickhouse: session_start for first session', !!startsA); + const startsB = await pollUntil(async () => + (await countByName([b.sessionId], 'session_start')) > 0 ? true : null + ); + check('clickhouse: session_start for second session', !!startsB); + const endsA = await pollUntil(async () => + (await countByName([a.sessionId], 'session_end')) > 0 ? true : null + ); + check('clickhouse: first session got a session_end', !!endsA); +} + +async function scenarioNonScreenViewFirst() { + scenario('session opened by a non-screen_view event'); + const ip = `10.${(runId >> 16) & 255}.5.${runId & 255 || 1}`; + + // First (and only) event is a custom event with no __path. + const { deviceId, sessionId } = await track( + { type: 'track', payload: { name: 'purchase', properties: { __ip: ip } } }, + ip + ); + check('track returned a sessionId for a non-screen_view first event', !!sessionId); + + const starts = await pollUntil(async () => + (await countByName([sessionId], 'session_start')) > 0 ? true : null + ); + check('clickhouse: session_start emitted for a custom-event session', !!starts); + + console.log(` …waiting ${IDLE_WAIT_MS}ms for idle window, then reaping`); + await sleep(IDLE_WAIT_MS); + await triggerReaper(); + + const ends = await pollUntil(async () => + (await countByName([sessionId], 'session_end')) > 0 ? true : null + ); + check('clickhouse: session_end emitted (closes normally)', !!ends); + + const row = await pollUntil(async () => { + const rows = await chQuery<{ + screen_view_count: number; + event_count: number; + is_bounce: boolean; + }>( + `SELECT screen_view_count, event_count, is_bounce FROM sessions FINAL WHERE project_id = '${PROJECT_ID}' AND id = '${sessionId}' LIMIT 1` + ); + return rows[0] ?? null; + }); + check('clickhouse: screen_view_count is 0', Number(row?.screen_view_count) === 0, `${row?.screen_view_count}`); + check('clickhouse: event_count is 1', Number(row?.event_count) === 1, `${row?.event_count}`); + check( + 'clickhouse: is_bounce is true (no pageviews)', + row?.is_bounce === true || Number(row?.is_bounce) === 1, + `is_bounce=${row?.is_bounce}` + ); + + const cleaned = await pollUntil(async () => + (await redis.get(sessionKey(deviceId))) === null ? true : null + ); + check('redis: session cleaned up after close', !!cleaned); +} + +async function scenarioReplay() { + scenario('replay chunk files under the echoed session id'); + const ip = `10.${(runId >> 16) & 255}.3.${runId & 255 || 1}`; + + const { sessionId } = await screenView(ip, '/replay'); + await track( + { + type: 'replay', + payload: { + sessionId, + chunk_index: 0, + events_count: 1, + is_full_snapshot: true, + started_at: new Date(runId).toISOString(), + ended_at: new Date(runId + 1000).toISOString(), + payload: '[]', + }, + }, + ip + ); + + const found = await pollUntil(async () => { + const rows = await chQuery<{ c: string }>( + `SELECT count() AS c FROM session_replay_chunks WHERE project_id = '${PROJECT_ID}' AND session_id = '${sessionId}'` + ); + return Number(rows[0]?.c ?? 0) > 0 ? true : null; + }); + check('clickhouse: replay chunk stored under the session id', !!found); +} + +async function scenarioIdentify() { + scenario('identified visit → profile index + profile_id on events'); + const ip = `10.${(runId >> 16) & 255}.4.${runId & 255 || 1}`; + const profileId = `e2e-user-${runId}`; + + const { deviceId, sessionId } = await screenView(ip, '/account'); + await track( + { + type: 'track', + payload: { name: 'signed_in', profileId, properties: { __ip: ip } }, + }, + ip + ); + + const ptr = await pollUntil(async () => + (await redis.get(profileKey(profileId))) === deviceId ? true : null + ); + check('redis: profile→device pointer written', !!ptr); + + const profiled = await pollUntil(async () => { + const rows = await chQuery<{ c: string }>( + `SELECT count() AS c FROM events WHERE project_id = '${PROJECT_ID}' AND session_id = '${sessionId}' AND profile_id = '${profileId}'` + ); + return Number(rows[0]?.c ?? 0) > 0 ? true : null; + }); + check('clickhouse: event carries the identified profile_id', !!profiled); +} + +async function main() { + console.log( + `Session E2E — api=${API_URL} worker=${WORKER_URL} timeout=${SESSION_TIMEOUT_MS}ms project=${PROJECT_ID}` + ); + await preflight(); + await ensureFixtures(); + await scenarioSingleSession(); + await scenarioBoundary(); + await scenarioNonScreenViewFirst(); + await scenarioReplay(); + await scenarioIdentify(); + await shutdown(summarize()); +} + +main().catch(async (error) => { + console.error('\nFATAL:', error); + await shutdown(1); +}); diff --git a/apps/api/e2e/session-stress.ts b/apps/api/e2e/session-stress.ts new file mode 100644 index 000000000..907a90d20 --- /dev/null +++ b/apps/api/e2e/session-stress.ts @@ -0,0 +1,235 @@ +/** + * Session stress + drain-to-completion. + * + * Ramps out many short sessions (default 500, not all at once), then drives the + * reaper + buffer flushes until EVERYTHING has fully drained — every session_end + * emitted, Redis cleaned, the session buffer empty — and reconciles the + * ClickHouse counts. Exits 0 only when nothing is left open. + * + * Run (shrink the idle window; start the stack with the SAME value): + * SESSION_TIMEOUT_MS=4000 pnpm dev + * SESSION_TIMEOUT_MS=4000 pnpm --filter @openpanel/api e2e:sessions:stress + * + * Tunables (env): E2E_SESSIONS (500), E2E_CONCURRENCY (25), + * E2E_EVENTS_PER_SESSION (3), E2E_DRAIN_TIMEOUT_MS (120000). + */ + +import { + API_URL, + check, + chQuery, + countByName, + ensureFixtures, + getBlob, + IDLE_WAIT_MS, + pollUntil, + preflight, + PROJECT_ID, + redis, + runId, + runPool, + scenario, + screenView, + SESSION_BUFFER_LIST, + SESSION_TIMEOUT_MS, + shutdown, + sleep, + summarize, + track, + triggerCron, + triggerReaper, + wallclockKey, + WORKER_URL, +} from './lib'; + +const SESSIONS = Number.parseInt(process.env.E2E_SESSIONS || '500', 10); +const CONCURRENCY = Number.parseInt(process.env.E2E_CONCURRENCY || '25', 10); +const EVENTS_PER_SESSION = Number.parseInt( + process.env.E2E_EVENTS_PER_SESSION || '3', + 10 +); +const DRAIN_TIMEOUT_MS = Number.parseInt( + process.env.E2E_DRAIN_TIMEOUT_MS || '120000', + 10 +); + +type Session = { sessionId: string; deviceId: string }; + +// Unique IP per session → unique device → unique session. Namespaced by runId +// so reruns and the correctness harness (10.x) never collide. +const ipForSession = (i: number) => + `100.${(runId >> 8) & 255}.${(i >> 8) & 255}.${i & 255}`; + +// A realistic-ish mix: events are NOT all screen_views. +const CUSTOM_EVENTS = [ + 'sign_up', + 'login', + 'add_to_cart', + 'purchase', + 'search', + 'button_click', + 'video_play', + 'share', +]; +const WORDS = ['alpha', 'beta', 'gamma', 'sverige', 'katt', 'planet']; +const pick = (arr: T[]): T => arr[Math.floor(Math.random() * arr.length)]!; + +function randomProps(ip: string): Record { + const props: Record = { __ip: ip }; + const n = Math.floor(Math.random() * 4); // 0–3 extra properties + for (let k = 0; k < n; k++) { + props[`prop_${pick(WORDS)}`] = + Math.random() < 0.5 ? pick(WORDS) : Math.floor(Math.random() * 1000); + } + return props; +} + +// One event in a session: the entry is always a screen_view; the rest are a +// 50/50 mix of more screen_views and random custom events. +function sendSessionEvent(ip: string, i: number, e: number) { + if (e === 0 || Math.random() < 0.5) { + return screenView(ip, `/p${i}/${e}`, randomProps(ip)); + } + return track( + { type: 'track', payload: { name: pick(CUSTOM_EVENTS), properties: randomProps(ip) } }, + ip + ); +} + +async function emit(): Promise { + scenario(`emit ${SESSIONS} sessions × ${EVENTS_PER_SESSION} events (concurrency ${CONCURRENCY})`); + const sessions: Session[] = []; + let done = 0; + let errors = 0; + const started = Date.now(); + + await runPool( + Array.from({ length: SESSIONS }, (_, i) => i), + CONCURRENCY, + async (i) => { + const ip = ipForSession(i); + try { + const first = await sendSessionEvent(ip, i, 0); // entry is a screen_view + for (let e = 1; e < EVENTS_PER_SESSION; e++) { + await sendSessionEvent(ip, i, e); + } + sessions.push({ sessionId: first.sessionId, deviceId: first.deviceId }); + } catch (error) { + errors++; + if (errors <= 3) console.warn(` session ${i} failed: ${(error as Error).message}`); + } + done++; + if (done % 100 === 0 || done === SESSIONS) { + console.log(` …emitted ${done}/${SESSIONS}`); + } + } + ); + + const uniq = new Set(sessions.map((s) => s.sessionId)).size; + check('all sessions sent without HTTP errors', errors === 0, `${errors} errors`); + check( + 'each session got a distinct session id', + uniq === SESSIONS, + `${uniq} unique / ${sessions.length} ok / ${SESSIONS} sent` + ); + console.log(` emit took ${((Date.now() - started) / 1000).toFixed(1)}s`); + return sessions; +} + +async function settle(sessions: Session[]) { + scenario('settle: all sessions opened in ClickHouse'); + const ids = sessions.map((s) => s.sessionId); + const starts = await pollUntil( + async () => { + const c = await countByName(ids, 'session_start'); + console.log(` …session_start ${c}/${ids.length}`); + return c >= ids.length ? c : null; + }, + { timeoutMs: 60_000, intervalMs: 2000 } + ); + check('clickhouse: one session_start per session', starts === ids.length, `got ${starts}`); +} + +async function drain(sessions: Session[]) { + scenario('drain: reap + flush until nothing remains'); + console.log(` …waiting ${IDLE_WAIT_MS}ms for all sessions to idle out`); + await sleep(IDLE_WAIT_MS); + + const ids = sessions.map((s) => s.sessionId); + const target = ids.length; + const deadline = Date.now() + DRAIN_TIMEOUT_MS; + let ends = 0; + let remaining = Number.POSITIVE_INFINITY; + + while (Date.now() < deadline) { + await triggerReaper(); // enqueue session_end for idle sessions + await sleep(1000); // let the worker process the close jobs + await triggerCron('flushEvents'); // push session_start/end + events → CH + await triggerCron('flushSessions'); // push sessions table rows → CH + + ends = await countByName(ids, 'session_end'); + remaining = await redis.zcard(wallclockKey); + console.log(` …session_end ${ends}/${target}, wallclock remaining ${remaining}`); + if (ends >= target && remaining === 0) break; + await sleep(1500); + } + + check('clickhouse: one session_end per session', ends === target, `got ${ends}`); + check('redis: wallclock index fully drained', remaining === 0, `${remaining} left`); + + const bufLen = await redis.llen(SESSION_BUFFER_LIST); + check('redis: session buffer drained to ClickHouse', bufLen === 0, `${bufLen} rows pending`); + + // Spot-check a sample of devices: no session blob should linger after close. + const sample = sessions.filter((_, k) => k % Math.ceil(sessions.length / 20) === 0); + let leaked = 0; + for (const s of sample) { + if (await getBlob(s.deviceId)) leaked++; + } + check('redis: no session blobs leaked (sampled)', leaked === 0, `${leaked}/${sample.length} leaked`); +} + +async function reconcile(sessions: Session[]) { + scenario('reconcile: ClickHouse event counts'); + const ids = sessions.map((s) => s.sessionId); + const n = ids.length; + const inList = ids.map((id) => `'${id}'`).join(','); + const rows = await chQuery<{ name: string; c: string }>( + `SELECT name, count() AS c FROM events WHERE project_id = '${PROJECT_ID}' AND session_id IN (${inList}) GROUP BY name ORDER BY c DESC` + ); + const byName = new Map(rows.map((r) => [r.name, Number(r.c)])); + const total = [...byName.values()].reduce((a, b) => a + b, 0); + console.log(` event breakdown: ${rows.map((r) => `${r.name}=${r.c}`).join(' ')}`); + + const starts = byName.get('session_start') ?? 0; + const ends = byName.get('session_end') ?? 0; + // Each session = N events we sent + the synthetic session_start + session_end. + check('session_start == sessions', starts === n, `${starts} vs ${n}`); + check('session_end == sessions', ends === n, `${ends} vs ${n}`); + check( + 'total events == sessions × (events + start + end)', + total === n * (EVENTS_PER_SESSION + 2), + `${total} vs ${n * (EVENTS_PER_SESSION + 2)}` + ); +} + +async function main() { + console.log( + `Session STRESS — api=${API_URL} worker=${WORKER_URL} timeout=${SESSION_TIMEOUT_MS}ms ` + + `sessions=${SESSIONS} events=${EVENTS_PER_SESSION} concurrency=${CONCURRENCY}` + ); + await preflight(); + await ensureFixtures(); + + const sessions = await emit(); + await settle(sessions); + await drain(sessions); + await reconcile(sessions); + + await shutdown(summarize()); +} + +main().catch(async (error) => { + console.error('\nFATAL:', error); + await shutdown(1); +}); diff --git a/apps/api/package.json b/apps/api/package.json index fc6ef7d9a..d3c73d376 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -9,6 +9,8 @@ "build": "rm -rf dist && tsdown", "gen:bots": "jiti scripts/get-bots.ts && biome format --write src/bots/bots.ts", "test:manage": "jiti scripts/test-manage-api.ts", + "e2e:sessions": "LOG_LEVEL=warn dotenv -e ../../.env -- jiti e2e/session-e2e.ts", + "e2e:sessions:stress": "LOG_LEVEL=warn dotenv -e ../../.env -- jiti e2e/session-stress.ts", "test": "vitest", "test:run": "vitest run", "typecheck": "tsc --noEmit" diff --git a/apps/api/src/utils/ids.test.ts b/apps/api/src/utils/ids.test.ts index c1a4cf43c..e6d8e952c 100644 --- a/apps/api/src/utils/ids.test.ts +++ b/apps/api/src/utils/ids.test.ts @@ -1,91 +1,74 @@ /** - * Tests for getDeviceId — specifically the caller-supplied (override) device id - * path. The override must resolve a real, stable sessionId through the same - * `getInfoFromSession` logic internal device ids use, so sessions open/extend/close - * identically. Returning an empty sessionId here is the bug that left sessions - * unbuffered and made the createSessionEnd job throw "Session not found". + * Tests for getDeviceId's session resolution against the device-keyed session + * store. The id is reused from the live blob only while it's within the idle + * window; a lingering (past-window) blob must NOT be reused — see the guard in + * ids.ts. Override device ids resolve through the same path with a single read. * - * Redis is mocked: we drive `multi().exec()` to simulate "an active sessionEnd job - * exists" vs "no active job" without a real connection. + * We spy on the real `sessionBuffer.getExistingSession` singleton (no Redis), + * and use the real `formatClickhouseDate` so `ended_at` round-trips exactly. */ -import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; - -const { execMock } = vi.hoisted(() => ({ execMock: vi.fn() })); - -vi.mock('@openpanel/redis', async (importOriginal) => { - const actual = await importOriginal(); - return { - ...actual, - getRedisCache: () => ({ - multi: () => { - const chain = { - hget: () => chain, - exec: execMock, - }; - return chain; - }, - }), - }; -}); - -const { getDeviceId } = await import('./ids'); +import type { IClickhouseSession } from '@openpanel/db'; +import { formatClickhouseDate, sessionBuffer } from '@openpanel/db'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { getDeviceId } from './ids'; +const NOW = new Date('2026-06-08T12:00:00.000Z').getTime(); +const MINUTE = 60 * 1000; const SALTS = { current: 'salt-current', previous: 'salt-previous' }; const BASE = { projectId: 'proj-1', ip: '1.2.3.4', ua: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/148.0.0.0', salts: SALTS, + eventTimeMs: NOW, }; -beforeEach(() => { - vi.useFakeTimers(); - vi.setSystemTime(new Date('2026-06-08T12:00:00.000Z')); - execMock.mockReset(); -}); +// withinIdleWindow only reads `id` + `ended_at`; the rest is irrelevant here. +const fakeSession = (id: string, endedAtMs: number): IClickhouseSession => + ({ + id, + ended_at: formatClickhouseDate(new Date(endedAtMs)), + }) as unknown as IClickhouseSession; afterEach(() => { - vi.useRealTimers(); + vi.restoreAllMocks(); }); -describe('getDeviceId — override device id', () => { - it('derives a non-empty, deterministic sessionId when no active session exists', async () => { - execMock.mockResolvedValue([ - [null, null], - [null, null], - ]); +describe('getDeviceId — session resolution', () => { + it('mints a deterministic, stable id when no session exists', async () => { + const spy = vi + .spyOn(sessionBuffer, 'getExistingSession') + .mockResolvedValue(null); - const result = await getDeviceId({ - ...BASE, - overrideDeviceId: 'cookie-abc', - }); + const a = await getDeviceId({ ...BASE, overrideDeviceId: 'cookie-abc' }); + const b = await getDeviceId({ ...BASE, overrideDeviceId: 'cookie-abc' }); - expect(result.deviceId).toBe('cookie-abc'); - expect(result.sessionId).toBeTruthy(); + expect(a.sessionId).toBeTruthy(); + expect(b.sessionId).toBe(a.sessionId); // same window → same id + expect(a.deviceId).toBe('cookie-abc'); + expect(spy).toHaveBeenCalled(); + }); - // Same override within the same time window → same session id (stable). - const again = await getDeviceId({ + it('reuses the live session id when within the idle window', async () => { + vi.spyOn(sessionBuffer, 'getExistingSession').mockResolvedValue( + fakeSession('sess-live', NOW - MINUTE) + ); + + const result = await getDeviceId({ ...BASE, overrideDeviceId: 'cookie-abc', }); - expect(again.sessionId).toBe(result.sessionId); + + expect(result).toEqual({ deviceId: 'cookie-abc', sessionId: 'sess-live' }); }); - it('reuses the sessionId from an active sessionEnd job instead of splitting', async () => { - execMock.mockResolvedValue([ - [ - null, - JSON.stringify({ - payload: { - projectId: 'proj-1', - deviceId: 'cookie-abc', - sessionId: 'sess-active-123', - }, - }), - ], - [null, null], - ]); + it('does NOT reuse a session that has lingered past the idle window', async () => { + vi.spyOn(sessionBuffer, 'getExistingSession').mockResolvedValue( + // ended 31 min ago — past the 30 min timeout; the reaper just hasn't + // closed it yet (blobs have no TTL). + fakeSession('sess-stale', NOW - 31 * MINUTE) + ); const result = await getDeviceId({ ...BASE, @@ -93,20 +76,35 @@ describe('getDeviceId — override device id', () => { }); expect(result.deviceId).toBe('cookie-abc'); - expect(result.sessionId).toBe('sess-active-123'); + expect(result.sessionId).toBeTruthy(); + expect(result.sessionId).not.toBe('sess-stale'); // a fresh id, not the stale one }); - it('produces distinct session ids for distinct override device ids', async () => { - execMock.mockResolvedValue([ - [null, null], - [null, null], - ]); + it('reads the store once for an override (no redundant previous lookup)', async () => { + const spy = vi + .spyOn(sessionBuffer, 'getExistingSession') + .mockResolvedValue(null); - const a = await getDeviceId({ ...BASE, overrideDeviceId: 'cookie-a' }); - const b = await getDeviceId({ ...BASE, overrideDeviceId: 'cookie-b' }); + await getDeviceId({ ...BASE, overrideDeviceId: 'cookie-abc' }); - expect(a.sessionId).toBeTruthy(); - expect(b.sessionId).toBeTruthy(); - expect(a.sessionId).not.toBe(b.sessionId); + expect(spy).toHaveBeenCalledTimes(1); + expect(spy).toHaveBeenCalledWith({ + projectId: 'proj-1', + deviceId: 'cookie-abc', + }); + }); + + it('checks both current and previous salt windows for internal ids', async () => { + const spy = vi + .spyOn(sessionBuffer, 'getExistingSession') + .mockResolvedValue(null); + + await getDeviceId(BASE); // no override → IP+UA hashing + + expect(spy).toHaveBeenCalledTimes(2); + const deviceIds = spy.mock.calls.map((c) => + 'deviceId' in c[0] ? c[0].deviceId : '' + ); + expect(new Set(deviceIds).size).toBe(2); // distinct current/previous hashes }); }); diff --git a/apps/api/src/utils/ids.ts b/apps/api/src/utils/ids.ts index dfc45551b..21611ac38 100644 --- a/apps/api/src/utils/ids.ts +++ b/apps/api/src/utils/ids.ts @@ -25,12 +25,12 @@ export async function getDeviceId({ eventTimeMs?: number; }) { if (overrideDeviceId) { - // Resolve a caller-supplied device id through the same path as internal ones - // (stable, so it's both candidates) — sessions open/extend/close identically. + // A caller-supplied device id is stable (no salt rotation), so it's the only + // candidate — resolve it through the same path as internal ids. return await getInfoFromSession({ projectId, - currentDeviceId: overrideDeviceId, - previousDeviceId: overrideDeviceId, + deviceIds: [overrideDeviceId], + eventTimeMs: eventTimeMs ?? Date.now(), }); } @@ -53,8 +53,7 @@ export async function getDeviceId({ return await getInfoFromSession({ projectId, - currentDeviceId, - previousDeviceId, + deviceIds: [currentDeviceId, previousDeviceId], eventTimeMs: eventTimeMs ?? Date.now(), }); } @@ -84,45 +83,54 @@ function withinIdleWindow( async function getInfoFromSession({ projectId, - currentDeviceId, - previousDeviceId, + deviceIds, eventTimeMs, }: { projectId: string; - currentDeviceId: string; - previousDeviceId: string; + /** Candidate device ids in priority order (e.g. [current, previous] salt + * windows, or just [override]). Deduped; the first is canonical. */ + deviceIds: string[]; eventTimeMs: number; }): Promise { + const candidates = [...new Set(deviceIds.filter(Boolean))]; + const primary = candidates[0] ?? ''; + try { - const [current, previous] = await Promise.all([ - sessionBuffer.getExistingSession({ - projectId, - deviceId: currentDeviceId, - }), - sessionBuffer.getExistingSession({ - projectId, - deviceId: previousDeviceId, - }), - ]); - - if (current && withinIdleWindow(current, eventTimeMs)) { - return { deviceId: currentDeviceId, sessionId: current.id }; - } - if (previous && withinIdleWindow(previous, eventTimeMs)) { - return { deviceId: previousDeviceId, sessionId: previous.id }; + // Reading the live blob is the source of truth for an active session — it's + // what keeps a visit on one id across page reloads and salt rotation. Don't + // drop this read to "save" a lookup or sessions split at the bucket boundary. + const sessions = await Promise.all( + candidates.map((deviceId) => + sessionBuffer.getExistingSession({ projectId, deviceId }) + ) + ); + + for (const [i, session] of sessions.entries()) { + if (session && withinIdleWindow(session, eventTimeMs)) { + return { deviceId: candidates[i]!, sessionId: session.id }; + } } } catch (error) { - console.error('Error getting session GET /track/device-id', error); + console.error('Error resolving session for device id', error); } return { - deviceId: currentDeviceId, + deviceId: primary, + // Deterministic id for the first event of a session and to bridge the window + // before the worker persists the blob (API resolves synchronously, worker + // writes async — same bucket → same id, so they agree). + // + // The bucket window MUST track the idle timeout: a gap > the window has to + // land in a new bucket so a boundary mints a *fresh* id. If it didn't (e.g. + // a hardcoded 30min while SESSION_TIMEOUT_MS is shorter), the worker would + // reopen the just-closed id and its session_end would be skipped. Grace must + // stay < window or getSessionId throws. sessionId: getSessionId({ projectId, - deviceId: currentDeviceId, + deviceId: primary, eventMs: eventTimeMs, - graceMs: 5 * 1000, - windowMs: 1000 * 60 * 30, + graceMs: Math.min(5_000, Math.floor(SESSION_TIMEOUT_MS / 6)), + windowMs: SESSION_TIMEOUT_MS, }), }; } diff --git a/apps/worker/src/boot-debug.ts b/apps/worker/src/boot-debug.ts index 58a7d6a84..a95c5cac1 100644 --- a/apps/worker/src/boot-debug.ts +++ b/apps/worker/src/boot-debug.ts @@ -18,6 +18,8 @@ const CRON_TYPES = [ 'onboarding', 'gscSync', 'cohortRefresh', + 'sessionReaper', + 'sessionVacuum', ] as const satisfies readonly CronQueueType[]; function escapeHtml(value: string) { diff --git a/apps/worker/src/jobs/cron.session-reaper.test.ts b/apps/worker/src/jobs/cron.session-reaper.test.ts new file mode 100644 index 000000000..9d6423f40 --- /dev/null +++ b/apps/worker/src/jobs/cron.session-reaper.test.ts @@ -0,0 +1,119 @@ +/** + * Tests for the session reaper — the wall-clock deadman that closes idle + * sessions. Redis is mocked (we drive the wallclock ZSET + lock), sessionBuffer + * and enqueueSessionEndV2 are mocked so we assert decisions, not side effects. + */ + +import type { IClickhouseSession } from '@openpanel/db'; +import { sessionBuffer } from '@openpanel/db'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { enqueueSessionEndV2 } from '@/utils/session-handler'; +import { sessionReaperCronJob } from './cron.session-reaper'; + +const { redisMock } = vi.hoisted(() => ({ + redisMock: { + smembers: vi.fn(), + set: vi.fn(), + del: vi.fn(), + zrangebyscore: vi.fn(), + zcard: vi.fn(), + srem: vi.fn(), + zrem: vi.fn(), + }, +})); + +vi.mock('@openpanel/redis', async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, getRedisCache: () => redisMock }; +}); + +vi.mock('@openpanel/db', async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, sessionBuffer: { getExistingSession: vi.fn() } }; +}); + +vi.mock('@/utils/session-handler', () => ({ + enqueueSessionEndV2: vi.fn().mockResolvedValue(undefined), +})); + +// Concrete metric stubs — avoid loading the real metrics module (it registers +// prom gauges at import) and avoid a Proxy (its `then` trap confuses ESM interop). +vi.mock('@/metrics', () => { + const counter = () => ({ inc: vi.fn(), observe: vi.fn(), set: vi.fn() }); + return { + sessionEndsEnqueued: counter(), + sessionsReaped: counter(), + sessionsReaperOrphans: counter(), + }; +}); + +const getExistingSession = vi.mocked(sessionBuffer.getExistingSession); + +const session = (id: string, deviceId: string): IClickhouseSession => + ({ + id, + project_id: 'proj-1', + device_id: deviceId, + profile_id: deviceId, + ended_at: '2026-06-08 11:00:00', + duration: 1000, + groups: [], + }) as unknown as IClickhouseSession; + +beforeEach(() => { + vi.clearAllMocks(); + redisMock.smembers.mockResolvedValue(['proj-1']); + redisMock.set.mockResolvedValue('OK'); // lock acquired + redisMock.del.mockResolvedValue(1); + redisMock.zcard.mockResolvedValue(0); // project drained after reap + redisMock.srem.mockResolvedValue(1); + redisMock.zrem.mockResolvedValue(1); + redisMock.zrangebyscore.mockResolvedValue([]); +}); + +describe('sessionReaperCronJob', () => { + it('enqueues a session_end for an idle session blob', async () => { + redisMock.zrangebyscore.mockResolvedValue(['dev-1']); + getExistingSession.mockResolvedValue(session('sess-1', 'dev-1')); + + await sessionReaperCronJob(); + + expect(enqueueSessionEndV2).toHaveBeenCalledTimes(1); + expect( + vi.mocked(enqueueSessionEndV2).mock.calls[0]![0].closedSession.id + ).toBe('sess-1'); + // wallclock index emptied → project removed from the active set + expect(redisMock.srem).toHaveBeenCalledWith('session:projects', 'proj-1'); + }); + + it('ZREMs an orphan (wallclock entry with no blob) and does not enqueue', async () => { + redisMock.zrangebyscore.mockResolvedValue(['dev-orphan']); + getExistingSession.mockResolvedValue(null); + + await sessionReaperCronJob(); + + expect(enqueueSessionEndV2).not.toHaveBeenCalled(); + expect(redisMock.zrem).toHaveBeenCalledWith( + 'session:wallclock:proj-1', + 'dev-orphan' + ); + }); + + it('skips a project when the advisory lock is held', async () => { + redisMock.set.mockResolvedValue(null); // NX failed → another pod owns it + + await sessionReaperCronJob(); + + expect(redisMock.zrangebyscore).not.toHaveBeenCalled(); + expect(enqueueSessionEndV2).not.toHaveBeenCalled(); + }); + + it('is a no-op when disabled via SESSION_REAPER=0', async () => { + vi.stubEnv('SESSION_REAPER', '0'); + + await sessionReaperCronJob(); + + expect(redisMock.smembers).not.toHaveBeenCalled(); + vi.unstubAllEnvs(); + }); +}); diff --git a/apps/worker/src/jobs/cron.session-vacuum.test.ts b/apps/worker/src/jobs/cron.session-vacuum.test.ts new file mode 100644 index 000000000..419dcdfc9 --- /dev/null +++ b/apps/worker/src/jobs/cron.session-vacuum.test.ts @@ -0,0 +1,90 @@ +/** + * Tests for the daily vacuum — the backstop for sessions whose cleanup() leaked + * (worker crash mid-MULTI). For wallclock entries older than the stale threshold: + * a lingering blob → id-gated cleanup(); a missing blob → ZREM the orphan. + */ + +import type { IClickhouseSession } from '@openpanel/db'; +import { sessionBuffer } from '@openpanel/db'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { sessionVacuumCronJob } from './cron.session-vacuum'; + +const { redisMock } = vi.hoisted(() => ({ + redisMock: { + smembers: vi.fn(), + zrangebyscore: vi.fn(), + zrem: vi.fn(), + }, +})); + +vi.mock('@openpanel/redis', async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, getRedisCache: () => redisMock }; +}); + +vi.mock('@openpanel/db', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + sessionBuffer: { getExistingSession: vi.fn(), cleanup: vi.fn() }, + }; +}); + +vi.mock('@/metrics', () => ({ + sessionsVacuumed: { inc: vi.fn() }, +})); + +const getExistingSession = vi.mocked(sessionBuffer.getExistingSession); +const cleanup = vi.mocked(sessionBuffer.cleanup); + +beforeEach(() => { + vi.clearAllMocks(); + redisMock.smembers.mockResolvedValue(['proj-1']); + redisMock.zrem.mockResolvedValue(1); + redisMock.zrangebyscore.mockResolvedValue([]); + cleanup.mockResolvedValue(undefined as never); +}); + +describe('sessionVacuumCronJob', () => { + it('id-gated cleanup() for a stale blob that lingered', async () => { + redisMock.zrangebyscore.mockResolvedValue(['dev-stale']); + getExistingSession.mockResolvedValue({ + id: 'sess-stale', + project_id: 'proj-1', + device_id: 'dev-stale', + profile_id: 'dev-stale', + } as unknown as IClickhouseSession); + + await sessionVacuumCronJob(); + + expect(cleanup).toHaveBeenCalledWith({ + projectId: 'proj-1', + deviceId: 'dev-stale', + sessionId: 'sess-stale', + profileId: 'dev-stale', + }); + expect(redisMock.zrem).not.toHaveBeenCalled(); + }); + + it('ZREMs an orphan wallclock entry when the blob is gone', async () => { + redisMock.zrangebyscore.mockResolvedValue(['dev-orphan']); + getExistingSession.mockResolvedValue(null); + + await sessionVacuumCronJob(); + + expect(cleanup).not.toHaveBeenCalled(); + expect(redisMock.zrem).toHaveBeenCalledWith( + 'session:wallclock:proj-1', + 'dev-orphan' + ); + }); + + it('is a no-op when disabled via SESSION_VACUUM=0', async () => { + vi.stubEnv('SESSION_VACUUM', '0'); + + await sessionVacuumCronJob(); + + expect(redisMock.smembers).not.toHaveBeenCalled(); + vi.unstubAllEnvs(); + }); +}); diff --git a/apps/worker/src/jobs/cron.ts b/apps/worker/src/jobs/cron.ts index b00a6ddc0..43504b361 100644 --- a/apps/worker/src/jobs/cron.ts +++ b/apps/worker/src/jobs/cron.ts @@ -20,7 +20,7 @@ import { insightsDailyJob } from './insights'; import { logger } from '@/utils/logger'; export async function cronJob(job: Job) { - logger.info(`Cron job started - ${job.data.type}`); + logger.debug(`Cron job started - ${job.data.type}`); switch (job.data.type) { case 'salt': { return await salt(); diff --git a/apps/worker/src/jobs/events.create-session-end.test.ts b/apps/worker/src/jobs/events.create-session-end.test.ts new file mode 100644 index 000000000..cdb71d1d2 --- /dev/null +++ b/apps/worker/src/jobs/events.create-session-end.test.ts @@ -0,0 +1,153 @@ +/** + * Tests for createSessionEnd — the close job. Covers the three-case logic + * (extended-after-enqueue skip, live vs snapshot, boundary), the first-writer + * idempotency claim, and that cleanup() is id-gated to the closed session. + */ + +import { + createEvent, + type IClickhouseSession, + type IServiceCreateEventPayload, + sessionBuffer, +} from '@openpanel/db'; +import type { EventsQueuePayloadCreateSessionEnd } from '@openpanel/queue'; +import type { Job } from 'bullmq'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { createSessionEnd } from './events.create-session-end'; + +const { redisMock } = vi.hoisted(() => ({ + redisMock: { set: vi.fn() }, +})); + +vi.mock('@openpanel/redis', async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, getRedisCache: () => redisMock }; +}); + +vi.mock('@openpanel/db', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + createEvent: vi.fn().mockResolvedValue({ document: { id: 'evt-1' } }), + getNotificationRulesByProjectId: vi.fn().mockResolvedValue([]), + getHasFunnelRules: vi.fn().mockReturnValue(false), + transformEvent: vi.fn((e) => e), + sessionBuffer: { getExistingSession: vi.fn(), cleanup: vi.fn() }, + }; +}); + +vi.mock('@/metrics', () => { + const m = () => ({ inc: vi.fn(), observe: vi.fn(), set: vi.fn() }); + return { + sessionDurationOnClose: m(), + sessionEndsEmitted: m(), + sessionEndsSkipped: m(), + sessionEventsOnClose: m(), + }; +}); + +const getExistingSession = vi.mocked(sessionBuffer.getExistingSession); +const cleanup = vi.mocked(sessionBuffer.cleanup); +const createEventMock = vi.mocked(createEvent); + +const snapshot: IClickhouseSession = { + id: 'sess-1', + project_id: 'proj-1', + device_id: 'dev-1', + profile_id: 'dev-1', + created_at: '2026-06-08 10:30:00', + ended_at: '2026-06-08 11:00:00', + is_bounce: false, + duration: 1_800_000, + event_count: 3, + screen_view_count: 5, + exit_path: '/end', +} as unknown as IClickhouseSession; + +const makeJob = ( + snap: IClickhouseSession +): Job => + ({ + id: 'job-1', + data: { + type: 'createSessionEnd', + payload: { + projectId: 'proj-1', + deviceId: 'dev-1', + profileId: 'dev-1', + name: 'session_end', + properties: {}, + } as unknown as IServiceCreateEventPayload, + snapshot: snap, + }, + }) as unknown as Job; + +beforeEach(() => { + vi.clearAllMocks(); + redisMock.set.mockResolvedValue('OK'); // claim acquired by default + cleanup.mockResolvedValue(undefined as never); + createEventMock.mockResolvedValue({ document: { id: 'evt-1' } } as never); +}); + +describe('createSessionEnd', () => { + it('emits session_end and id-gated cleanup on the happy path', async () => { + getExistingSession.mockResolvedValue(snapshot); // live == snapshot, unchanged + + await createSessionEnd(makeJob(snapshot)); + + expect(createEventMock).toHaveBeenCalledTimes(1); + expect(createEventMock.mock.calls[0]![0]).toMatchObject({ + name: 'session_end', + sessionId: 'sess-1', + }); + expect(cleanup).toHaveBeenCalledWith({ + projectId: 'proj-1', + deviceId: 'dev-1', + sessionId: 'sess-1', + profileId: 'dev-1', + }); + }); + + it('skips when the session was extended after the close was enqueued', async () => { + getExistingSession.mockResolvedValue({ + ...snapshot, + ended_at: '2026-06-08 11:05:00', // later than snapshot → still active + }); + + const result = await createSessionEnd(makeJob(snapshot)); + + expect(result).toBeNull(); + expect(createEventMock).not.toHaveBeenCalled(); + expect(cleanup).not.toHaveBeenCalled(); + }); + + it('skips when the idempotency claim is already taken', async () => { + getExistingSession.mockResolvedValue(snapshot); + redisMock.set.mockResolvedValue(null); // SET NX failed → already emitted + + const result = await createSessionEnd(makeJob(snapshot)); + + expect(result).toBeNull(); + expect(createEventMock).not.toHaveBeenCalled(); + }); + + it('uses the snapshot when a new session already owns the slot (boundary)', async () => { + // A boundary opened a fresh session under the same (project, device) slot. + getExistingSession.mockResolvedValue({ + ...snapshot, + id: 'sess-2', + ended_at: '2026-06-08 12:00:00', + }); + + await createSessionEnd(makeJob(snapshot)); + + // Emits for the CLOSED session (snapshot), not the live one. + expect(createEventMock.mock.calls[0]![0]).toMatchObject({ + sessionId: 'sess-1', + }); + // cleanup is keyed on the closed id → id-gated Lua no-ops against sess-2. + expect(cleanup).toHaveBeenCalledWith( + expect.objectContaining({ sessionId: 'sess-1' }) + ); + }); +}); diff --git a/apps/worker/src/jobs/events.incoming-events.test.ts b/apps/worker/src/jobs/events.incoming-events.test.ts index a9edb9dc7..abb22e080 100644 --- a/apps/worker/src/jobs/events.incoming-events.test.ts +++ b/apps/worker/src/jobs/events.incoming-events.test.ts @@ -22,6 +22,10 @@ vi.mock('@openpanel/db', async () => { getProjectByIdCached: vi.fn().mockResolvedValue({ filters: [] }), matchEvent: vi.fn().mockReturnValue(false), sessionBuffer: { + // name/getBufferSize keep metrics.ts happy (it registers a per-buffer + // gauge at import); the tests only drive getExistingSession/ingest. + name: 'session', + getBufferSize: vi.fn().mockResolvedValue(0), getExistingSession: vi.fn(), ingest: vi.fn(), }, diff --git a/apps/worker/src/metrics.ts b/apps/worker/src/metrics.ts index d96308e2d..2f595e8a0 100644 --- a/apps/worker/src/metrics.ts +++ b/apps/worker/src/metrics.ts @@ -293,16 +293,9 @@ for (const buf of allBuffers) { }; } -register.registerMetric( - new client.Gauge({ - name: `buffer_${replayBuffer.name}_count`, - help: 'Number of unprocessed replay chunks', - async collect() { - const metric = await replayBuffer.getBufferSize(); - this.set(metric); - }, - }) -); +// Note: `buffer_replay_count` is already registered by the allBuffers loop +// above (replayBuffer is in that list), so no standalone gauge here — a second +// registration with the same name throws and crashes the worker on boot. // ----------------------------------------------------------------------- // Session lifecycle metrics (new session-buffer + reaper world) diff --git a/packages/db/src/buffers/base-buffer.ts b/packages/db/src/buffers/base-buffer.ts index ff8634890..5c279788a 100644 --- a/packages/db/src/buffers/base-buffer.ts +++ b/packages/db/src/buffers/base-buffer.ts @@ -153,7 +153,7 @@ export class BaseBuffer { */ protected getYieldInterval( batchSize: number, - opts: { min?: number; max?: number; targetYieldsPerFlush?: number } = {}, + opts: { min?: number; max?: number; targetYieldsPerFlush?: number } = {} ): number { const target = opts.targetYieldsPerFlush ?? 20; const min = opts.min ?? 100; @@ -183,7 +183,7 @@ export class BaseBuffer { for (const line of lines) { yield line; } - })(), + })() ); } @@ -371,7 +371,7 @@ export class BaseBuffer { } else if (obs.result === 'locked') { this.logger.debug(logPayload, `Flush skipped for ${this.name} (locked)`); } else { - this.logger.info(logPayload, `Flush completed for ${this.name}`); + this.logger.debug(logPayload, `Flush completed for ${this.name}`); } } diff --git a/packages/db/src/buffers/session-buffer.test.ts b/packages/db/src/buffers/session-buffer.test.ts index e72a26d17..1150d1c7f 100644 --- a/packages/db/src/buffers/session-buffer.test.ts +++ b/packages/db/src/buffers/session-buffer.test.ts @@ -167,6 +167,48 @@ describe('SessionBuffer', () => { } }); + it('invariant: one visit keeps one id across extends; a >30min gap splits and closes the first', async () => { + // This ties the layers together against real Redis: ingest() drives the + // state machine, and getExistingSession() is what the API reads on each + // request to resolve the id — they must agree throughout a visit. + const t0 = new Date('2026-06-08T12:00:00.000Z'); + const at = (mins: number) => new Date(t0.getTime() + mins * 60_000); + + const e1 = await sessionBuffer.ingest( + makePayload({ createdAt: t0, sessionId: 'S1' }) + ); + expect(e1?.kind).toBe('new'); + expect(e1?.current.id).toBe('S1'); + // The API's next lookup must see the live session. + expect( + (await sessionBuffer.getExistingSession({ projectId, deviceId }))?.id + ).toBe('S1'); + + // Events within the idle window extend the SAME session. + for (const mins of [10, 20]) { + const ext = await sessionBuffer.ingest( + makePayload({ createdAt: at(mins), sessionId: 'S1' }) + ); + expect(ext?.kind).toBe('extend'); + expect(ext?.current.id).toBe('S1'); + } + + // A >30min gap: ingest detects the boundary, closes S1, opens S2. + const e4 = await sessionBuffer.ingest( + makePayload({ createdAt: at(55), sessionId: 'S2' }) + ); + expect(e4?.kind).toBe('boundary'); + if (e4?.kind === 'boundary') { + expect(e4.closed.id).toBe('S1'); + expect(e4.current.id).toBe('S2'); + } + + // The slot now holds the new session — the API reads S2 from here on. + expect( + (await sessionBuffer.getExistingSession({ projectId, deviceId }))?.id + ).toBe('S2'); + }); + it('inherits utm_* fields from event.properties.__query', async () => { await sessionBuffer.ingest( makePayload({ diff --git a/packages/db/src/buffers/session-buffer.ts b/packages/db/src/buffers/session-buffer.ts index 54b0ffc4f..3e4a006df 100644 --- a/packages/db/src/buffers/session-buffer.ts +++ b/packages/db/src/buffers/session-buffer.ts @@ -7,7 +7,12 @@ import type { IClickhouseSession } from '../services/session.service'; import { BaseBuffer } from './base-buffer'; // 30min of idle in event-time → session ends. Matches industry default. -export const SESSION_TIMEOUT_MS = 1000 * 60 * 30; +// Idle window for a session (boundary detection + the reaper's default deadman). +// Env-overridable so E2E tests can shrink it from 30min to a few seconds. +export const SESSION_TIMEOUT_MS = Number.parseInt( + process.env.SESSION_TIMEOUT_MS || String(1000 * 60 * 30), + 10 +); const sessionKey = (projectId: string, deviceId: string) => `session:${projectId}:${deviceId}`; @@ -72,10 +77,6 @@ export class SessionBuffer extends BaseBuffer { ? Number.parseInt(process.env.SESSION_BUFFER_CHUNK_SIZE, 10) : 1000; - private readonly squashEnabled = - process.env.SESSION_BUFFER_SQUASH !== 'false' && - process.env.SESSION_BUFFER_SQUASH !== '0'; - private readonly redisKey = 'session-buffer'; private redis: Redis; constructor() { @@ -356,114 +357,14 @@ export class SessionBuffer extends BaseBuffer { for (const row of chRows) { multi.rpush(this.redisKey, JSON.stringify(row)); } - multi.llen(this.redisKey); - const result = await multi.exec(); + await multi.exec(); - // Read ground-truth queue length from the appended LLEN result so we - // avoid an extra round-trip to decide whether to flush. - const llenIndex = (result?.length ?? 1) - 1; - const bufferLength = (result?.[llenIndex]?.[1] as number) ?? 0; + const bufferLength = await this.getBufferSize(); if (bufferLength >= this.batchSize) { await this.tryFlush(); } } - /** - * Squash the (sign=-1, sign=+1) rows for each session id down to the - * minimal set that produces the same collapsed state in ClickHouse, by - * netting the signs per version. Single-row sessions pass through - * unchanged. - * - * `sessions` uses VersionedCollapsingMergeTree(sign, version): a (-1, V) - * row only collapses against a (+1, V) row of the SAME version. So the - * net effect of a batch on a session is computed per-version — sum the - * signs at each version and emit only the rows needed to realise that - * net. Versions that fully cancel (net 0) are dropped; the surviving - * rows are normally the final (+1, maxVersion) plus, for a mid-life - * session, the (-1, V) that cancels CH's resident row. - * - * This per-version netting is required because a session's creation row - * (+1 at the base version) can land in the SAME batch as its updates. - * The first update emits a (-1, baseVersion) whose only partner is that - * in-batch creation (+1, baseVersion). They must cancel each other here; - * naively keeping "oldest -1 + newest +1" would drop the creation +1 - * while keeping its canceller, leaving a permanently un-collapsible (-1) - * row in CH (duplicate session rows + negative bounce counts). - */ - private squashSessionsByVersion( - sessions: IClickhouseSession[] - ): IClickhouseSession[] { - if (sessions.length <= 1) { - return sessions; - } - - const grouped = new Map(); - for (const s of sessions) { - const arr = grouped.get(s.id); - if (arr) { - arr.push(s); - } else { - grouped.set(s.id, [s]); - } - } - - let squashedCount = 0; - const out: IClickhouseSession[] = []; - - for (const entries of grouped.values()) { - if (entries.length === 1) { - out.push(entries[0]!); - continue; - } - - const netByVersion = new Map< - number, - { net: number; pos?: IClickhouseSession; neg?: IClickhouseSession } - >(); - for (const e of entries) { - const slot = netByVersion.get(e.version) ?? { net: 0 }; - slot.net += e.sign; - if (e.sign === 1) { - slot.pos = e; - } else if (e.sign === -1) { - slot.neg = e; - } - netByVersion.set(e.version, slot); - } - - const emitted: IClickhouseSession[] = []; - for (const slot of netByVersion.values()) { - if (slot.net === 0) { - continue; - } - const row = slot.net > 0 ? slot.pos : slot.neg; - if (!row) { - continue; - } - for (let i = 0; i < Math.abs(slot.net); i++) { - emitted.push(row); - } - } - - squashedCount += entries.length - emitted.length; - out.push(...emitted); - } - - if (squashedCount > 0) { - this.logger.debug( - { - inputRows: sessions.length, - outputRows: out.length, - dropped: squashedCount, - uniqueSessions: grouped.size, - }, - 'Session batch squashed' - ); - } - - return out; - } - async processBuffer() { const events = await this.redis.lrange( this.redisKey, @@ -472,25 +373,10 @@ export class SessionBuffer extends BaseBuffer { ); if (events.length === 0) return; - const parsed = events + const sessions = events .map((e) => getSafeJson(e)) - .filter((s): s is IClickhouseSession => s !== null) - .map((s) => ({ ...s, duration: Math.max(0, s.duration || 0) })); - - // A single high-activity session produces many (sign=-1, sign=+1) - // pairs within one flush window — every extend appends a pair. The - // VersionedCollapsingMergeTree collapses them at merge time, so the - // final state is correct either way, but inserting all the - // intermediate rows costs network bytes, gzip CPU, and CH ingest + - // merge work. Squash per-version to insert only the boundary rows. - // - // Set SESSION_BUFFER_SQUASH=false to disable as a safety hatch. - const sessions = this.squashEnabled - ? this.squashSessionsByVersion(parsed) - : parsed; - - // Insert before trimming so a failed CH insert leaves the queue intact — - // errors propagate to tryFlush, which resyncs and retries. Never swallow. + .map((s) => ({ ...s, duration: Math.max(0, s?.duration || 0) })); + for (const chunk of this.chunks(sessions, this.chunkSize)) { await ch.insert({ table: TABLE_NAMES.sessions, @@ -499,6 +385,9 @@ export class SessionBuffer extends BaseBuffer { }); } + // Trim only after a successful insert — on failure the rows stay queued. + // Don't swallow: let it propagate so the base flush records result:'error' + // (and the next flush retries the still-queued rows). await this.redis.ltrim(this.redisKey, events.length, -1); this.logger.debug({ count: events.length }, 'Processed sessions'); From 9240e7d2a01e4e171bb029d133348cf7ec430a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Fri, 12 Jun 2026 06:20:16 +0200 Subject: [PATCH 5/5] apply session fix --- .../db/src/buffers/session-buffer.test.ts | 7 + packages/db/src/buffers/session-buffer.ts | 123 +++++++++++++++++- 2 files changed, 127 insertions(+), 3 deletions(-) diff --git a/packages/db/src/buffers/session-buffer.test.ts b/packages/db/src/buffers/session-buffer.test.ts index 1150d1c7f..e6ba88bca 100644 --- a/packages/db/src/buffers/session-buffer.test.ts +++ b/packages/db/src/buffers/session-buffer.test.ts @@ -334,6 +334,13 @@ describe('SessionBuffer', () => { await sessionBuffer.processBuffer(); + // Squash must actually fire. The 8 events queue 1 creation + 7 + // (-1,vN)/(+1,vN+1) pairs = 15 raw rows; for a fresh session with no + // CH-resident row they all net down to the single final (+1, v8). If + // squash is disabled/removed, every raw row is inserted and this is 15 + // — this assertion is the guard that the optimisation is wired in. + expect(inserted.length).toBe(1); + // Net the inserted rows per version exactly as CH's // VersionedCollapsingMergeTree(sign, version) would: a row survives only // if positives and negatives at its version don't cancel. diff --git a/packages/db/src/buffers/session-buffer.ts b/packages/db/src/buffers/session-buffer.ts index 3e4a006df..8ff03ac63 100644 --- a/packages/db/src/buffers/session-buffer.ts +++ b/packages/db/src/buffers/session-buffer.ts @@ -77,6 +77,10 @@ export class SessionBuffer extends BaseBuffer { ? Number.parseInt(process.env.SESSION_BUFFER_CHUNK_SIZE, 10) : 1000; + private readonly squashEnabled = + process.env.SESSION_BUFFER_SQUASH !== 'false' && + process.env.SESSION_BUFFER_SQUASH !== '0'; + private readonly redisKey = 'session-buffer'; private redis: Redis; constructor() { @@ -365,6 +369,102 @@ export class SessionBuffer extends BaseBuffer { } } + /** + * Squash the (sign=-1, sign=+1) rows for each session id down to the + * minimal set that produces the same collapsed state in ClickHouse, by + * netting the signs per version. Single-row sessions pass through + * unchanged. + * + * `sessions` uses VersionedCollapsingMergeTree(sign, version): a (-1, V) + * row only collapses against a (+1, V) row of the SAME version. So the + * net effect of a batch on a session is computed per-version — sum the + * signs at each version and emit only the rows needed to realise that + * net. Versions that fully cancel (net 0) are dropped; the surviving + * rows are normally the final (+1, maxVersion) plus, for a mid-life + * session, the (-1, V) that cancels CH's resident row. + * + * This per-version netting is required because a session's creation row + * (+1 at the base version) can land in the SAME batch as its updates. + * The first update emits a (-1, baseVersion) whose only partner is that + * in-batch creation (+1, baseVersion). They must cancel each other here; + * naively keeping "oldest -1 + newest +1" would drop the creation +1 + * while keeping its canceller, leaving a permanently un-collapsible (-1) + * row in CH (duplicate session rows + negative bounce counts). + */ + private squashSessionsByVersion( + sessions: IClickhouseSession[] + ): IClickhouseSession[] { + if (sessions.length <= 1) { + return sessions; + } + + const grouped = new Map(); + for (const s of sessions) { + const arr = grouped.get(s.id); + if (arr) { + arr.push(s); + } else { + grouped.set(s.id, [s]); + } + } + + let squashedCount = 0; + const out: IClickhouseSession[] = []; + + for (const entries of grouped.values()) { + if (entries.length === 1) { + out.push(entries[0]!); + continue; + } + + const netByVersion = new Map< + number, + { net: number; pos?: IClickhouseSession; neg?: IClickhouseSession } + >(); + for (const e of entries) { + const slot = netByVersion.get(e.version) ?? { net: 0 }; + slot.net += e.sign; + if (e.sign === 1) { + slot.pos = e; + } else if (e.sign === -1) { + slot.neg = e; + } + netByVersion.set(e.version, slot); + } + + const emitted: IClickhouseSession[] = []; + for (const slot of netByVersion.values()) { + if (slot.net === 0) { + continue; + } + const row = slot.net > 0 ? slot.pos : slot.neg; + if (!row) { + continue; + } + for (let i = 0; i < Math.abs(slot.net); i++) { + emitted.push(row); + } + } + + squashedCount += entries.length - emitted.length; + out.push(...emitted); + } + + if (squashedCount > 0) { + this.logger.debug( + { + inputRows: sessions.length, + outputRows: out.length, + dropped: squashedCount, + uniqueSessions: grouped.size, + }, + 'Session batch squashed' + ); + } + + return out; + } + async processBuffer() { const events = await this.redis.lrange( this.redisKey, @@ -373,9 +473,26 @@ export class SessionBuffer extends BaseBuffer { ); if (events.length === 0) return; - const sessions = events - .map((e) => getSafeJson(e)) - .map((s) => ({ ...s, duration: Math.max(0, s?.duration || 0) })); + const parsed: IClickhouseSession[] = []; + for (const e of events) { + const s = getSafeJson(e); + if (!s) continue; + // Freshly parsed object — safe to clamp in place, no clone needed. + s.duration = Math.max(0, s.duration || 0); + parsed.push(s); + } + + // A single high-activity session produces many (sign=-1, sign=+1) + // pairs within one flush window — every extend appends a pair. The + // VersionedCollapsingMergeTree collapses them at merge time, so the + // final state is correct either way, but inserting all the + // intermediate rows costs network bytes, gzip CPU, and CH ingest + + // merge work. Squash per-version to insert only the boundary rows. + // + // Set SESSION_BUFFER_SQUASH=false to disable as a safety hatch. + const sessions = this.squashEnabled + ? this.squashSessionsByVersion(parsed) + : parsed; for (const chunk of this.chunks(sessions, this.chunkSize)) { await ch.insert({