From 087fda7ce6256ab8a96ec2e2f6c007b34e07f631 Mon Sep 17 00:00:00 2001 From: DMleadgen Date: Tue, 7 Apr 2026 15:38:55 -0600 Subject: [PATCH] Add manuals knowledge retrieval and corpus tooling --- app/api/admin/manuals-knowledge/route.test.ts | 48 + app/api/admin/manuals-knowledge/route.ts | 79 + app/api/chat/route.test.ts | 181 ++ app/api/chat/route.ts | 59 +- docs/operations/JESSICA_MANUALS_KNOWLEDGE.md | 39 + docs/operations/MANUALS_QDRANT_READINESS.md | 40 + lib/manuals-knowledge.test.ts | 79 + lib/manuals-knowledge.ts | 1604 +++++++++++++++ lib/manuals-qdrant-corpus.test.ts | 114 ++ lib/manuals-qdrant-corpus.ts | 1734 +++++++++++++++++ lib/site-chat/prompt.ts | 19 +- scripts/build-manuals-qdrant-corpus.ts | 37 + scripts/evaluate-manuals-qdrant-corpus.ts | 33 + 13 files changed, 4060 insertions(+), 6 deletions(-) create mode 100644 app/api/admin/manuals-knowledge/route.test.ts create mode 100644 app/api/admin/manuals-knowledge/route.ts create mode 100644 app/api/chat/route.test.ts create mode 100644 docs/operations/JESSICA_MANUALS_KNOWLEDGE.md create mode 100644 docs/operations/MANUALS_QDRANT_READINESS.md create mode 100644 lib/manuals-knowledge.test.ts create mode 100644 lib/manuals-knowledge.ts create mode 100644 lib/manuals-qdrant-corpus.test.ts create mode 100644 lib/manuals-qdrant-corpus.ts create mode 100644 scripts/build-manuals-qdrant-corpus.ts create mode 100644 scripts/evaluate-manuals-qdrant-corpus.ts diff --git a/app/api/admin/manuals-knowledge/route.test.ts b/app/api/admin/manuals-knowledge/route.test.ts new file mode 100644 index 00000000..7812f3e7 --- /dev/null +++ b/app/api/admin/manuals-knowledge/route.test.ts @@ -0,0 +1,48 @@ +import assert from "node:assert/strict" +import test from "node:test" +import { GET } from "@/app/api/admin/manuals-knowledge/route" + +const ORIGINAL_ADMIN_API_TOKEN = process.env.ADMIN_API_TOKEN + +test.afterEach(() => { + if (typeof ORIGINAL_ADMIN_API_TOKEN === "string") { + process.env.ADMIN_API_TOKEN = 
/**
 * Normalizes a raw query-string value before it is used as a retrieval input.
 *
 * @param value - Result of `URLSearchParams.get`; `null` when the key is absent.
 * @returns The trimmed value capped at 400 characters, or `""` for `null`/blank input.
 */
function normalizeQuery(value: string | null) {
  return (value || "").trim().slice(0, 400)
}

/**
 * Parses the optional `page` query parameter.
 *
 * Only a plain run of decimal digits is accepted. The previous check validated
 * with `Number(...)` but parsed with `parseInt`, so the two could disagree:
 * a blank value produced `NaN`, and inputs such as `1e3` or `0x10` were parsed
 * as 1 and 0.
 *
 * @param value - Result of `URLSearchParams.get("page")`.
 * @returns The page number, or `undefined` when the parameter is absent or malformed.
 */
function parsePageNumber(value: string | null): number | undefined {
  const trimmed = (value ?? "").trim()
  if (!/^\d+$/.test(trimmed)) {
    return undefined
  }

  const parsed = Number.parseInt(trimmed, 10)
  return Number.isSafeInteger(parsed) ? parsed : undefined
}

/**
 * Internal debug endpoint that runs a manuals retrieval and returns the raw details.
 *
 * Query parameters: `query` (required), plus optional `manufacturer`, `model`,
 * `manualId` and `page`.
 *
 * @param request - Incoming request; must pass `requireAdminToken`.
 * @returns The auth error response when `requireAdminToken` yields one, a 400
 *   when `query` is empty, a 500 with the error message when retrieval throws,
 *   otherwise a JSON body with the query, filters, summary, result and citation context.
 */
export async function GET(request: Request) {
  const authError = requireAdminToken(request)
  if (authError) {
    return authError
  }

  try {
    const { searchParams } = new URL(request.url)
    const query = normalizeQuery(searchParams.get("query"))

    if (!query) {
      return NextResponse.json(
        { error: "A query parameter is required." },
        { status: 400 }
      )
    }

    const manufacturer = normalizeQuery(searchParams.get("manufacturer")) || null
    const model = normalizeQuery(searchParams.get("model")) || null
    const manualId = normalizeQuery(searchParams.get("manualId")) || null
    const pageNumber = parsePageNumber(searchParams.get("page"))

    const result = await retrieveManualContext(query, {
      manufacturer,
      model,
      manualId,
    })

    // An explicit manualId wins; otherwise cite the best manual the retrieval picked.
    const citationManualId = manualId || result.bestManual?.manualId || ""
    const citationContext = citationManualId
      ? await getManualCitationContext(citationManualId, pageNumber)
      : null

    return NextResponse.json({
      query,
      filters: {
        manufacturer,
        model,
        manualId,
        pageNumber: pageNumber ?? null,
      },
      summary: summarizeManualRetrieval({
        ran: true,
        query,
        result,
      }),
      result,
      citationContext,
    })
  } catch (error) {
    console.error("Failed to inspect manuals knowledge:", error)
    return NextResponse.json(
      {
        error:
          error instanceof Error
            ? error.message
            : "Failed to inspect manuals knowledge",
      },
      { status: 500 }
    )
  }
}
POST(buildRequest(message, intent)) + + assert.ok(capturedPayload) + return { response, payload: capturedPayload } +} + +test.afterEach(() => { + globalThis.fetch = ORIGINAL_FETCH + + if (typeof ORIGINAL_XAI_KEY === "string") { + process.env.XAI_API_KEY = ORIGINAL_XAI_KEY + } else { + delete process.env.XAI_API_KEY + } +}) + +test("chat route includes grounded manual context for RVV alias lookups", async () => { + const { response, payload } = await runChatRouteWithSpy( + "RVV 660 service manual" + ) + + assert.equal(response.status, 200) + assert.equal( + payload.messages.some( + (message) => + message.role === "system" && + message.content.includes("Manual knowledge context:") + ), + true + ) + assert.equal( + payload.messages.some( + (message) => + message.role === "system" && + /Royal Vendors|660/i.test(message.content) + ), + true + ) +}) + +test("chat route resolves Narco alias lookups into manual context", async () => { + const { payload } = await runChatRouteWithSpy("Narco bevmax not cooling") + + const manualContext = payload.messages.find( + (message) => + message.role === "system" && + message.content.includes("Manual knowledge context:") + ) + + assert.ok(manualContext) + assert.match(manualContext.content, /Dixie-Narco|Narco/i) +}) + +test("chat route low-confidence manual queries instruct Jessica to ask for brand model or photo", async () => { + const { payload } = await runChatRouteWithSpy( + "manual for flibbertigibbet machine" + ) + + const manualContext = payload.messages.find( + (message) => + message.role === "system" && + message.content.includes("Manual knowledge context:") + ) + + assert.ok(manualContext) + assert.match( + manualContext.content, + /brand on the front|model sticker|photo\/video/i + ) +}) + +test("chat route risky technical manual queries inject conservative safety context", async () => { + const { payload } = await runChatRouteWithSpy( + "Royal wiring diagram voltage manual", + "Repairs" + ) + + const systemPrompt = 
/**
 * Builds the manuals lookup query from the visitor's most recent turns.
 *
 * Keeps only `user` messages, trims them, drops blank ones, and joins the last
 * three with single spaces. Blank messages are removed before the three-message
 * window is taken so that whitespace-only turns cannot push real context out.
 *
 * @param messages - Conversation history in chronological order; not mutated.
 * @returns The combined query text, or `""` when there is no non-blank user message.
 */
function buildManualKnowledgeQuery(messages: readonly ChatMessage[]) {
  return messages
    .filter((message) => message.role === "user")
    .map((message) => message.content.trim())
    .filter(Boolean)
    .slice(-3)
    .join(" ")
}
const responseHeaders: Record = { "Cache-Control": "no-store", @@ -299,6 +314,36 @@ export async function POST(request: NextRequest) { sessionId, }) + const manualKnowledgeQuery = buildManualKnowledgeQuery(messages) + const shouldUseManualKnowledge = shouldUseManualKnowledgeForChat( + visitor.intent, + manualKnowledgeQuery + ) + let manualKnowledge = null + let manualKnowledgeError: unknown = null + if (shouldUseManualKnowledge) { + try { + manualKnowledge = await retrieveManualContext(manualKnowledgeQuery) + } catch (error) { + manualKnowledgeError = error + console.error("[site-chat] manuals knowledge lookup failed", { + pathname, + sessionId, + error, + }) + } + } + console.info( + "[site-chat] manuals retrieval", + summarizeManualRetrieval({ + ran: shouldUseManualKnowledge, + query: manualKnowledgeQuery, + result: manualKnowledge, + error: manualKnowledgeError, + }) + ) + const systemPrompt = buildSiteChatSystemPrompt() + const xaiApiKey = getOptionalEnv("XAI_API_KEY") if (!xaiApiKey) { console.warn("[site-chat] missing XAI_API_KEY", { @@ -331,8 +376,18 @@ export async function POST(request: NextRequest) { messages: [ { role: "system", - content: `${SITE_CHAT_SYSTEM_PROMPT}\n\nConversation context:\n- Current pathname: ${pathname}\n- Source: ${SITE_CHAT_SOURCE}\n- Visitor name: ${visitor.name}\n- Visitor email: ${visitor.email}\n- Visitor phone: ${visitor.phone}\n- Visitor intent: ${visitor.intent}\n- Service SMS consent: ${visitor.serviceTextConsent ? "yes" : "no"}\n- Marketing SMS consent: ${visitor.marketingTextConsent ? "yes" : "no"}`, + content: `${systemPrompt}\n\nConversation context:\n- Current pathname: ${pathname}\n- Source: ${SITE_CHAT_SOURCE}\n- Visitor name: ${visitor.name}\n- Visitor email: ${visitor.email}\n- Visitor phone: ${visitor.phone}\n- Visitor intent: ${visitor.intent}\n- Service SMS consent: ${visitor.serviceTextConsent ? "yes" : "no"}\n- Marketing SMS consent: ${visitor.marketingTextConsent ? 
"yes" : "no"}`, }, + ...(shouldUseManualKnowledge + ? [ + { + role: "system" as const, + content: manualKnowledge + ? formatManualContextForPrompt(manualKnowledge) + : "Manual knowledge context:\n- A manual lookup was attempted, but no reliable manual context is available.\n- Do not guess. Ask for the brand, model sticker, or a clear photo/video that can be texted in.", + }, + ] + : []), ...messages, ], }), diff --git a/docs/operations/JESSICA_MANUALS_KNOWLEDGE.md b/docs/operations/JESSICA_MANUALS_KNOWLEDGE.md new file mode 100644 index 00000000..30c12243 --- /dev/null +++ b/docs/operations/JESSICA_MANUALS_KNOWLEDGE.md @@ -0,0 +1,39 @@ +# Jessica Manuals Knowledge + +## What feeds the manuals knowledge layer +- Primary source: tenant-filtered exports from the shared `manuals-platform` package. +- Rocky consumes `manuals-platform/output/tenants/rocky-mountain-vending/manuals.json`. +- Rocky consumes `manuals-platform/output/tenants/rocky-mountain-vending/chunks.json`. +- If shared exports are missing in local development, the RMV app can still fall back to its in-repo builder. + +## How the corpus is built +- The shared `manuals-platform` package scans the portfolio manuals tree, assigns tenant entitlements, and writes prebuilt artifacts. +- RMV loads the Rocky tenant artifact on first use after process start. +- Public Jessica retrieval is therefore consuming a tenant-filtered export rather than rebuilding the raw manuals corpus itself. + +## How new manuals become searchable +- Add or update source PDFs under `manuals-data`. +- Rebuild the shared package artifacts from `manuals-platform` so tenant exports are refreshed. +- Restart the Next.js server or deployment so RMV reloads the updated tenant artifact on first use. + +## Cache refresh behavior +- The shared package writes persistent JSON artifacts under `manuals-platform/output`. +- RMV still caches the loaded Rocky tenant artifact in memory. 
+- A cache reset helper (`resetManualKnowledgeCache`) exists in `lib/manuals-knowledge.ts` for future admin tooling or deploy hooks. +- Today, the simplest refresh flow is: rebuild shared artifacts, then restart the app. + +## Observability +- The site chat route logs a metadata-only manuals retrieval summary before the xAI request. +- The logs include whether retrieval ran, top manual candidate IDs, top chunk citations, clarification state, risk flag, and any retrieval error. +- Full chunk text is not logged. + +## Internal debug surface +- Internal endpoint: `GET /api/admin/manuals-knowledge` +- Auth: `x-admin-token` or `Authorization: Bearer <token>` +- Example query: + - `query=RVV 660 service manual` + - optional `manufacturer` + - optional `model` + - optional `manualId` + - optional `page` +- The endpoint returns retrieval summary, matched manuals, top chunks, and citation context for internal inspection only. diff --git a/docs/operations/MANUALS_QDRANT_READINESS.md b/docs/operations/MANUALS_QDRANT_READINESS.md new file mode 100644 index 00000000..97eb9398 --- /dev/null +++ b/docs/operations/MANUALS_QDRANT_READINESS.md @@ -0,0 +1,40 @@ +# Manuals Qdrant Readiness + +## Purpose +- The long-term source of truth for this pipeline is now the shared `manuals-platform` package at the workspace root. +- The RMV repo keeps this document as a consumer-side reference for the tenant-filtered artifacts Rocky reads. + +## Source inputs +- Shared package location: `../manuals-platform` +- Shared build outputs: `../manuals-platform/output/full/*` +- Rocky tenant outputs: `../manuals-platform/output/tenants/rocky-mountain-vending/*` + +## What the corpus builder does +- The shared package scans the full portfolio manual set, classifies every PDF, assigns tenant entitlements, and publishes tenant-filtered Qdrant-ready artifacts. +- It keeps `public_safe` and `internal_tech` retrieval profiles on top of one central corpus. 
+- Rocky consumes the prebuilt Rocky tenant export instead of rebuilding from raw manuals data inside the app. + +## Build and evaluation commands +- Build artifacts: + - `pnpm manuals:qdrant:build` +- Build artifacts into a custom directory: + - `pnpm manuals:qdrant:build -- --output-dir /absolute/path` +- Run the evaluation set: + - `pnpm manuals:qdrant:eval` + +## Artifact output +- Default output directory: `output/manuals-qdrant` +- Important files: + - `summary.json` + - `manuals.json` + - `chunks.json` + - `chunks-high-confidence.json` + - `chunks-public-safe.json` + - `chunks-internal-tech.json` + - `evaluation-cases.json` + - `evaluation-report.json` + +## Operational notes +- The first Qdrant prototype should ingest `chunks-high-confidence.json` or `chunks-internal-tech.json`, not the full raw corpus. +- Public-facing experiences should stay on `public_safe` filters even after Qdrant is introduced. +- After manuals-data changes, rebuild the artifacts so the new normalized corpus and evaluation report stay in sync. 
diff --git a/lib/manuals-knowledge.test.ts b/lib/manuals-knowledge.test.ts new file mode 100644 index 00000000..8a28b3f4 --- /dev/null +++ b/lib/manuals-knowledge.test.ts @@ -0,0 +1,79 @@ +import assert from "node:assert/strict" +import test from "node:test" +import { + findManualCandidates, + getManualCitationContext, + resetManualKnowledgeCache, + retrieveManualContext, + shouldUseManualKnowledgeForChat, +} from "@/lib/manuals-knowledge" + +test("shouldUseManualKnowledgeForChat only triggers for relevant conversations", () => { + assert.equal( + shouldUseManualKnowledgeForChat( + "Repairs", + "My Royal machine is not accepting coins" + ), + true + ) + assert.equal(shouldUseManualKnowledgeForChat("Other", "Hello there"), false) +}) + +test("findManualCandidates resolves RVV alias queries to Royal Vendors manuals", async () => { + const candidates = await findManualCandidates("RVV 660 service manual") + + assert.ok(candidates.length > 0) + assert.equal(candidates[0]?.manufacturer, "Royal Vendors") + assert.match(candidates[0]?.filename || "", /660|700|gii|giii|rvv/i) +}) + +test("findManualCandidates resolves Narco-style queries to Dixie-Narco manuals", async () => { + const candidates = await findManualCandidates("Narco bevmax not cooling") + + assert.ok(candidates.length > 0) + assert.equal(candidates[0]?.manufacturer, "Dixie-Narco") +}) + +test("retrieveManualContext returns grounded troubleshooting chunks for simple public help", async () => { + const result = await retrieveManualContext("Royal machine not accepting coins") + + assert.ok(result.manualCandidates.length > 0) + assert.equal(result.topChunks.length > 0, true) + assert.equal(result.topChunks[0]?.manufacturer, "Royal Vendors") + assert.match(result.topChunks[0]?.text || "", /not accepting coins/i) + assert.equal(result.isRisky, false) +}) + +test("getManualCitationContext returns citations for a retrieved manual page", async () => { + const result = await retrieveManualContext("Royal machine not 
accepting coins") + const firstChunk = result.topChunks[0] + + assert.ok(firstChunk) + + const citationContext = await getManualCitationContext( + firstChunk.manualId, + firstChunk.pageNumber || undefined + ) + + assert.ok(citationContext.manual) + assert.ok(citationContext.citations.length > 0) + assert.equal( + citationContext.citations.some( + (citation) => citation.pageNumber === firstChunk.pageNumber + ), + true + ) +}) + +test("resetManualKnowledgeCache rebuilds the manuals corpus on demand", async () => { + const beforeReset = await findManualCandidates("RVV 660 service manual") + + resetManualKnowledgeCache() + + const afterReset = await findManualCandidates("RVV 660 service manual") + + assert.ok(beforeReset.length > 0) + assert.ok(afterReset.length > 0) + assert.equal(beforeReset[0]?.manufacturer, afterReset[0]?.manufacturer) + assert.equal(beforeReset[0]?.manualId, afterReset[0]?.manualId) +}) diff --git a/lib/manuals-knowledge.ts b/lib/manuals-knowledge.ts new file mode 100644 index 00000000..5963321c --- /dev/null +++ b/lib/manuals-knowledge.ts @@ -0,0 +1,1604 @@ +import { existsSync } from "node:fs" +import { readFile, readdir } from "node:fs/promises" +import { basename, join } from "node:path" +import { listConvexManuals } from "@/lib/convex-service" +import { scanManuals } from "@/lib/manuals" +import { getManualsDataRoot } from "@/lib/manuals-paths" +import { + buildManualAssetUrl, + buildThumbnailAssetUrl, +} from "@/lib/manuals-storage" +import type { Manual } from "@/lib/manuals-types" +import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types" + +const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"] +const STRUCTURED_MANUALS_DIR = [ + ...MANUALS_OPTIMIZED_ROOT, + "training_data", + "structured", +] +const EXTRACTED_CONTENT_FILE = [ + ...MANUALS_OPTIMIZED_ROOT, + "extracted_content", + "manuals_extracted_content.json", +] +const DEFAULT_MANUALS_PLATFORM_TENANT_ID = "rocky-mountain-vending" + +const STOPWORDS = new Set([ + "a", + "an", + 
/**
 * Terms that mark a manuals question as technical or risky.
 * NOTE(review): presumably consulted by `isRiskyManualQuery`, which is defined
 * outside this excerpt — confirm how the entries are matched.
 */
const RISKY_MANUAL_KEYWORDS = [
  "wiring",
  "diagram",
  "voltage",
  "compressor",
  "refrigerant",
  "bypass",
  "jumper",
  "board level",
  "schematic",
  "electrical",
  "rewire",
  "disassemble",
  "tear down",
]

/**
 * Phrases that suggest a chat message is about a manual, part or fault.
 * `shouldUseManualKnowledgeForChat` tests each entry as a substring of the
 * normalized query, so shorter entries ("part", "bill", "coin") already cover
 * the longer ones that contain them.
 */
const MANUAL_QUERY_HINTS = [
  "manual",
  "model",
  "serial",
  "error",
  "code",
  "parts",
  "part",
  "troubleshoot",
  "troubleshooting",
  "not cooling",
  "not vending",
  "coin",
  "bill acceptor",
  "bill",
  "coin mech",
  "validator",
  "jam",
  "stuck",
  "door",
  "display",
  "keypad",
  "compressor",
  "motor",
  "sensor",
]

/**
 * Canonical manufacturer key (lower case, space separated) mapped to the
 * spellings, abbreviations and product-line names listed as aliases for it.
 * The type arguments are spelled out: a bare `Record` does not type-check.
 */
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  "automatic products": ["automatic products", "automatic-products", "ap"],
  "coinco": ["coinco"],
  "crane": [
    "crane",
    "national",
    "national vendors",
    "merchant",
    "merchant series",
    "shopper",
  ],
  "dixie narco": [
    "dixie",
    "narco",
    "dixie narco",
    "dixie-narco",
    "dn",
    "bevmax",
  ],
  "gpl": ["gpl", "general products"],
  "mei mars": ["mei", "mars", "mei mars", "bill validator"],
  "royal vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  "rowe": ["rowe"],
  "seaga": ["seaga"],
  "usi": ["usi", "u select it", "u-select-it", "uselectit"],
  "vendo": ["vendo", "sanden"],
}

/** A manual matched to a query, with its relevance score and confidence. */
export type ManualCandidate = {
  manualId: string
  filename: string
  manufacturer: string
  category: string
  manualUrl: string | null
  thumbnailUrl: string | null
  score: number
  confidence: number
}
/** Optional narrowing applied on top of the free-text query. */
export type ManualKnowledgeFilters = {
  manufacturer?: string | null
  manualId?: string | null
  model?: string | null
}

/** Outcome of one retrieval: ranked manuals, supporting excerpts and guidance flags. */
export type RetrieveManualContextResult = {
  query: string
  bestManual: ManualCandidate | null
  manualCandidates: ManualCandidate[]
  topChunks: ManualKnowledgeChunk[]
  needsClarification: boolean
  isRisky: boolean
}

/** Excerpts from a single manual, optionally limited to one page. */
export type ManualCitationContext = {
  manual: ManualCandidate | null
  citations: ManualKnowledgeChunk[]
}

/**
 * Metadata-only view of a retrieval, as built by `summarizeManualRetrieval`.
 * The flag fields are `null` when retrieval did not run or produced no result.
 */
export type ManualRetrievalSummary = {
  ran: boolean
  query: string
  bestManualId: string | null
  manualCandidateIds: string[]
  topChunkCitations: string[]
  needsClarification: boolean | null
  isRisky: boolean | null
  error: string | null
}

/**
 * Contract for a manuals knowledge backend.
 *
 * The promise payloads are spelled out to match what the local provider
 * returns; a bare `Promise` is not a valid type.
 */
export interface ManualKnowledgeProvider {
  /** Ranks manuals against a free-text query. */
  findManualCandidates(query: string): Promise<ManualCandidate[]>
  /** Ranks manuals and returns the best supporting excerpts for a query. */
  retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult>
  /** Returns excerpts for one manual, optionally restricted to a page. */
  getManualCitationContext(
    manualId: string,
    pageNumber?: number
  ): Promise<ManualCitationContext>
}

/** Section entry of a structured or extracted manual record. */
type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}

/** Problem/solution pair from a structured manual record. */
type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}

/** Part entry from a structured or extracted manual record. */
type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}

/** Shape of one structured manual record; every field is optional. */
type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
  }
}

/** Text of a single page in an extracted manual record. */
type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}

/** Parts found on a single page of an extracted manual record. */
type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}

/** Shape of one extracted-content record; every field is optional. */
type ExtractedManualRecord = {
  filename?: string
  sections?: StructuredSection[]
  partsLists?: ExtractedPartList[]
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
}
manualId: string + filename: string + manufacturer: string + category: string | null + model: string | null + manualType: string + sourcePath?: string | null + manualUrl?: string | null + thumbnailUrl?: string | null + sourceFilenames?: string[] +} + +type SharedPlatformChunkRecord = { + manualId: string + title?: string + manufacturer: string + model: string | null + manualType: string + pageNumber: number | null + sectionTitle: string | null + text: string + manualUrl?: string | null + thumbnailUrl?: string | null + metadataConfidence?: number + overallQuality?: number +} + +type PreparedQuery = { + normalized: string + tokens: string[] + modelTokens: string[] + manufacturerFilters: string[] + manualIdFilter: string | null +} + +type ManualInternal = { + manualId: string + filename: string + manufacturer: string + category: string + model: string | null + manualType: string + manualUrl: string | null + thumbnailUrl: string | null + searchText: string + tokenSet: Set + normalizedManufacturer: string +} + +type ChunkInternal = Omit & { + searchText: string + tokenSet: Set + matchScore: number +} + +type ManualKnowledgeArtifact = { + manuals: ManualInternal[] + manualById: Map + chunks: ChunkInternal[] + chunksByManualId: Map +} + +type ResettableManualKnowledgeProvider = ManualKnowledgeProvider & { + resetCache(): void +} + +let defaultProvider: ManualKnowledgeProvider | null = null + +export function getManualKnowledgeProvider(): ManualKnowledgeProvider { + if (!defaultProvider) { + defaultProvider = new LocalManualKnowledgeProvider() + } + return defaultProvider +} + +export async function findManualCandidates(query: string) { + return await getManualKnowledgeProvider().findManualCandidates(query) +} + +export async function retrieveManualContext( + query: string, + filters?: ManualKnowledgeFilters +) { + return await getManualKnowledgeProvider().retrieveManualContext(query, filters) +} + +export async function getManualCitationContext( + manualId: string, + 
/**
 * Drops the cached manuals artifact held by the default provider, if any.
 *
 * Does nothing when no provider has been created yet or when the provider has
 * no `resetCache` method.
 */
export function resetManualKnowledgeCache() {
  const provider =
    defaultProvider as Partial<ResettableManualKnowledgeProvider> | null

  if (typeof provider?.resetCache === "function") {
    provider.resetCache()
  }
}

/**
 * Turns a caught value into a short message for the retrieval summary.
 *
 * `catch` can receive any value, so values that are neither `Error` nor string
 * are serialized instead of being reported as "no error".
 */
function describeRetrievalError(error: unknown): string | null {
  if (error === undefined || error === null) {
    return null
  }
  if (error instanceof Error) {
    return error.message
  }
  if (typeof error === "string") {
    return error
  }

  try {
    // JSON.stringify yields undefined for functions and symbols.
    return JSON.stringify(error) ?? String(error)
  } catch {
    // Circular structures and BigInt values make JSON.stringify throw.
    return String(error)
  }
}

/**
 * Builds the metadata-only summary of a manuals retrieval.
 *
 * @param args.ran - Whether a retrieval was attempted.
 * @param args.query - The query text that was (or would have been) used.
 * @param args.result - The retrieval result, if one was produced.
 * @param args.error - Whatever the retrieval threw, if anything.
 * @returns Up to four candidate manual IDs and four chunk citations plus the
 *   clarification/risk flags; the lists are empty and the flags `null` when
 *   retrieval did not run.
 */
export function summarizeManualRetrieval(args: {
  ran: boolean
  query: string
  result?: RetrieveManualContextResult | null
  error?: unknown
}): ManualRetrievalSummary {
  const error = describeRetrievalError(args.error)

  if (!args.ran) {
    return {
      ran: false,
      query: args.query,
      bestManualId: null,
      manualCandidateIds: [],
      topChunkCitations: [],
      needsClarification: null,
      isRisky: null,
      error,
    }
  }

  const result = args.result

  return {
    ran: true,
    query: args.query,
    bestManualId: result?.bestManual?.manualId || null,
    manualCandidateIds: (result?.manualCandidates || [])
      .slice(0, 4)
      .map((candidate) => candidate.manualId),
    topChunkCitations: (result?.topChunks || [])
      .slice(0, 4)
      .map((chunk) => chunk.citation),
    needsClarification: result?.needsClarification ?? null,
    isRisky: result?.isRisky ?? null,
    error,
  }
}

/**
 * Decides whether the chat route should run a manuals lookup for this turn.
 *
 * @param intent - Visitor intent captured by the chat form; may be empty.
 * @param query - Text assembled from the visitor's recent messages.
 * @returns `false` for an empty query; otherwise `true` when the query looks
 *   like a model token or contains one of `MANUAL_QUERY_HINTS`.
 */
export function shouldUseManualKnowledgeForChat(
  intent: string | null | undefined,
  query: string
) {
  const normalizedQuery = normalizeText(query)
  if (!normalizedQuery) {
    return false
  }

  // Evaluated once; the model-token check feeds both the hint flag and the fallback.
  const looksLikeModel = looksLikeModelTokenQuery(normalizedQuery)
  const hasManualHints =
    looksLikeModel ||
    MANUAL_QUERY_HINTS.some((hint) => normalizedQuery.includes(hint))

  const normalizedIntent = normalizeText(intent || "")
  const isManualIntent =
    normalizedIntent.includes("manual") ||
    normalizedIntent.includes("repair") ||
    normalizedIntent.includes("parts")

  // NOTE(review): both paths reduce to `hasManualHints`, so the intent does not
  // currently change the outcome. Confirm whether manual/repair/parts intents
  // were meant to use a different threshold.
  if (isManualIntent) {
    return hasManualHints
  }

  return looksLikeModel || hasManualHints
}
/**
 * Manuals knowledge provider backed by an artifact that is built once per
 * process and kept in memory until `resetCache` is called.
 */
class LocalManualKnowledgeProvider implements ManualKnowledgeProvider {
  /** In-flight or completed artifact build; `null` until first use or after a reset. */
  private artifactPromise: Promise<ManualKnowledgeArtifact> | null = null

  /**
   * Ranks every manual against the query.
   *
   * @returns Up to six candidates with a positive score, ordered by `compareByScore`.
   */
  async findManualCandidates(query: string) {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query)

    return artifact.manuals
      .map((manual) => buildCandidate(manual, scoreManual(manual, prepared)))
      .filter((candidate) => candidate.score > 0)
      .sort(compareByScore)
      .slice(0, 6)
  }

  /**
   * Ranks manuals, scores every chunk, and returns the strongest excerpts.
   *
   * @param query - Free-text question.
   * @param filters - Optional manufacturer/model/manual narrowing passed to `prepareQuery`.
   * @returns The best manual, up to four candidates, up to five chunks, and
   *   the clarification/risk flags.
   */
  async retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult> {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query, filters)

    const rankedManuals = artifact.manuals
      .map((manual) => ({ manual, score: scoreManual(manual, prepared) }))
      .filter((entry) => entry.score > 0)
      .sort((left, right) => right.score - left.score)

    // Index the ranking once so each chunk lookup is O(1) rather than a scan
    // of the candidate list. The first (highest-ranked) entry per ID wins,
    // which is what a linear `find` would have returned.
    const rankedById = new Map<string, (typeof rankedManuals)[number]>()
    for (const entry of rankedManuals) {
      if (!rankedById.has(entry.manual.manualId)) {
        rankedById.set(entry.manual.manualId, entry)
      }
    }

    const shortlistedIds = new Set(
      rankedManuals.slice(0, 10).map((entry) => entry.manual.manualId)
    )

    const topChunks = artifact.chunks
      .map((chunk) => {
        const manualScore = rankedById.get(chunk.manualId)?.score ?? 0
        // Chunks from manuals outside the top ten carry a reduced manual score.
        const weightedManualScore = shortlistedIds.has(chunk.manualId)
          ? manualScore
          : manualScore * 0.35

        return { chunk, score: scoreChunk(chunk, prepared, weightedManualScore) }
      })
      .filter((entry) => entry.score > 10)
      .sort((left, right) => right.score - left.score)
      .slice(0, 5)
      .map(({ chunk, score }) => ({
        ...stripChunkForResponse(chunk),
        matchScore: score,
      }))

    const leadChunk = topChunks[0]
    const leadManual = rankedManuals[0]

    // Ask for clarification when there is no excerpt, the best excerpt is
    // weak, or the top manual is missing or low-confidence.
    const needsClarification =
      !leadChunk ||
      leadChunk.matchScore < 24 ||
      !leadManual ||
      buildCandidate(leadManual.manual, leadManual.score).confidence < 0.48

    // Prefer the manual behind the best excerpt; fall back to the top-ranked manual.
    const chunkManual = leadChunk
      ? rankedById.get(leadChunk.manualId)
      : undefined
    const bestEntry = chunkManual ?? leadManual
    const bestManual = bestEntry
      ? buildCandidate(bestEntry.manual, bestEntry.score)
      : null

    return {
      query,
      bestManual,
      manualCandidates: rankedManuals
        .slice(0, 4)
        .map(({ manual, score }) => buildCandidate(manual, score)),
      topChunks,
      needsClarification,
      isRisky: isRiskyManualQuery(query),
    }
  }

  /**
   * Returns up to five excerpts for one manual.
   *
   * @param manualId - Manual to cite; an unknown ID yields `manual: null` and no citations.
   * @param pageNumber - When a number is given, only chunks on that page are returned.
   */
  async getManualCitationContext(manualId: string, pageNumber?: number) {
    const artifact = await this.getArtifact()
    const manual = artifact.manualById.get(manualId) || null
    const chunks = artifact.chunksByManualId.get(manualId) || []
    const citations = chunks
      .filter((chunk) =>
        typeof pageNumber === "number" ? chunk.pageNumber === pageNumber : true
      )
      .slice(0, 5)
      .map(stripChunkForResponse)

    return {
      manual: manual ? buildCandidate(manual, 1) : null,
      citations,
    }
  }

  /**
   * Returns the shared artifact, starting the build on first use.
   *
   * A failed build is not kept: the cached promise is cleared on rejection so
   * the next call retries instead of replaying the same error until the
   * process restarts. The current caller still receives the rejection.
   */
  private async getArtifact() {
    if (!this.artifactPromise) {
      const pending: Promise<ManualKnowledgeArtifact> = buildArtifact()
      this.artifactPromise = pending

      pending.catch(() => {
        // Clear only our own promise; a reset may already have replaced it.
        if (this.artifactPromise === pending) {
          this.artifactPromise = null
        }
      })
    }

    return await this.artifactPromise
  }

  /** Forgets the cached artifact so the next call rebuilds it. */
  resetCache() {
    this.artifactPromise = null
  }
}
ensureManualEntry({ + manualById, + manuals, + matchedManual: manual, + fallbackManualId: normalizeIdentifier(stripExtension(extracted.filename || "")), + filename: manual.filename, + manufacturer: manual.manufacturer, + category: manual.category, + model: manual.model, + manualType: manual.manualType, + manualUrl: manual.manualUrl, + thumbnailUrl: manual.thumbnailUrl, + }) + : ensureManualEntry({ + manualById, + manuals, + matchedManual: null, + fallbackManualId: normalizeIdentifier(stripExtension(extracted.filename || "manual")), + filename: extracted.filename || "manual.pdf", + manufacturer: "Unknown", + category: "manual", + model: null, + manualType: "manual", + manualUrl: null, + thumbnailUrl: null, + }) + + if (manualsWithStructuredChunks.has(sourceManual.manualId)) { + continue + } + + const fallbackChunks = buildExtractedFallbackChunks(extracted, sourceManual) + if (fallbackChunks.length > 0) { + addChunks(chunks, chunksByManualId, fallbackChunks) + } + } + + return { + manuals, + manualById, + chunks, + chunksByManualId, + } +} + +async function loadSharedPlatformArtifact(): Promise { + const platformRoot = resolveManualsPlatformRoot() + if (!platformRoot) { + return null + } + + const tenantId = getManualsPlatformTenantId() + const tenantDir = join(platformRoot, "output", "tenants", tenantId) + const manualsPath = join(tenantDir, "manuals.json") + const chunksPath = join(tenantDir, "chunks.json") + + if (!existsSync(manualsPath) || !existsSync(chunksPath)) { + return null + } + + const [manualRecords, chunkRecords] = await Promise.all([ + readJsonFile(manualsPath), + readJsonFile(chunksPath), + ]) + + const manuals: ManualInternal[] = manualRecords.map((manual) => { + const searchTerms = [ + manual.filename, + manual.sourcePath, + manual.manufacturer, + manual.category, + manual.model, + manual.manualType, + ...(manual.sourceFilenames || []), + ] + .filter(Boolean) + .join(" ") + + return { + manualId: manual.manualId, + filename: manual.filename, + 
manufacturer: manual.manufacturer, + category: manual.category || manual.manualType || "manual", + model: manual.model || null, + manualType: normalizeManualType(manual.manualType), + manualUrl: toSiteManualUrl(manual.manualUrl || manual.sourcePath || null), + thumbnailUrl: toSiteThumbnailUrl(manual.thumbnailUrl || null), + searchText: normalizeText(searchTerms), + tokenSet: new Set(tokenize(searchTerms)), + normalizedManufacturer: canonicalManufacturer(manual.manufacturer), + } + }) + + const manualById = new Map() + for (const manual of manuals) { + manualById.set(manual.manualId, manual) + } + + const chunks: ChunkInternal[] = chunkRecords.map((chunk) => { + const manual = manualById.get(chunk.manualId) || null + const filename = manual?.filename || humanizeToken(chunk.title || chunk.manualId) + const manufacturer = manual?.manufacturer || chunk.manufacturer + const manualUrl = manual?.manualUrl || toSiteManualUrl(chunk.manualUrl || null) + const thumbnailUrl = + manual?.thumbnailUrl || toSiteThumbnailUrl(chunk.thumbnailUrl || null) + const searchText = normalizeText( + [ + filename, + manufacturer, + chunk.model, + chunk.sectionTitle, + chunk.text, + chunk.manualType, + ] + .filter(Boolean) + .join(" ") + ) + + return { + manualId: chunk.manualId, + filename, + manufacturer, + model: chunk.model, + manualType: normalizeManualType(chunk.manualType), + pageNumber: chunk.pageNumber, + sectionTitle: chunk.sectionTitle, + text: chunk.text, + manualUrl, + thumbnailUrl, + sourceConfidence: clamp( + chunk.metadataConfidence ?? chunk.overallQuality ?? 
0.76 + ), + matchScore: 0, + citation: buildCitation(filename, manufacturer, chunk.pageNumber), + searchText, + tokenSet: new Set(tokenize(searchText)), + } + }) + + const chunksByManualId = new Map() + for (const chunk of chunks) { + const existing = chunksByManualId.get(chunk.manualId) || [] + existing.push(chunk) + chunksByManualId.set(chunk.manualId, existing) + } + + return { + manuals, + manualById, + chunks, + chunksByManualId, + } +} + +function resolveManualsPlatformRoot() { + const candidates = [ + process.env.MANUALS_PLATFORM_ROOT, + join(process.cwd(), "..", "manuals-platform"), + join(process.cwd(), "manuals-platform"), + ].filter(Boolean) as string[] + + for (const candidate of candidates) { + if (existsSync(candidate)) { + return candidate + } + } + + return null +} + +function getManualsPlatformTenantId() { + const value = + process.env.MANUALS_PLATFORM_TENANT_ID || + process.env.SITE_MANUALS_TENANT_ID || + DEFAULT_MANUALS_PLATFORM_TENANT_ID + + return value.trim() || DEFAULT_MANUALS_PLATFORM_TENANT_ID +} + +function toSiteManualUrl(value: string | null) { + if (!value) { + return null + } + + if (/^https?:\/\//i.test(value)) { + return value + } + + const relativePath = value.replace(/^manuals\//i, "").replace(/^\/manuals\//i, "") + return buildManualAssetUrl(relativePath) +} + +function toSiteThumbnailUrl(value: string | null) { + if (!value) { + return null + } + + if (/^https?:\/\//i.test(value)) { + return value + } + + const relativePath = value + .replace(/^thumbnails\//i, "") + .replace(/^\/thumbnails\//i, "") + return buildThumbnailAssetUrl(relativePath) +} + +async function loadManualCatalog() { + const convexManuals = await listConvexManuals().catch(() => [] as Manual[]) + const filesystemManuals = + convexManuals.length > 0 ? [] : await scanManuals().catch(() => [] as Manual[]) + + const sourceManuals = convexManuals.length > 0 ? 
convexManuals : filesystemManuals + const deduped = new Map() + + for (const manual of sourceManuals) { + const manualId = normalizeIdentifier(manual.path || manual.filename) + const manufacturer = manual.manufacturer || "Unknown" + const filename = manual.filename || basename(manual.path) + const category = manual.category || "manual" + const model = guessModelFromManual(manual) + const searchTerms = [ + filename, + stripExtension(filename), + manual.path, + manufacturer, + category, + ...(manual.searchTerms || []), + ...(manual.commonNames || []), + ...aliasesForManufacturer(manufacturer), + model || "", + ] + .filter(Boolean) + .join(" ") + + deduped.set(manualId, { + manualId, + filename, + manufacturer, + category, + model, + manualType: normalizeManualType(category), + manualUrl: getManualUrl(manual), + thumbnailUrl: getThumbnailUrl(manual), + searchText: normalizeText(searchTerms), + tokenSet: new Set(tokenize(searchTerms)), + normalizedManufacturer: canonicalManufacturer(manufacturer), + }) + } + + return Array.from(deduped.values()) +} + +async function loadStructuredManualRecords() { + const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR) + const entries = await readdir(directory, { withFileTypes: true }) + const files = entries + .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith(".json")) + .map((entry) => entry.name) + + return await Promise.all( + files.map(async (filename) => { + const path = join(directory, filename) + const parsed = await readJsonFile(path) + return { + ...parsed, + manualId: parsed.manualId || stripExtension(filename), + } + }) + ) +} + +async function loadExtractedContentByFilename() { + const path = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE) + const records = await readJsonFile(path) + const map = new Map() + + for (const record of records) { + if (!record.filename) { + continue + } + + map.set(normalizeIdentifier(record.filename), record) + } + + return map +} + +function 
matchStructuredRecordToManual( + record: StructuredManualRecord, + manuals: ManualInternal[] +) { + const query = [ + record.manualId, + record.manufacturer, + record.model, + record.manualType, + ] + .filter(Boolean) + .join(" ") + + const prepared = prepareQuery(query, { + manufacturer: record.manufacturer, + model: record.model, + }) + + const best = manuals + .map((manual) => ({ + manual, + score: scoreManual(manual, prepared), + })) + .sort((left, right) => right.score - left.score)[0] + + return best && best.score >= 12 ? best.manual : null +} + +function matchExtractedRecordToManual( + record: ExtractedManualRecord, + manuals: ManualInternal[] +) { + const filename = record.filename || "" + const normalizedFilename = normalizeIdentifier(filename) + const exactMatch = manuals.find( + (manual) => + normalizeIdentifier(manual.filename) === normalizedFilename || + normalizeIdentifier(manual.manualId) === normalizedFilename + ) + + if (exactMatch) { + return exactMatch + } + + const query = stripExtension(filename) + const prepared = prepareQuery(query) + const best = manuals + .map((manual) => ({ + manual, + score: scoreManual(manual, prepared), + })) + .sort((left, right) => right.score - left.score)[0] + + return best && best.score >= 10 ? best.manual : null +} + +function buildStructuredChunks( + record: StructuredManualRecord, + manual: ManualInternal +) { + const chunks: ChunkInternal[] = [] + const sections = record.content?.sections || [] + const troubleshooting = record.content?.troubleshooting || [] + const parts = record.content?.partsDatabase || [] + + for (const section of sections) { + const text = cleanChunkText(section.text || "") + if (!isUsefulChunkText(text)) { + continue + } + + chunks.push( + makeChunk({ + manual, + pageNumber: section.pageNumber ?? 
null, + sectionTitle: cleanSectionTitle(section.title), + text, + sourceConfidence: 0.92, + }) + ) + } + + for (const item of troubleshooting) { + const problem = cleanChunkText(item.problem || "") + const solution = cleanChunkText(item.solution || "") + const text = cleanChunkText( + [problem ? `Problem: ${problem}` : "", solution ? `Likely cause or solution: ${solution}` : ""] + .filter(Boolean) + .join("\n") + ) + + if (!isUsefulChunkText(text)) { + continue + } + + chunks.push( + makeChunk({ + manual, + pageNumber: item.pageNumber ?? null, + sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting", + text, + sourceConfidence: 0.95, + }) + ) + } + + const partChunks = buildPartChunks(parts, manual, 0.8) + addChunks(chunks, new Map(), partChunks) + + return chunks +} + +function buildExtractedFallbackChunks( + record: ExtractedManualRecord, + manual: ManualInternal +) { + const chunks: ChunkInternal[] = [] + const pages = record.text?.pages || [] + + for (const page of pages) { + const text = cleanChunkText(page.text || "") + if (!isUsefulChunkText(text)) { + continue + } + + chunks.push( + makeChunk({ + manual, + pageNumber: page.pageNumber ?? null, + sectionTitle: page.pageNumber ? `Page ${page.pageNumber}` : "Manual page", + text, + sourceConfidence: 0.72, + }) + ) + } + + const partChunks = buildExtractedPartListChunks(record.partsLists || [], manual) + addChunks(chunks, new Map(), partChunks) + + return chunks +} + +function buildPartChunks( + parts: StructuredPart[], + manual: ManualInternal, + sourceConfidence: number +) { + const partsByPage = new Map() + + for (const part of parts) { + const partNumber = cleanChunkText(part.partNumber || "") + const description = cleanChunkText(part.description || "") + if (partNumber.length < 2 && description.length < 6) { + continue + } + + const pageNumber = part.pageNumber ?? 0 + const summary = description + ? 
`Part ${partNumber}: ${description}` + : `Part ${partNumber}` + + const bucket = partsByPage.get(pageNumber) || [] + if (bucket.length < 8) { + bucket.push(summary) + partsByPage.set(pageNumber, bucket) + } + } + + return Array.from(partsByPage.entries()).map(([pageNumber, summaries]) => + makeChunk({ + manual, + pageNumber: pageNumber || null, + sectionTitle: "Parts reference", + text: summaries.join("\n"), + sourceConfidence, + }) + ) +} + +function buildExtractedPartListChunks( + partLists: ExtractedPartList[], + manual: ManualInternal +) { + const flattened: StructuredPart[] = [] + + for (const partList of partLists) { + for (const part of partList.parts || []) { + flattened.push({ + ...part, + pageNumber: part.pageNumber ?? partList.pageNumber, + }) + } + } + + return buildPartChunks(flattened, manual, 0.76) +} + +function addChunks( + target: ChunkInternal[], + chunksByManualId: Map, + chunks: ChunkInternal[] +) { + for (const chunk of chunks) { + target.push(chunk) + + const existing = chunksByManualId.get(chunk.manualId) || [] + existing.push(chunk) + chunksByManualId.set(chunk.manualId, existing) + } +} + +function ensureManualEntry(args: { + manualById: Map + manuals: ManualInternal[] + matchedManual: ManualInternal | null + fallbackManualId: string + filename: string + manufacturer: string + category: string + model: string | null + manualType: string + manualUrl: string | null + thumbnailUrl: string | null +}) { + if (args.matchedManual) { + return args.matchedManual + } + + const manualId = normalizeIdentifier(args.fallbackManualId) + const existing = args.manualById.get(manualId) + if (existing) { + return existing + } + + const fallback: ManualInternal = { + manualId, + filename: args.filename, + manufacturer: args.manufacturer, + category: args.category, + model: args.model, + manualType: args.manualType, + manualUrl: args.manualUrl, + thumbnailUrl: args.thumbnailUrl, + searchText: normalizeText( + [ + args.filename, + args.manufacturer, + 
args.category, + args.model, + args.manualType, + ] + .filter(Boolean) + .join(" ") + ), + tokenSet: new Set( + tokenize( + [ + args.filename, + args.manufacturer, + args.category, + args.model, + args.manualType, + ] + .filter(Boolean) + .join(" ") + ) + ), + normalizedManufacturer: canonicalManufacturer(args.manufacturer), + } + + args.manuals.push(fallback) + args.manualById.set(fallback.manualId, fallback) + return fallback +} + +function prepareQuery(query: string, filters?: ManualKnowledgeFilters): PreparedQuery { + const normalized = normalizeText(query) + const tokenSet = new Set(tokenize(normalized)) + const modelTokens = extractModelTokens(normalized) + const manufacturerFilters = new Set() + + for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) { + for (const alias of aliases) { + if (normalized.includes(normalizeText(alias))) { + manufacturerFilters.add(canonical) + } + } + } + + if (filters?.manufacturer) { + manufacturerFilters.add(canonicalManufacturer(filters.manufacturer)) + } + + if (filters?.model) { + tokenSet.add(normalizeText(filters.model)) + } + + return { + normalized, + tokens: Array.from(tokenSet), + modelTokens, + manufacturerFilters: Array.from(manufacturerFilters), + manualIdFilter: filters?.manualId + ? normalizeIdentifier(filters.manualId) + : null, + } +} + +function scoreManual(manual: ManualInternal, query: PreparedQuery) { + if (!query.normalized) { + return 0 + } + + let score = 0 + + if (query.manualIdFilter) { + score += query.manualIdFilter === manual.manualId ? 28 : -12 + } + + if (query.manufacturerFilters.length > 0) { + score += query.manufacturerFilters.includes(manual.normalizedManufacturer) + ? 14 + : -6 + } + + if (manual.searchText.includes(query.normalized)) { + score += 24 + } + + for (const token of query.tokens) { + if (!token) { + continue + } + + if (manual.tokenSet.has(token)) { + score += query.modelTokens.includes(token) ? 
8 : 4 + continue + } + + if (token.length >= 4 && manual.searchText.includes(token)) { + score += 1.5 + } + } + + for (const token of query.modelTokens) { + if (manual.searchText.includes(token)) { + score += 8 + } + } + + if ( + query.normalized.includes("error") || + query.normalized.includes("not ") || + query.normalized.includes("coin") || + query.normalized.includes("bill") + ) { + if ( + manual.manualType.includes("repair") || + manual.manualType.includes("service") || + manual.manualType.includes("parts") + ) { + score += 6 + } + } + + return score +} + +function scoreChunk( + chunk: ChunkInternal, + query: PreparedQuery, + manualScore: number +) { + if (!query.normalized) { + return manualScore + } + + let score = manualScore + chunk.sourceConfidence * 8 + + if (query.manufacturerFilters.length > 0) { + score += query.manufacturerFilters.includes(canonicalManufacturer(chunk.manufacturer)) + ? 6 + : -4 + } + + if (chunk.searchText.includes(query.normalized)) { + score += 18 + } + + for (const token of query.tokens) { + if (!token) { + continue + } + + if (chunk.tokenSet.has(token)) { + score += query.modelTokens.includes(token) ? 
7 : 3 + continue + } + + if (token.length >= 5 && chunk.searchText.includes(token)) { + score += 1 + } + } + + if ( + query.normalized.includes("parts") || + query.normalized.includes("part") || + query.normalized.includes("bill") || + query.normalized.includes("coin") + ) { + if ((chunk.sectionTitle || "").toLowerCase().includes("parts")) { + score += 6 + } + } + + if ( + query.normalized.includes("error") || + query.normalized.includes("not ") || + query.normalized.includes("won t") || + query.normalized.includes("wont") + ) { + if ((chunk.sectionTitle || "").toLowerCase().includes("troubleshooting")) { + score += 5 + } + } + + return score +} + +function buildCandidate(manual: ManualInternal, score: number): ManualCandidate { + return { + manualId: manual.manualId, + filename: manual.filename, + manufacturer: manual.manufacturer, + category: manual.category, + manualUrl: manual.manualUrl, + thumbnailUrl: manual.thumbnailUrl, + score, + confidence: clamp(score / 38), + } +} + +function makeChunk(args: { + manual: ManualInternal + pageNumber: number | null + sectionTitle: string | null + text: string + sourceConfidence: number +}) { + const searchText = normalizeText( + [ + args.manual.filename, + args.manual.manufacturer, + args.manual.model, + args.sectionTitle, + args.text, + ] + .filter(Boolean) + .join(" ") + ) + + return { + manualId: args.manual.manualId, + filename: args.manual.filename, + manufacturer: args.manual.manufacturer, + model: args.manual.model, + manualType: args.manual.manualType, + pageNumber: args.pageNumber, + sectionTitle: args.sectionTitle, + text: args.text, + manualUrl: args.manual.manualUrl, + thumbnailUrl: args.manual.thumbnailUrl, + sourceConfidence: args.sourceConfidence, + matchScore: 0, + citation: buildCitation( + args.manual.filename, + args.manual.manufacturer, + args.pageNumber + ), + searchText, + tokenSet: new Set(tokenize(searchText)), + } satisfies ChunkInternal +} + +function stripChunkForResponse(chunk: ChunkInternal): 
ManualKnowledgeChunk { + return { + manualId: chunk.manualId, + filename: chunk.filename, + manufacturer: chunk.manufacturer, + model: chunk.model, + manualType: chunk.manualType, + pageNumber: chunk.pageNumber, + sectionTitle: chunk.sectionTitle, + text: chunk.text, + manualUrl: chunk.manualUrl, + thumbnailUrl: chunk.thumbnailUrl, + sourceConfidence: chunk.sourceConfidence, + matchScore: chunk.matchScore, + citation: chunk.citation, + } +} + +function cleanChunkText(value: string) { + return value + .replace(/\u00ad/g, "") + .replace(/\s+/g, " ") + .trim() +} + +function cleanSectionTitle(value?: string) { + const cleaned = cleanChunkText(value || "") + return cleaned || null +} + +function isUsefulChunkText(text: string) { + if (!text) { + return false + } + + if (text.length < 70) { + return /error|vend|coin|bill|cool|stuck|motor|sensor|jam|door|part/i.test( + text + ) + } + + const alphaChars = text.replace(/[^a-z]/gi, "").length + if (alphaChars < 25) { + return false + } + + if (/^(table of contents|contact us)$/i.test(text)) { + return false + } + + return true +} + +function normalizeText(value: string) { + return value + .normalize("NFKD") + .replace(/[^a-zA-Z0-9]+/g, " ") + .toLowerCase() + .trim() +} + +function tokenize(value: string) { + return normalizeText(value) + .split(" ") + .map((token) => token.trim()) + .filter( + (token) => + token.length > 1 && + !STOPWORDS.has(token) && + !/^page\d*$/.test(token) + ) +} + +function normalizeIdentifier(value: string) { + return normalizeText(stripExtension(value)).replace(/\s+/g, "-") +} + +function stripExtension(value: string) { + return value.replace(/\.pdf$/i, "").replace(/\.json$/i, "") +} + +function canonicalManufacturer(value: string) { + const normalized = normalizeText(value) + + for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) { + if ( + canonical === normalized || + aliases.some((alias) => normalized.includes(normalizeText(alias))) + ) { + return canonical + } + } + + 
return normalized +} + +function aliasesForManufacturer(value: string) { + const canonical = canonicalManufacturer(value) + return MANUFACTURER_ALIASES[canonical] || [value] +} + +function guessModelFromManual(manual: Manual) { + const filename = normalizeText(stripExtension(manual.filename || "")) + const modelToken = extractModelTokens(filename)[0] + return modelToken || null +} + +function normalizeManualType(value?: string | null) { + const normalized = normalizeText(value || "") + if (!normalized) { + return "manual" + } + + if (normalized.includes("part")) { + return "parts" + } + + if (normalized.includes("operator")) { + return "operator" + } + + if (normalized.includes("service")) { + return "service" + } + + return normalized.replace(/\s+/g, "-") +} + +function normalizeNullable(value?: string | null) { + const normalized = cleanChunkText(value || "") + return normalized || null +} + +function extractModelTokens(value: string) { + const matches = + normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9]*\b/g) || [] + return Array.from(new Set(matches)) +} + +function looksLikeModelTokenQuery(value: string) { + return extractModelTokens(value).length > 0 +} + +function isRiskyManualQuery(value: string) { + const normalized = normalizeText(value) + return RISKY_MANUAL_KEYWORDS.some((keyword) => + normalized.includes(normalizeText(keyword)) + ) +} + +function buildCitation( + filename: string, + manufacturer: string, + pageNumber: number | null +) { + return `${buildManualLabel(filename, manufacturer)}${ + pageNumber ? `, page ${pageNumber}` : "" + }` +} + +function buildManualLabel(filename: string, manufacturer: string) { + const stem = humanizeToken(stripExtension(filename)) + const prefix = manufacturer ? 
`${manufacturer} ` : "" + return `${prefix}${stem}`.trim() +} + +function humanizeToken(value: string) { + return value + .replace(/[-_]+/g, " ") + .replace(/\s+/g, " ") + .trim() +} + +function compareByScore(left: ManualCandidate, right: ManualCandidate) { + return right.score - left.score +} + +function clamp(value: number) { + return Math.max(0, Math.min(1, value)) +} + +function truncateText(value: string, maxLength: number) { + if (value.length <= maxLength) { + return value + } + + return `${value.slice(0, maxLength - 1).trimEnd()}…` +} + +async function readJsonFile(path: string) { + return JSON.parse(await readFile(path, "utf8")) as T +} diff --git a/lib/manuals-qdrant-corpus.test.ts b/lib/manuals-qdrant-corpus.test.ts new file mode 100644 index 00000000..cfc1b860 --- /dev/null +++ b/lib/manuals-qdrant-corpus.test.ts @@ -0,0 +1,114 @@ +import assert from "node:assert/strict" +import test from "node:test" +import { + evaluateManualsQdrantCorpus, + getDefaultManualsQdrantEvaluationCases, + getManualsQdrantCorpus, + resetManualsQdrantCorpusCache, + searchManualsQdrantCorpus, +} from "@/lib/manuals-qdrant-corpus" + +const corpusPromise = getManualsQdrantCorpus() + +test.after(() => { + resetManualsQdrantCorpusCache() +}) + +test("manuals qdrant corpus builds from the full structured and extracted datasets", async () => { + const corpus = await corpusPromise + + assert.equal(corpus.stats.structuredRecords, 497) + assert.equal(corpus.stats.extractedRecords, 497) + assert.equal(corpus.stats.chunkCount > 20000, true) + assert.equal(corpus.stats.highConfidenceChunks > corpus.stats.fallbackChunks, true) + assert.equal(corpus.manuals.some((manual) => manual.manualId === "unknown-unknown-manual"), false) +}) + +test("canonical manufacturers cover core vending families after normalization", async () => { + const corpus = await corpusPromise + + const manufacturers = new Set(corpus.manuals.map((manual) => manual.manufacturer)) + + assert.equal(manufacturers.has("Royal 
Vendors"), true) + assert.equal(manufacturers.has("Dixie-Narco"), true) + assert.equal(manufacturers.has("Crane"), true) + assert.equal(manufacturers.has("AP"), true) + assert.equal(manufacturers.has("Coinco"), true) + assert.equal(manufacturers.has("Other"), true) +}) + +test("fault queries prefer troubleshooting over brochure content", async () => { + const corpus = await corpusPromise + const results = searchManualsQdrantCorpus( + corpus, + "Royal machine not accepting coins", + { + profile: "public_safe", + limit: 5, + } + ) + + assert.equal(results.length > 0, true) + assert.equal(results[0]?.chunk.labels.includes("troubleshooting"), true) + assert.equal(results[0]?.chunk.labels.includes("brochure"), false) + assert.equal(results.some((result) => result.chunk.labels.includes("brochure")), false) +}) + +test("public-safe profile filters risky wiring chunks while internal-tech keeps them available", async () => { + const corpus = await corpusPromise + const publicResults = searchManualsQdrantCorpus( + corpus, + "Royal wiring diagram voltage issue", + { + profile: "public_safe", + limit: 5, + } + ) + const internalResults = searchManualsQdrantCorpus( + corpus, + "Royal wiring diagram voltage issue", + { + profile: "internal_tech", + limit: 5, + } + ) + + assert.equal( + publicResults.some((result) => result.chunk.labels.includes("wiring")), + false + ) + assert.equal( + internalResults.some((result) => result.chunk.labels.includes("wiring")), + true + ) +}) + +test("default evaluation set passes before the corpus is treated as production-ready", async () => { + const corpus = await corpusPromise + const evaluation = evaluateManualsQdrantCorpus( + corpus, + getDefaultManualsQdrantEvaluationCases() + ) + + assert.equal(evaluation.summary.totalCases, 6) + assert.equal( + evaluation.cases.every( + (entry) => + entry.passedTop3Manufacturer !== false && + entry.passedTop5Label && + entry.passedDisallowedCheck + ), + true + ) +}) + +test("manuals qdrant corpus cache can 
be rebuilt on demand", async () => { + const firstCorpus = await getManualsQdrantCorpus() + + resetManualsQdrantCorpusCache() + + const secondCorpus = await getManualsQdrantCorpus() + + assert.notEqual(firstCorpus, secondCorpus) + assert.equal(secondCorpus.stats.structuredRecords, 497) +}) diff --git a/lib/manuals-qdrant-corpus.ts b/lib/manuals-qdrant-corpus.ts new file mode 100644 index 00000000..b0d10d35 --- /dev/null +++ b/lib/manuals-qdrant-corpus.ts @@ -0,0 +1,1734 @@ +import { mkdir, readFile, readdir, writeFile } from "node:fs/promises" +import { basename, join } from "node:path" +import { scanManuals } from "@/lib/manuals" +import { getManualsDataRoot } from "@/lib/manuals-paths" +import type { Manual } from "@/lib/manuals-types" +import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types" + +const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"] +const STRUCTURED_MANUALS_DIR = [ + ...MANUALS_OPTIMIZED_ROOT, + "training_data", + "structured", +] +const STRUCTURED_MANUALS_INDEX_FILE = [ + ...MANUALS_OPTIMIZED_ROOT, + "training_data", + "all_structured_data.json", +] +const EXTRACTED_CONTENT_FILE = [ + ...MANUALS_OPTIMIZED_ROOT, + "extracted_content", + "manuals_extracted_content.json", +] + +const STOPWORDS = new Set([ + "a", + "an", + "and", + "are", + "at", + "be", + "by", + "for", + "from", + "in", + "is", + "it", + "manual", + "machine", + "of", + "on", + "or", + "service", + "the", + "to", + "with", +]) + +const TECH_RISK_KEYWORDS = [ + "wiring", + "diagram", + "schematic", + "electrical", + "voltage", + "jumper", + "compressor", + "refrigerant", + "bypass", + "board level", +] + +const MARKETING_KEYWORDS = [ + "increase sales", + "more profits", + "contact us", + "operator can double up", + "your employees", + "productivity", + "variety", + "brochure", +] + +const SPECS_KEYWORDS = [ + "dimensions", + "height:", + "width:", + "depth:", + "shipping weight", + "electrical:", + "listings:", + "capacity", + "voltage", +] + +const 
TROUBLESHOOTING_KEYWORDS = [ + "probable cause", + "solution", + "troubleshooting", + "not accepting", + "will not vend", + "check fuse", + "error code", +] + +const OPERATOR_KEYWORDS = [ + "user guide", + "operators guide", + "operation", + "programming", + "setup guide", + "how to", +] + +const PARTS_KEYWORDS = [ + "parts manual", + "parts reference", + "part number", + "parts list", + "exploded view", +] + +const MANUFACTURER_ALIASES: Record = { + "AP": [ + "ap", + "automatic products", + "automatic-products", + "snackshop", + ], + "Other": ["other", "unknown", "bill mechs", "coin mechs"], + "Coinco": ["coinco"], + "Crane": [ + "crane", + "national vendors", + "national", + "merchant", + "merchant series", + ], + "Dixie-Narco": ["dixie", "narco", "dixie narco", "dixie-narco", "bevmax"], + "GPL": ["gpl", "general products"], + "MEI Mars": ["mei", "mars", "bill validator"], + "Royal Vendors": ["royal", "royal vendors", "royal vendor", "rvv"], + "Rowe": ["rowe"], + "Seaga": ["seaga"], + "USI": ["usi", "u select it", "u-select-it", "uselectit"], + "Vendo": ["vendo", "sanden"], +} + +export type ManualsQdrantProfile = "public_safe" | "internal_tech" + +export type ManualsQdrantChunkLabel = + | "brochure" + | "flowchart" + | "operator" + | "parts" + | "service" + | "specs" + | "toc" + | "troubleshooting" + | "wiring" + | "general" + +export type ManualsEmbeddingTier = + | "high_confidence" + | "fallback" + | "exclude" + +export type ManualsQdrantManual = { + manualId: string + title: string + manufacturer: string + manufacturerCanonical: string + model: string | null + manualType: string + category: string | null + manualUrl: string | null + thumbnailUrl: string | null + sourceFilenames: string[] + sourceRecordCount: number + metadataConfidence: number + parseQuality: number + duplicateRisk: number + chunkCount: number + highConfidenceChunkCount: number + profiles: ManualsQdrantProfile[] + embeddingTier: ManualsEmbeddingTier + flags: string[] +} + +export type 
ManualsQdrantChunk = { + chunkId: string + manualId: string + title: string + manufacturer: string + manufacturerCanonical: string + model: string | null + manualType: string + category: string | null + pageNumber: number | null + sectionTitle: string | null + text: string + sourceFilename: string | null + sourceKind: "ocr_page" | "parts_database" | "structured_section" | "troubleshooting" + labels: ManualsQdrantChunkLabel[] + manualUrl: string | null + thumbnailUrl: string | null + metadataConfidence: number + textQuality: number + overallQuality: number + embeddingTier: ManualsEmbeddingTier + profiles: ManualsQdrantProfile[] + isRisky: boolean + flags: string[] +} + +export type ManualsQdrantCorpusStats = { + catalogManuals: number + structuredRecords: number + extractedRecords: number + normalizedManuals: number + chunkCount: number + highConfidenceChunks: number + fallbackChunks: number + excludedChunks: number + manualsByManufacturer: Record + chunksByLabel: Record + profileCounts: Record +} + +export type ManualsQdrantCorpus = { + generatedAt: string + stats: ManualsQdrantCorpusStats + manuals: ManualsQdrantManual[] + chunks: ManualsQdrantChunk[] +} + +export type ManualsQdrantEvaluationCase = { + id: string + query: string + profile: ManualsQdrantProfile + expectedManufacturer?: string + expectedChunkLabels?: ManualsQdrantChunkLabel[] + disallowedChunkLabels?: ManualsQdrantChunkLabel[] +} + +export type ManualsQdrantSearchResult = { + chunk: ManualsQdrantChunk + score: number +} + +export type ManualsQdrantEvaluationResult = { + cases: Array<{ + id: string + query: string + profile: ManualsQdrantProfile + passedTop3Manufacturer: boolean | null + passedTop5Label: boolean + passedDisallowedCheck: boolean + topManufacturers: string[] + topLabels: ManualsQdrantChunkLabel[] + }> + summary: { + totalCases: number + top3ManufacturerPasses: number + labelPasses: number + disallowedPasses: number + } +} + +type StructuredSection = { + title?: string + pageNumber?: 
number + text?: string +} + +type StructuredTroubleshooting = { + problem?: string + solution?: string + pageNumber?: number +} + +type StructuredPart = { + partNumber?: string + description?: string + pageNumber?: number +} + +type StructuredManualRecord = { + manualId?: string + manufacturer?: string + model?: string + manualType?: string + sourceFilename?: string + metadata?: { + pageCount?: number + } + content?: { + sections?: StructuredSection[] + troubleshooting?: StructuredTroubleshooting[] + partsDatabase?: StructuredPart[] + specifications?: Record + } +} + +type ExtractedPage = { + pageNumber?: number + text?: string + wordCount?: number +} + +type ExtractedPartList = { + pageNumber?: number + parts?: StructuredPart[] +} + +type ExtractedManualRecord = { + filename?: string + filepath?: string + text?: { + fullText?: string + pages?: ExtractedPage[] + } + partsLists?: ExtractedPartList[] + sections?: StructuredSection[] +} + +type CatalogManual = { + manual: Manual + manufacturerCanonical: string + modelGuess: string | null + searchText: string + tokenSet: Set +} + +type ManualAccumulator = Omit< + ManualsQdrantManual, + | "metadataConfidence" + | "parseQuality" + | "duplicateRisk" + | "chunkCount" + | "highConfidenceChunkCount" + | "profiles" + | "embeddingTier" + | "flags" +> & { + metadataConfidenceTotal: number + metadataConfidenceSamples: number + hasStructured: boolean + hasTroubleshooting: boolean + hasOcrText: boolean + chunks: ManualsQdrantChunk[] + flagsSet: Set +} + +const DEFAULT_EVAL_CASES: ManualsQdrantEvaluationCase[] = [ + { + id: "rvv-660-service", + query: "RVV 660 service manual", + profile: "internal_tech", + expectedManufacturer: "Royal Vendors", + }, + { + id: "narco-bevmax-cooling", + query: "Narco bevmax not cooling", + profile: "public_safe", + expectedManufacturer: "Dixie-Narco", + expectedChunkLabels: ["service", "troubleshooting"], + }, + { + id: "coin-mech-dollars", + query: "coin mech not accepting dollars", + profile: 
/**
 * Returns the shared manuals corpus, building it on first use.
 *
 * The in-flight build promise is memoized in `manualsQdrantCorpusPromise` so
 * concurrent callers share a single build. If that build rejects, the
 * memoized promise is dropped, so the next call starts a fresh build instead
 * of replaying the same failure until `resetManualsQdrantCorpusCache()` is
 * called.
 *
 * @returns A promise for the corpus; it rejects if the build fails.
 */
export function getManualsQdrantCorpus(): Promise<ManualsQdrantCorpus> {
  if (manualsQdrantCorpusPromise) {
    return manualsQdrantCorpusPromise
  }

  const pending = buildManualsQdrantCorpus()
  manualsQdrantCorpusPromise = pending

  // This derived promise only clears the cache slot. The caller still receives
  // `pending` itself and remains responsible for handling its rejection.
  void pending.catch(() => {
    // A reset or a newer build may have replaced the slot in the meantime;
    // only evict the entry this call created.
    if (manualsQdrantCorpusPromise === pending) {
      manualsQdrantCorpusPromise = null
    }
  })

  return pending
}
`${record.manualType || ""} ${record.manualId || ""}` + ), + categoryHint: catalogMatch?.manual.category || null, + metadataConfidence: catalogMatch ? 0.86 : 0.32, + sourceRecordId: record.sourceFilename || record.manualId || filenameHint, + }) + + manual.hasStructured = true + manual.hasTroubleshooting ||= Boolean(record.content?.troubleshooting?.length) + + for (const chunk of buildStructuredChunks(record, manual)) { + addChunkToManual(manual, chunk, chunkDedup) + } + } + + for (const record of extractedRecords) { + const filename = record.filename || basename(record.filepath || "manual.pdf") + const catalogMatch = matchCatalogManual(filename, catalogManuals) + const manual = getOrCreateManualAccumulator({ + manuals, + catalogMatch, + filename, + recordManufacturer: null, + recordModel: null, + manualTypeHint: detectManualType(filename), + categoryHint: catalogMatch?.manual.category || null, + metadataConfidence: + catalogMatch && normalizeIdentifier(catalogMatch.manual.filename) === + normalizeIdentifier(filename) + ? 0.96 + : catalogMatch + ? 0.78 + : 0.36, + sourceRecordId: record.filename || record.filepath || "unknown-extracted", + }) + + manual.hasOcrText ||= hasUsefulOcrText(record) + + // Prefer structured/manual chunks where they exist and use OCR pages only + // as a fallback corpus for manuals we could not parse structurally. 
/**
 * Ranks corpus chunks against a free-text query with the lexical scorer.
 *
 * Only chunks whose embedding tier is not `"exclude"`, whose profiles include
 * the requested profile, and whose text is non-blank are scored. Entries with
 * a score of zero or less are dropped and the rest are returned best-first.
 *
 * @param corpus - Corpus to search.
 * @param query - Free-text query; it is normalized and tokenized before scoring.
 * @param options - `limit` caps the number of results (default 5); `profile`
 *   selects the audience filter (default `"internal_tech"`).
 * @returns At most `limit` results, sorted by descending score.
 */
export function searchManualsQdrantCorpus(
  corpus: ManualsQdrantCorpus,
  query: string,
  options?: {
    limit?: number
    profile?: ManualsQdrantProfile
  }
): ManualsQdrantSearchResult[] {
  // A negative limit passed straight to slice(0, limit) would trim results
  // from the END of the ranking rather than return nothing, so clamp it.
  const limit = Math.max(0, Math.floor(options?.limit ?? 5))
  if (!(limit > 0)) {
    // Covers 0 and NaN; no point scoring the corpus for an empty page.
    return []
  }

  const profile = options?.profile ?? "internal_tech"
  const normalizedQuery = normalizeText(query)
  const queryTokens = tokenize(normalizedQuery)
  const queryLower = normalizedQuery.toLowerCase()

  return corpus.chunks
    .filter(
      (chunk) =>
        chunk.embeddingTier !== "exclude" &&
        chunk.profiles.includes(profile) &&
        chunk.text.trim().length > 0
    )
    .map((chunk) => ({
      chunk,
      score: scoreChunkForQuery(chunk, queryTokens, queryLower),
    }))
    .filter((entry) => entry.score > 0)
    .sort((left, right) => right.score - left.score)
    .slice(0, limit)
}
/**
 * Builds a fresh corpus, evaluates it with the default cases, and writes the
 * resulting JSON artifacts to disk.
 *
 * Files written (pretty-printed with two-space indentation): `summary.json`,
 * `manuals.json`, `chunks.json`, `chunks-internal-tech.json`,
 * `chunks-public-safe.json`, `chunks-high-confidence.json`,
 * `evaluation-cases.json` and `evaluation-report.json`.
 *
 * @param args - Optional `outputDir`; defaults to `output/manuals-qdrant`
 *   under the current working directory.
 * @returns The directory written to, the corpus, and its evaluation.
 */
export async function writeManualsQdrantArtifacts(args?: {
  outputDir?: string
}) {
  const outputDir =
    args?.outputDir || join(process.cwd(), "output", "manuals-qdrant")
  const corpus = await buildManualsQdrantCorpus()
  const evaluation = evaluateManualsQdrantCorpus(corpus)

  const chunksForProfile = (profile: ManualsQdrantProfile) =>
    corpus.chunks.filter((chunk) => chunk.profiles.includes(profile))

  // Ordered list of [file name, payload]; the order is the write order.
  const artifacts: Array<[filename: string, payload: unknown]> = [
    [
      "summary.json",
      {
        generatedAt: corpus.generatedAt,
        stats: corpus.stats,
        evaluation: evaluation.summary,
      },
    ],
    ["manuals.json", corpus.manuals],
    ["chunks.json", corpus.chunks],
    ["chunks-internal-tech.json", chunksForProfile("internal_tech")],
    ["chunks-public-safe.json", chunksForProfile("public_safe")],
    [
      "chunks-high-confidence.json",
      corpus.chunks.filter(
        (chunk) => chunk.embeddingTier === "high_confidence"
      ),
    ],
    ["evaluation-cases.json", DEFAULT_EVAL_CASES],
    ["evaluation-report.json", evaluation],
  ]

  await mkdir(outputDir, { recursive: true })

  // Written one at a time so only one serialized payload is held in memory.
  for (const [filename, payload] of artifacts) {
    await writeFile(join(outputDir, filename), JSON.stringify(payload, null, 2))
  }

  return { outputDir, corpus, evaluation }
}
/**
 * Finds the catalog manual that best matches a raw identifier string.
 *
 * An exact match on the normalized, extension-less filename wins outright.
 * Otherwise every catalog entry is scored:
 * - manufacturer hint: +16 when it equals the entry's canonical manufacturer,
 *   -4 when it does not;
 * - model hint: +14 when it equals the entry's guessed model, else +8 when the
 *   hint (hyphens replaced by spaces) occurs in the entry's search text;
 * - whole normalized query contained in the search text: +20;
 * - each query token: +4 for an exact token hit, else +1.5 when the token has
 *   at least four characters and occurs as a substring.
 *
 * @param rawQuery - Filename or free-text identifier to match.
 * @param catalogManuals - Catalog entries to search.
 * @param hints - Optional manufacturer/model hints from the source record.
 * @returns The best entry when its score is at least 10, otherwise `null`.
 *   Ties go to the entry that appears first in `catalogManuals`.
 */
function matchCatalogManual(
  rawQuery: string,
  catalogManuals: CatalogManual[],
  hints?: {
    manufacturerHint?: string | null
    modelHint?: string | null
  }
) {
  const normalizedQuery = normalizeText(rawQuery)
  const tokens = tokenize(normalizedQuery)
  const manufacturerHint = hints?.manufacturerHint
    ? normalizeManufacturer(hints.manufacturerHint)
    : null
  const modelHint = hints?.modelHint ? normalizeIdentifier(hints.modelHint) : null

  // Query-side values are loop-invariant; compute them once rather than once
  // per catalog entry.
  const queryStem = normalizeIdentifier(stripExtension(rawQuery))
  const exactStemMatch = catalogManuals.find(
    (catalogManual) =>
      normalizeIdentifier(stripExtension(catalogManual.manual.filename)) ===
      queryStem
  )
  if (exactStemMatch) {
    return exactStemMatch
  }

  const queryPhrase = normalizedQuery.toLowerCase()
  const spacedModelHint = modelHint ? modelHint.replace(/-/g, " ") : null

  let best: { catalogManual: CatalogManual; score: number } | null = null

  for (const catalogManual of catalogManuals) {
    let score = 0

    if (manufacturerHint) {
      score += catalogManual.manufacturerCanonical === manufacturerHint ? 16 : -4
    }

    if (modelHint) {
      if (catalogManual.modelGuess === modelHint) {
        score += 14
      } else if (
        spacedModelHint !== null &&
        catalogManual.searchText.includes(spacedModelHint)
      ) {
        score += 8
      }
    }

    if (normalizedQuery && catalogManual.searchText.includes(queryPhrase)) {
      score += 20
    }

    for (const token of tokens) {
      if (catalogManual.tokenSet.has(token)) {
        score += 4
      } else if (token.length >= 4 && catalogManual.searchText.includes(token)) {
        score += 1.5
      }
    }

    // Strict ">" keeps the earliest entry on ties, which is what a stable
    // descending sort followed by taking the first element would select.
    if (best === null || score > best.score) {
      best = { catalogManual, score }
    }
  }

  return best !== null && best.score >= 10 ? best.catalogManual : null
}
/**
 * Converts one structured manual record into corpus chunks.
 *
 * Chunks are produced in this order: one per non-empty section, one per
 * non-empty troubleshooting entry, one per page of the parts database (at most
 * twelve part lines each), and a single "Specifications" chunk when the record
 * carries specifications that clean to non-empty text.
 *
 * @param record - Structured record to convert.
 * @param manual - Accumulator the chunks belong to; passed through to `createChunk`.
 * @returns The chunks in the order described above.
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualAccumulator
) {
  const content = record.content
  const sourceFilename =
    record.sourceFilename || manual.sourceFilenames[0] || null

  // Every chunk from this record shares the same manual and source file.
  const emit = (fields: {
    text: string
    pageNumber: number | null
    sectionTitle: string | null
    sourceKind: ManualsQdrantChunk["sourceKind"]
  }) => createChunk({ manual, sourceFilename, ...fields })

  const sectionChunks = (content?.sections || []).flatMap((section) => {
    const body = cleanText(section.text || "")
    if (!body) {
      return []
    }
    return [
      emit({
        text: body,
        pageNumber: section.pageNumber ?? null,
        sectionTitle: cleanText(section.title || "") || null,
        sourceKind: "structured_section",
      }),
    ]
  })

  const troubleshootingChunks = (content?.troubleshooting || []).flatMap(
    (item) => {
      const problem = cleanText(item.problem || "")
      const solution = cleanText(item.solution || "")

      const lines: string[] = []
      if (problem) {
        lines.push(`Problem: ${problem}`)
      }
      if (solution) {
        lines.push(`Likely cause or solution: ${solution}`)
      }

      const body = cleanText(lines.join("\n"))
      if (!body) {
        return []
      }
      return [
        emit({
          text: body,
          pageNumber: item.pageNumber ?? null,
          sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting",
          sourceKind: "troubleshooting",
        }),
      ]
    }
  )

  // Parts are grouped by page; page 0 stands for "page unknown".
  const partLinesByPage = new Map<number, string[]>()
  for (const part of content?.partsDatabase || []) {
    const partNumber = cleanText(part.partNumber || "")
    const description = cleanText(part.description || "")
    const isUsable = partNumber.length >= 2 || description.length >= 4
    if (!isUsable) {
      continue
    }

    const page = part.pageNumber ?? 0
    const line = description
      ? `Part ${partNumber}: ${description}`
      : `Part ${partNumber}`
    const bucket = partLinesByPage.get(page)
    if (bucket) {
      bucket.push(line)
    } else {
      partLinesByPage.set(page, [line])
    }
  }

  const partsChunks = Array.from(partLinesByPage, ([page, lines]) =>
    emit({
      text: lines.slice(0, 12).join("\n"),
      pageNumber: page || null,
      sectionTitle: "Parts reference",
      sourceKind: "parts_database",
    })
  )

  const specificationChunks: ManualsQdrantChunk[] = []
  const specifications = content?.specifications
  if (specifications) {
    const specsText = cleanText(
      Object.entries(specifications)
        .map(([key, value]) => `${humanizeTitle(key)}: ${String(value)}`)
        .join("\n")
    )
    if (specsText) {
      specificationChunks.push(
        emit({
          text: specsText,
          pageNumber: null,
          sectionTitle: "Specifications",
          sourceKind: "structured_section",
        })
      )
    }
  }

  return [
    ...sectionChunks,
    ...troubleshootingChunks,
    ...partsChunks,
    ...specificationChunks,
  ]
}
/**
 * Appends a chunk to its manual unless an equivalent chunk was already added.
 *
 * Two chunks are treated as equivalent when they share the manual id, page
 * number, normalized section title and the normalized first 180 characters of
 * text. A collapsed duplicate is recorded on the manual with the
 * `duplicate-chunk-collapsed` flag.
 *
 * @param manual - Accumulator that receives the chunk or the flag.
 * @param chunk - Candidate chunk.
 * @param chunkDedup - Fingerprints seen so far; updated in place.
 */
function addChunkToManual(
  manual: ManualAccumulator,
  chunk: ManualsQdrantChunk,
  chunkDedup: Set<string>
) {
  const pageKey = chunk.pageNumber ?? "na"
  const titleKey = normalizeIdentifier(chunk.sectionTitle || "")
  const textKey = normalizeIdentifier(chunk.text.slice(0, 180))
  const fingerprint = `${chunk.manualId}::${pageKey}::${titleKey}::${textKey}`

  if (!chunkDedup.has(fingerprint)) {
    chunkDedup.add(fingerprint)
    manual.chunks.push(chunk)
    return
  }

  manual.flagsSet.add("duplicate-chunk-collapsed")
}
/**
 * Aggregates headline counts for a finalized corpus.
 *
 * @param args.catalogManuals - Catalog entries that were loaded (only counted).
 * @param args.structuredRecords - Structured records that were loaded (only counted).
 * @param args.extractedRecords - Extracted records that were loaded (only counted).
 * @param args.manuals - Finalized manuals; tallied per `manufacturer`.
 * @param args.chunks - Finalized chunks; tallied per label, profile and
 *   embedding tier.
 * @returns Corpus statistics. A chunk with several labels or profiles is
 *   counted once under each of them.
 */
function buildCorpusStats(args: {
  catalogManuals: CatalogManual[]
  structuredRecords: StructuredManualRecord[]
  extractedRecords: ExtractedManualRecord[]
  manuals: Array<Pick<ManualsQdrantManual, "manufacturer">>
  chunks: ManualsQdrantChunk[]
}): ManualsQdrantCorpusStats {
  // Tally in Maps: manufacturer names come from data, and a plain object would
  // pick up inherited members for keys such as "constructor".
  const manufacturerTally = new Map<string, number>()
  for (const manual of args.manuals) {
    manufacturerTally.set(
      manual.manufacturer,
      (manufacturerTally.get(manual.manufacturer) ?? 0) + 1
    )
  }

  const labelTally = new Map<string, number>()
  const profileCounts: Record<ManualsQdrantProfile, number> = {
    public_safe: 0,
    internal_tech: 0,
  }
  const tierCounts: Record<ManualsEmbeddingTier, number> = {
    high_confidence: 0,
    fallback: 0,
    exclude: 0,
  }

  // One pass over the chunks covers labels, profiles and tiers.
  for (const chunk of args.chunks) {
    for (const label of chunk.labels) {
      labelTally.set(label, (labelTally.get(label) ?? 0) + 1)
    }

    for (const profile of chunk.profiles) {
      profileCounts[profile] += 1
    }

    tierCounts[chunk.embeddingTier] += 1
  }

  const manualsByManufacturer: Record<string, number> =
    Object.fromEntries(manufacturerTally)
  const chunksByLabel: Record<string, number> = Object.fromEntries(labelTally)

  return {
    catalogManuals: args.catalogManuals.length,
    structuredRecords: args.structuredRecords.length,
    extractedRecords: args.extractedRecords.length,
    normalizedManuals: args.manuals.length,
    chunkCount: args.chunks.length,
    highConfidenceChunks: tierCounts.high_confidence,
    fallbackChunks: tierCounts.fallback,
    excludedChunks: tierCounts.exclude,
    manualsByManufacturer,
    chunksByLabel,
    profileCounts,
  }
}
/**
 * Scores how well a chunk answers a query; higher is better.
 *
 * The score starts at ten times the chunk's overall quality. Each query token
 * adds 3.5 when it is one of the chunk's tokens, or 1 when it has at least
 * four characters and occurs as a substring. Intent boosts apply when the
 * query mentions certain words AND the chunk carries the matching label:
 * troubleshooting +10, parts +7, service +5, wiring +6. Brochure chunks lose
 * 5; table-of-contents or flowchart chunks lose 8.
 *
 * @param chunk - Chunk to score.
 * @param queryTokens - Tokens of the normalized query.
 * @param queryLower - Lower-cased normalized query, used for phrase checks.
 * @returns The relevance score (may be negative).
 */
function scoreChunkForQuery(
  chunk: ManualsQdrantChunk,
  queryTokens: string[],
  queryLower: string
) {
  const searchable = normalizeText(
    [
      chunk.title,
      chunk.manufacturer,
      chunk.model,
      chunk.sectionTitle,
      chunk.text,
      ...chunk.labels,
    ]
      .filter(Boolean)
      .join(" ")
  )
  const vocabulary = new Set(tokenize(searchable))

  const queryMentions = (...needles: string[]) =>
    needles.some((needle) => queryLower.includes(needle))
  const hasLabel = (label: ManualsQdrantChunkLabel) =>
    chunk.labels.includes(label)

  let total = chunk.overallQuality * 10

  for (const token of queryTokens) {
    if (vocabulary.has(token)) {
      total += 3.5
      continue
    }
    if (token.length >= 4 && searchable.includes(token)) {
      total += 1
    }
  }

  // [query mentions the intent, label that must be present, bonus]
  const intentBoosts: Array<
    [asked: boolean, label: ManualsQdrantChunkLabel, bonus: number]
  > = [
    [queryMentions("error", "not ", "wont", "won t"), "troubleshooting", 10],
    [queryMentions("parts", "part", "coin", "bill"), "parts", 7],
    [queryMentions("manual", "service"), "service", 5],
    [queryMentions("wiring"), "wiring", 6],
  ]
  for (const [asked, label, bonus] of intentBoosts) {
    if (asked && hasLabel(label)) {
      total += bonus
    }
  }

  if (hasLabel("brochure")) {
    total -= 5
  }

  if (hasLabel("toc") || hasLabel("flowchart")) {
    total -= 8
  }

  return total
}
/**
 * Decides how a chunk should be treated when embedding.
 *
 * Rules, first match wins:
 * 1. quality below 0.34, or a table-of-contents / flowchart label: `"exclude"`;
 * 2. brochure label: `"fallback"` at quality 0.62 or above, else `"exclude"`;
 * 3. OCR page below 0.58 without a troubleshooting label: `"fallback"`;
 * 4. risky chunk below 0.7: `"fallback"`;
 * 5. otherwise `"high_confidence"` at 0.64 or above, else `"fallback"`.
 *
 * @param args - Labels, overall quality, source kind and risk flag of the chunk.
 * @returns The embedding tier for the chunk.
 */
function deriveEmbeddingTier(args: {
  labels: ManualsQdrantChunkLabel[]
  overallQuality: number
  sourceKind: ManualsQdrantChunk["sourceKind"]
  isRisky: boolean
}): ManualsEmbeddingTier {
  const { labels, overallQuality: quality, sourceKind, isRisky } = args
  const has = (label: ManualsQdrantChunkLabel) => labels.includes(label)

  const isNavigationOnly = has("toc") || has("flowchart")
  if (quality < 0.34 || isNavigationOnly) {
    return "exclude"
  }

  if (has("brochure")) {
    if (quality >= 0.62) {
      return "fallback"
    }
    return "exclude"
  }

  const isWeakOcrPage =
    sourceKind === "ocr_page" && quality < 0.58 && !has("troubleshooting")
  const isWeakRiskyChunk = isRisky && quality < 0.7
  if (isWeakOcrPage || isWeakRiskyChunk) {
    return "fallback"
  }

  return quality >= 0.64 ? "high_confidence" : "fallback"
}

/**
 * Lists the audience profiles allowed to see a chunk (or manual).
 *
 * Excluded content gets no profile. `"internal_tech"` is granted unless the
 * content is labelled brochure or table of contents. `"public_safe"` further
 * requires non-risky content with quality of at least 0.56 and none of the
 * brochure, flowchart, table-of-contents or wiring labels.
 *
 * @param args - Labels, embedding tier, overall quality and risk flag.
 * @returns The granted profiles in alphabetical order.
 */
function buildProfiles(args: {
  labels: ManualsQdrantChunkLabel[]
  embeddingTier: ManualsEmbeddingTier
  overallQuality: number
  isRisky: boolean
}): ManualsQdrantProfile[] {
  if (args.embeddingTier === "exclude") {
    return []
  }

  const hasAny = (...candidates: ManualsQdrantChunkLabel[]) =>
    candidates.some((label) => args.labels.includes(label))

  // Pushed in alphabetical order, so no sort is needed afterwards.
  const granted: ManualsQdrantProfile[] = []

  if (!hasAny("brochure", "toc")) {
    granted.push("internal_tech")
  }

  const isPublicSafe =
    !args.isRisky &&
    args.overallQuality >= 0.56 &&
    !hasAny("brochure", "flowchart", "toc", "wiring")
  if (isPublicSafe) {
    granted.push("public_safe")
  }

  return granted
}
(looksLikeOcrGarbage(text)) { + flags.add("ocr-noisy") + } + + return Array.from(flags).sort() +} + +function scoreTextQuality( + text: string, + labels: ManualsQdrantChunkLabel[] +) { + const alphaChars = text.replace(/[^a-z]/gi, "").length + const allChars = text.replace(/\s+/g, "").length || 1 + const alphaRatio = alphaChars / allChars + const tokenCount = tokenize(text).length + const uppercaseBursts = (text.match(/\b[A-Z]{4,}\b/g) || []).length + const sentenceLike = (text.match(/[.!?]/g) || []).length + + let score = + clamp(alphaRatio) * 0.35 + + clamp(tokenCount / 120) * 0.3 + + clamp(sentenceLike / 8) * 0.15 + + (looksLikeOcrGarbage(text) ? 0 : 0.2) + + if (labels.includes("troubleshooting")) { + score += 0.12 + } + + if (labels.includes("brochure")) { + score -= 0.1 + } + + if (uppercaseBursts > 18) { + score -= 0.12 + } + + return clamp(score) +} + +function detectManualType(value: string) { + const normalized = normalizeText(value) + + if ( + normalized.includes("brochure") || + normalized.includes("product notice") || + normalized.includes("warranty") + ) { + return "brochure" + } + + if (normalized.includes("parts")) { + return "parts" + } + + if (normalized.includes("operator") || normalized.includes("user guide")) { + return "operator" + } + + if ( + normalized.includes("service") || + normalized.includes("repair") || + normalized.includes("technical") + ) { + return "service" + } + + return "manual" +} + +function hasUsefulOcrText(record: ExtractedManualRecord) { + const words = (record.text?.pages || []).reduce( + (sum, page) => sum + (page.wordCount || 0), + 0 + ) + return words > 0 +} + +function looksLikeFlowchart(text: string) { + const normalized = text.replace(/\s+/g, " ").trim() + return ( + normalized.includes("* # #") || + normalized.includes("press selection number") || + normalized.split("\n").filter((line) => /^[*#A-Z0-9 ()/-]+$/.test(line.trim())) + .length > 8 + ) +} + +function looksLikeOcrGarbage(text: string) { + const normalized 
= text.replace(/\s+/g, " ").trim() + const weirdChars = (normalized.match(/[^\x20-\x7E\n\r\t]/g) || []).length + const singleLetterBursts = (normalized.match(/\b[A-Z](?:\s+[A-Z]){4,}\b/g) || []) + .length + + return weirdChars > 6 || singleLetterBursts > 0 +} + +function extractModel(value: string) { + const matches = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9-]*\b/g) || [] + return matches[0] ? sanitizeModel(matches[0]) : null +} + +function sanitizeModel(value: string) { + const normalized = normalizeIdentifier(value).replace(/^unknown-?/, "") + return normalized || null +} + +function normalizeManufacturer(value: string | null | undefined): string { + const normalized = normalizeText(value || "") + + for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) { + if ( + canonical.toLowerCase() === normalized || + aliases.some((alias) => normalized.includes(normalizeText(alias))) + ) { + return canonical + } + } + + if (!normalized || isPlaceholderValue(normalized) || /^\d/.test(normalized)) { + return "Other" + } + + return toTitleCase(normalized) +} + +function humanizeManufacturer(value: string): string { + return normalizeManufacturer(value) +} + +function humanizeTitle(value: string) { + return value + .replace(/[-_]+/g, " ") + .replace(/\s+/g, " ") + .trim() +} + +function isPlaceholderValue(value: string) { + const normalized = normalizeText(value) + return ( + !normalized || + normalized === "unknown" || + normalized === "manual" || + /^\d+$/.test(normalized) + ) +} + +function cleanText(value: string) { + return value.replace(/\u00ad/g, "").replace(/\s+/g, " ").trim() +} + +function normalizeText(value: string) { + return value + .normalize("NFKD") + .replace(/[^a-zA-Z0-9]+/g, " ") + .toLowerCase() + .trim() +} + +function tokenize(value: string) { + return normalizeText(value) + .split(" ") + .map((token) => token.trim()) + .filter( + (token) => + token.length > 1 && + !STOPWORDS.has(token) && + !/^\d+$/.test(token) + ) +} + 
+function normalizeIdentifier(value: string) { + return normalizeText(stripExtension(value)).replace(/\s+/g, "-") +} + +function stripExtension(value: string) { + return value.replace(/\.pdf$/i, "").replace(/\.json$/i, "") +} + +function getStructuredRecordMatchKey(record: StructuredManualRecord) { + return [ + normalizeIdentifier(record.manualId || ""), + normalizeManufacturer(record.manufacturer), + sanitizeModel(record.model || "") || "unknown", + detectManualType(record.manualType || ""), + ].join("::") +} + +function buildCanonicalManualId(args: { + catalogManual: Manual | null + manufacturer: string + model: string | null + manualType: string + filename: string +}) { + if (args.catalogManual) { + return normalizeIdentifier(args.catalogManual.path || args.catalogManual.filename) + } + + const normalizedManufacturer = normalizeManufacturer(args.manufacturer) + const hasReliableIdentity = + normalizedManufacturer !== "Other" || Boolean(args.model) + + if (hasReliableIdentity) { + return normalizeIdentifier( + `${normalizedManufacturer} ${args.model || "unknown"} ${args.manualType}` + ) + } + + return normalizeIdentifier(`${args.filename} ${args.manualType}`) +} + +function toTitleCase(value: string) { + return value + .split(" ") + .filter(Boolean) + .map((part) => part.charAt(0).toUpperCase() + part.slice(1)) + .join(" ") +} + +function clamp(value: number) { + return Math.max(0, Math.min(1, value)) +} + +async function readJsonFile(path: string) { + return JSON.parse(await readFile(path, "utf8")) as T +} diff --git a/lib/site-chat/prompt.ts b/lib/site-chat/prompt.ts index a69b6594..0cc8813e 100644 --- a/lib/site-chat/prompt.ts +++ b/lib/site-chat/prompt.ts @@ -2,9 +2,9 @@ import { businessConfig, serviceAreas } from "@/lib/seo-config" const SERVICE_AREA_LIST = serviceAreas.map((area) => area.city).join(", ") -export const SITE_CHAT_SYSTEM_PROMPT = `You are Jessica, a super friendly and casual text-chat assistant for ${businessConfig.legalName} in Utah. 
Sound like a chill local friend who is genuinely trying to help. Use warm, natural phrases like "Hey," "Gotcha," "No worries," "That helps a ton," and "Just curious," when they fit. Never sound robotic, salesy, or overly formal. +const SITE_CHAT_SYSTEM_PROMPT_BASE = `You are Jessica, a super friendly and casual text-chat assistant for ${businessConfig.legalName} in Utah. Sound like a chill local friend who is genuinely trying to help. Use warm, natural phrases like "Hey," "Gotcha," "No worries," "That helps a ton," and "Just curious," when they fit. Never sound robotic, salesy, or overly formal. -Use this exact knowledge base and do not go beyond it: +Use only the knowledge provided in this system prompt plus any manual knowledge context supplied later in the conversation. Do not go beyond that information. - Free vending placement is only for qualifying businesses. Rocky Mountain Vending installs, stocks, maintains, and repairs those machines at no cost to the business. - Repairs and maintenance are for machines the customer owns. - Moving requests can be for a vending machine or a safe, and they follow the same intake flow as repairs. @@ -22,12 +22,23 @@ Conversation rules: - For repairs or moving, start by asking what the machine looks like, what brand is on the front, or what they already know. If the move is involved, clarify whether it is for a vending machine or a safe. Later, direct them to text photos or videos to ${businessConfig.publicSmsNumber} or use the contact form so the team can diagnose remotely first. - For free placement, first confirm it is for a business. Then ask about the business type, then the approximate number of people, then the location over separate turns. - For sales, first ask what kind of machine or features they are thinking about. Ask about new or used and budget later, not all at once. -- For manuals or parts, ask what they remember about the machine or part instead of only asking for a model number. 
+- For manuals, parts, or troubleshooting, ask what they remember about the machine or part instead of only asking for a model number. +- When manual knowledge context is present, use only that retrieved context for manuals, parts, and troubleshooting replies. +- For manuals, parts, or troubleshooting, stay limited to easy identification, likely issue category, and basic safe checks pulled from the retrieved context. +- Cite the manual naturally when useful, like mentioning the manual name and page number in plain language. +- If manual context is missing or low-confidence, do not guess. Ask for the brand, model sticker, or a clear photo/video that they can text to ${businessConfig.publicSmsNumber}. +- Do not provide step-by-step repair procedures, wiring guidance, voltage guidance, bypasses, or risky technical instructions. - If the visitor asks about a place that appears on the current website, treat it as inside the service area unless a human needs to confirm edge-case coverage. Safety rules: - Never mention, quote, or hint at prices, service call fees, repair rates, hourly rates, parts costs, or internal policies. -- If the visitor asks about pricing or cost, say: "Our complete vending service, including installation, stocking, and maintenance, is provided at no cost to qualifying businesses. I can get a few details so our team can schedule a quick call with you." +- If the visitor asks about pricing or cost, say: "Our complete vending service, including installation, stocking, and maintenance, is provided at no cost to qualifying businesses. I can get a few details so our team can schedule a quick call with you." - Do not invent timelines, guarantees, inventory, contract terms, or legal details. - If something needs confirmation, say a team member can confirm it. 
` + +export function buildSiteChatSystemPrompt() { + return SITE_CHAT_SYSTEM_PROMPT_BASE +} + +export const SITE_CHAT_SYSTEM_PROMPT = buildSiteChatSystemPrompt() diff --git a/scripts/build-manuals-qdrant-corpus.ts b/scripts/build-manuals-qdrant-corpus.ts new file mode 100644 index 00000000..a10f0c1a --- /dev/null +++ b/scripts/build-manuals-qdrant-corpus.ts @@ -0,0 +1,37 @@ +import { join } from "node:path" +import { parseArgs } from "node:util" +import { writeManualsQdrantArtifacts } from "@/lib/manuals-qdrant-corpus" + +const { values } = parseArgs({ + args: process.argv.slice(2), + options: { + "output-dir": { + type: "string", + }, + }, +}) + +const defaultOutputDir = join(process.cwd(), "output", "manuals-qdrant") + +async function main() { + const result = await writeManualsQdrantArtifacts({ + outputDir: values["output-dir"] || defaultOutputDir, + }) + + const summary = { + outputDir: result.outputDir, + manuals: result.corpus.manuals.length, + chunks: result.corpus.chunks.length, + highConfidenceChunks: result.corpus.stats.highConfidenceChunks, + fallbackChunks: result.corpus.stats.fallbackChunks, + excludedChunks: result.corpus.stats.excludedChunks, + evaluation: result.evaluation.summary, + } + + console.log(JSON.stringify(summary, null, 2)) +} + +main().catch((error) => { + console.error(error) + process.exitCode = 1 +}) diff --git a/scripts/evaluate-manuals-qdrant-corpus.ts b/scripts/evaluate-manuals-qdrant-corpus.ts new file mode 100644 index 00000000..f034947c --- /dev/null +++ b/scripts/evaluate-manuals-qdrant-corpus.ts @@ -0,0 +1,33 @@ +import { + buildManualsQdrantCorpus, + evaluateManualsQdrantCorpus, +} from "@/lib/manuals-qdrant-corpus" + +async function main() { + const corpus = await buildManualsQdrantCorpus() + const evaluation = evaluateManualsQdrantCorpus(corpus) + + const failingCases = evaluation.cases.filter( + (entry) => + entry.passedTop3Manufacturer === false || + !entry.passedTop5Label || + !entry.passedDisallowedCheck + ) + + 
console.log( + JSON.stringify( + { + generatedAt: corpus.generatedAt, + summary: evaluation.summary, + failingCases, + }, + null, + 2 + ) + ) +} + +main().catch((error) => { + console.error(error) + process.exitCode = 1 +})