// NOTE(review): the reviewed copy of this file had its line breaks collapsed and
// every generic type argument stripped (e.g. `Record = {`, `Omit & {`, `as T`).
// The type arguments below were reconstructed from how each value is used —
// confirm them against version control.

import { existsSync } from "node:fs"
import { readFile, readdir } from "node:fs/promises"
import { basename, join } from "node:path"
import { listConvexManuals } from "@/lib/convex-service"
import { scanManuals } from "@/lib/manuals"
import { getManualsDataRoot } from "@/lib/manuals-paths"
import {
  buildManualAssetUrl,
  buildThumbnailAssetUrl,
} from "@/lib/manuals-storage"
import type { Manual } from "@/lib/manuals-types"
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"

/** Path segments (relative to the manuals data root) of the optimized data tree. */
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]

/** Directory holding one structured JSON record per manual. */
const STRUCTURED_MANUALS_DIR = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "structured",
]

/** Single JSON file holding the raw extracted content of all manuals. */
const EXTRACTED_CONTENT_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "extracted_content",
  "manuals_extracted_content.json",
]

/** Tenant used when no tenant id is configured through the environment. */
const DEFAULT_MANUALS_PLATFORM_TENANT_ID = "rocky-mountain-vending"

/** Tokens dropped by `tokenize` because they carry no retrieval signal. */
const STOPWORDS = new Set([
  "a",
  "an",
  "and",
  "are",
  "at",
  "be",
  "by",
  "for",
  "from",
  "help",
  "how",
  "i",
  "in",
  "is",
  "it",
  "machine",
  "manual",
  "me",
  "my",
  "of",
  "on",
  "or",
  "our",
  "please",
  "service",
  "that",
  "the",
  "this",
  "to",
  "up",
  "with",
])

/** Substrings that mark a query as risky (see `isRiskyManualQuery`). */
const RISKY_MANUAL_KEYWORDS = [
  "wiring",
  "diagram",
  "voltage",
  "compressor",
  "refrigerant",
  "bypass",
  "jumper",
  "board level",
  "schematic",
  "electrical",
  "rewire",
  "disassemble",
  "tear down",
]

/** Substrings that suggest a chat query is about a manual. */
const MANUAL_QUERY_HINTS = [
  "manual",
  "model",
  "serial",
  "error",
  "code",
  "parts",
  "part",
  "troubleshoot",
  "troubleshooting",
  "not cooling",
  "not vending",
  "coin",
  "bill acceptor",
  "bill",
  "coin mech",
  "validator",
  "jam",
  "stuck",
  "door",
  "display",
  "keypad",
  "compressor",
  "motor",
  "sensor",
]

/**
 * Canonical manufacturer name -> spellings/aliases that map to it.
 * Entry order matters: `canonicalManufacturer` returns the first entry that matches.
 */
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  "automatic products": ["automatic products", "automatic-products", "ap"],
  "coinco": ["coinco"],
  "crane": [
    "crane",
    "national",
    "national vendors",
    "merchant",
    "merchant series",
    "shopper",
  ],
  "dixie narco": [
    "dixie",
    "narco",
    "dixie narco",
    "dixie-narco",
    "dn",
    "bevmax",
  ],
  "gpl": ["gpl", "general products"],
  "mei mars": ["mei", "mars", "mei mars", "bill validator"],
  "royal vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  "rowe": ["rowe"],
  "seaga": ["seaga"],
  "usi": ["usi", "u select it", "u-select-it", "uselectit"],
  "vendo": ["vendo", "sanden"],
}

/** A manual ranked against a query. `confidence` is `score / 38` clamped to [0, 1]. */
export type ManualCandidate = {
  manualId: string
  filename: string
  manufacturer: string
  category: string
  manualUrl: string | null
  thumbnailUrl: string | null
  score: number
  confidence: number
}

/** A citable excerpt of a manual as returned to callers. */
export type ManualKnowledgeChunk = {
  manualId: string
  filename: string
  manufacturer: string
  model: string | null
  manualType: string
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  manualUrl: string | null
  thumbnailUrl: string | null
  sourceConfidence: number
  matchScore: number
  citation: string
}

/** Optional hints that bias retrieval towards a manufacturer, manual or model. */
export type ManualKnowledgeFilters = {
  manufacturer?: string | null
  manualId?: string | null
  model?: string | null
}

export type RetrieveManualContextResult = {
  query: string
  bestManual: ManualCandidate | null
  manualCandidates: ManualCandidate[]
  topChunks: ManualKnowledgeChunk[]
  needsClarification: boolean
  isRisky: boolean
}

export type ManualCitationContext = {
  manual: ManualCandidate | null
  citations: ManualKnowledgeChunk[]
}

/** Compact, log-friendly description of one retrieval attempt. */
export type ManualRetrievalSummary = {
  ran: boolean
  query: string
  bestManualId: string | null
  manualCandidateIds: string[]
  topChunkCitations: string[]
  needsClarification: boolean | null
  isRisky: boolean | null
  error: string | null
}

export interface ManualKnowledgeProvider {
  findManualCandidates(query: string): Promise<ManualCandidate[]>
  retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult>
  getManualCitationContext(
    manualId: string,
    pageNumber?: number
  ): Promise<ManualCitationContext>
}

type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}

type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}

type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}

type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
  }
}

type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}

type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}

type ExtractedManualRecord = {
  filename?: string
  sections?: StructuredSection[]
  partsLists?: ExtractedPartList[]
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
}

type SharedPlatformManualRecord = {
  manualId: string
  filename: string
  manufacturer: string
  category: string | null
  model: string | null
  manualType: string
  sourcePath?: string | null
  manualUrl?: string | null
  thumbnailUrl?: string | null
  sourceFilenames?: string[]
}

type SharedPlatformChunkRecord = {
  manualId: string
  title?: string
  manufacturer: string
  model: string | null
  manualType: string
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  manualUrl?: string | null
  thumbnailUrl?: string | null
  metadataConfidence?: number
  overallQuality?: number
}

/** A query after normalization, tokenization and filter resolution. */
type PreparedQuery = {
  normalized: string
  tokens: string[]
  modelTokens: string[]
  manufacturerFilters: string[]
  manualIdFilter: string | null
}

/** Catalog entry plus the precomputed search fields used for scoring. */
type ManualInternal = {
  manualId: string
  filename: string
  manufacturer: string
  category: string
  model: string | null
  manualType: string
  manualUrl: string | null
  thumbnailUrl: string | null
  searchText: string
  tokenSet: Set<string>
  normalizedManufacturer: string
}

/** Response chunk plus the precomputed search fields used for scoring. */
type ChunkInternal = Omit<ManualKnowledgeChunk, "matchScore"> & {
  searchText: string
  tokenSet: Set<string>
  matchScore: number
}

/** Everything the provider needs in memory to answer queries. */
type ManualKnowledgeArtifact = {
  manuals: ManualInternal[]
  manualById: Map<string, ManualInternal>
  chunks: ChunkInternal[]
  chunksByManualId: Map<string, ChunkInternal[]>
}

type ResettableManualKnowledgeProvider = ManualKnowledgeProvider & {
  resetCache(): void
}

let defaultProvider: ManualKnowledgeProvider | null = null

/** Returns the module-wide provider, creating the local one on first use. */
export function getManualKnowledgeProvider(): ManualKnowledgeProvider {
  if (!defaultProvider) {
    defaultProvider = new LocalManualKnowledgeProvider()
  }
  return defaultProvider
}

/** Convenience wrapper around the default provider's `findManualCandidates`. */
export async function findManualCandidates(query: string) {
  return await getManualKnowledgeProvider().findManualCandidates(query)
}

/** Convenience wrapper around the default provider's `retrieveManualContext`. */
export async function retrieveManualContext(
  query: string,
  filters?: ManualKnowledgeFilters
) {
  return await getManualKnowledgeProvider().retrieveManualContext(query, filters)
}

/** Convenience wrapper around the default provider's `getManualCitationContext`. */
export async function getManualCitationContext(
  manualId: string,
  pageNumber?: number
) {
  return await getManualKnowledgeProvider().getManualCitationContext(
    manualId,
    pageNumber
  )
}

/**
 * Drops the default provider's cached artifact, if a provider exists and
 * exposes `resetCache`. Does nothing otherwise.
 */
export function resetManualKnowledgeCache() {
  if (
    defaultProvider &&
    "resetCache" in defaultProvider &&
    typeof (defaultProvider as ResettableManualKnowledgeProvider).resetCache ===
      "function"
  ) {
    ;(defaultProvider as ResettableManualKnowledgeProvider).resetCache()
  }
}

/**
 * Condenses a retrieval attempt into a flat summary.
 *
 * @param args.ran - Whether retrieval was attempted at all.
 * @param args.result - The retrieval result, when one was produced.
 * @param args.error - An `Error` or string; anything else is reported as `null`.
 * @returns A summary listing at most four candidate ids and four citations.
 */
export function summarizeManualRetrieval(args: {
  ran: boolean
  query: string
  result?: RetrieveManualContextResult | null
  error?: unknown
}): ManualRetrievalSummary {
  const error =
    args.error instanceof Error
      ? args.error.message
      : typeof args.error === "string"
        ? args.error
        : null

  if (!args.ran) {
    return {
      ran: false,
      query: args.query,
      bestManualId: null,
      manualCandidateIds: [],
      topChunkCitations: [],
      needsClarification: null,
      isRisky: null,
      error,
    }
  }

  return {
    ran: true,
    query: args.query,
    bestManualId: args.result?.bestManual?.manualId || null,
    manualCandidateIds: (args.result?.manualCandidates || [])
      .slice(0, 4)
      .map((candidate) => candidate.manualId),
    topChunkCitations: (args.result?.topChunks || [])
      .slice(0, 4)
      .map((chunk) => chunk.citation),
    needsClarification: args.result?.needsClarification ?? null,
    isRisky: args.result?.isRisky ?? null,
    error,
  }
}

/**
 * Decides whether a chat message should trigger manual retrieval.
 *
 * Returns `false` for an empty query; otherwise `true` when the query contains
 * a model-like token or any of `MANUAL_QUERY_HINTS`.
 *
 * NOTE(review): every path below yields the same value as `hasManualHints`, so
 * `intent` currently has no effect on the result — confirm whether a
 * manual/repair/parts intent was meant to relax the hint requirement.
 */
export function shouldUseManualKnowledgeForChat(
  intent: string | null | undefined,
  query: string
) {
  const normalizedIntent = normalizeText(intent || "")
  const normalizedQuery = normalizeText(query)
  const hasManualHints =
    looksLikeModelTokenQuery(normalizedQuery) ||
    MANUAL_QUERY_HINTS.some((hint) => normalizedQuery.includes(hint))

  if (!normalizedQuery) {
    return false
  }

  if (
    normalizedIntent.includes("manual") ||
    normalizedIntent.includes("repair") ||
    normalizedIntent.includes("parts")
  ) {
    return hasManualHints
  }

  if (looksLikeModelTokenQuery(normalizedQuery)) {
    return true
  }

  return hasManualHints
}

/**
 * Renders a retrieval result as newline-separated prompt guidance.
 *
 * Lists up to three candidate manuals and up to three excerpts (each truncated
 * to 420 characters). When there are no excerpts the output ends with a
 * request for more identifying information instead.
 */
export function formatManualContextForPrompt(
  result: RetrieveManualContextResult
) {
  const lines = [
    "Manual knowledge context:",
    result.topChunks.length > 0
      ? "- Use only the excerpts below for any manuals, parts, or troubleshooting reply."
      : "- No reliable manual excerpt was found.",
    result.isRisky
      ? "- The question looks technical or risky. Stay high-level and safe, and do not provide procedural repair steps."
      : "- Stay limited to simple identification, likely issue category, and very basic safe checks.",
  ]

  if (result.manualCandidates.length > 0) {
    lines.push("Likely manual candidates:")
    for (const candidate of result.manualCandidates.slice(0, 3)) {
      lines.push(
        `- ${buildManualLabel(candidate.filename, candidate.manufacturer)}`
      )
    }
  }

  if (result.topChunks.length === 0) {
    lines.push(
      "- Ask for the brand on the front, model sticker, or a clear photo/video, and offer texting it in for the team to review."
    )
    return lines.join("\n")
  }

  if (result.needsClarification) {
    lines.push(
      "- Confidence is limited. Ask for the brand on the front, the model sticker, or a clear photo/video before sounding certain."
    )
  }

  lines.push("Grounded excerpts:")
  for (const chunk of result.topChunks.slice(0, 3)) {
    lines.push(`- ${chunk.citation}: ${truncateText(chunk.text, 420)}`)
  }

  return lines.join("\n")
}

/**
 * Provider backed by an in-memory artifact that is built once (see
 * `buildArtifact`) and reused until `resetCache` is called.
 */
class LocalManualKnowledgeProvider implements ManualKnowledgeProvider {
  private artifactPromise: Promise<ManualKnowledgeArtifact> | null = null

  /** Returns up to six manuals with a positive score, best first. */
  async findManualCandidates(query: string) {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query)

    return artifact.manuals
      .map((manual) => buildCandidate(manual, scoreManual(manual, prepared)))
      .filter((candidate) => candidate.score > 0)
      .sort(compareByScore)
      .slice(0, 6)
  }

  /**
   * Ranks manuals and chunks against the query.
   *
   * Chunks inherit their manual's score — in full for the ten best manuals, at
   * 35% otherwise — and only chunks scoring above 10 are kept (top five).
   */
  async retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult> {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query, filters)

    const manualCandidates = artifact.manuals
      .map((manual) => ({ manual, score: scoreManual(manual, prepared) }))
      .filter((entry) => entry.score > 0)
      .sort((left, right) => right.score - left.score)

    const candidateIds = new Set(
      manualCandidates.slice(0, 10).map((entry) => entry.manual.manualId)
    )

    // One lookup table instead of a linear search per chunk. First entry wins,
    // which matches what `Array.prototype.find` returned on the sorted list.
    const candidateById = new Map<string, (typeof manualCandidates)[number]>()
    for (const entry of manualCandidates) {
      if (!candidateById.has(entry.manual.manualId)) {
        candidateById.set(entry.manual.manualId, entry)
      }
    }

    const topChunks = artifact.chunks
      .map((chunk) => {
        const manualScore = candidateById.get(chunk.manualId)?.score ?? 0
        const inheritedScore = candidateIds.has(chunk.manualId)
          ? manualScore
          : manualScore * 0.35
        return { chunk, score: scoreChunk(chunk, prepared, inheritedScore) }
      })
      .filter((entry) => entry.score > 10)
      .sort((left, right) => right.score - left.score)
      .slice(0, 5)
      .map(({ chunk, score }) => ({
        ...stripChunkForResponse(chunk),
        matchScore: score,
      }))

    const leadChunk = topChunks[0]
    const leadCandidate = manualCandidates[0]

    const needsClarification =
      !leadChunk ||
      leadChunk.matchScore < 24 ||
      (leadCandidate
        ? buildCandidate(leadCandidate.manual, leadCandidate.score).confidence <
          0.48
        : true)

    // Prefer the manual that owns the best chunk; fall back to the best manual.
    const bestEntry =
      (leadChunk ? candidateById.get(leadChunk.manualId) : undefined) ??
      leadCandidate
    const bestManual = bestEntry
      ? buildCandidate(bestEntry.manual, bestEntry.score)
      : null

    return {
      query,
      bestManual,
      manualCandidates: manualCandidates
        .slice(0, 4)
        .map(({ manual, score }) => buildCandidate(manual, score)),
      topChunks,
      needsClarification,
      isRisky: isRiskyManualQuery(query),
    }
  }

  /**
   * Returns the manual and up to five of its chunks, restricted to
   * `pageNumber` when one is given.
   */
  async getManualCitationContext(manualId: string, pageNumber?: number) {
    const artifact = await this.getArtifact()
    const manual = artifact.manualById.get(manualId) || null
    const chunks = artifact.chunksByManualId.get(manualId) || []

    const citations = chunks
      .filter((chunk) =>
        typeof pageNumber === "number" ? chunk.pageNumber === pageNumber : true
      )
      .slice(0, 5)
      .map(stripChunkForResponse)

    return {
      manual: manual ? buildCandidate(manual, 1) : null,
      citations,
    }
  }

  /**
   * Builds the artifact on first use and shares the same promise afterwards.
   * A failed build is not kept: the slot is cleared so the next call retries
   * instead of replaying the cached rejection.
   */
  private async getArtifact() {
    if (!this.artifactPromise) {
      const pending = buildArtifact()
      this.artifactPromise = pending
      pending.catch(() => {
        // Only clear the slot if it still holds this attempt (it may have been
        // reset and replaced while the build was in flight).
        if (this.artifactPromise === pending) {
          this.artifactPromise = null
        }
      })
    }
    return await this.artifactPromise
  }

  /** Forgets the cached artifact so the next query rebuilds it. */
  resetCache() {
    this.artifactPromise = null
  }
}

/**
 * Builds the searchable artifact.
 *
 * Uses the shared manuals-platform export when present. Otherwise combines the
 * manual catalog with structured records, and falls back to raw extracted
 * pages for manuals that produced no structured chunks.
 *
 * Rejects when the structured directory or the extracted-content file cannot
 * be read.
 */
async function buildArtifact(): Promise<ManualKnowledgeArtifact> {
  const sharedArtifact = await loadSharedPlatformArtifact()
  if (sharedArtifact) {
    return sharedArtifact
  }

  const manuals = await loadManualCatalog()
  const manualById = new Map<string, ManualInternal>()
  for (const manual of manuals) {
    manualById.set(manual.manualId, manual)
  }

  const chunks: ChunkInternal[] = []
  const chunksByManualId = new Map<string, ChunkInternal[]>()
  const extractedByFilename = await loadExtractedContentByFilename()
  const structuredRecords = await loadStructuredManualRecords()
  const manualsWithStructuredChunks = new Set<string>()

  for (const record of structuredRecords) {
    const matchedManual = matchStructuredRecordToManual(record, manuals)
    const manual = ensureManualEntry({
      manualById,
      manuals,
      matchedManual,
      fallbackManualId: normalizeIdentifier(record.manualId || "unknown-manual"),
      filename: matchedManual?.filename || `${record.manualId || "manual"}.pdf`,
      manufacturer:
        matchedManual?.manufacturer ||
        humanizeToken(record.manufacturer || "Unknown"),
      category: matchedManual?.category || record.manualType || "manual",
      model: normalizeNullable(record.model),
      manualType: normalizeManualType(record.manualType),
      manualUrl: matchedManual?.manualUrl || null,
      thumbnailUrl: matchedManual?.thumbnailUrl || null,
    })

    const structuredChunks = buildStructuredChunks(record, manual)
    if (structuredChunks.length > 0) {
      manualsWithStructuredChunks.add(manual.manualId)
      addChunks(chunks, chunksByManualId, structuredChunks)
    }
  }

  for (const extracted of extractedByFilename.values()) {
    const sourceManual =
      matchExtractedRecordToManual(extracted, manuals) ??
      ensureManualEntry({
        manualById,
        manuals,
        matchedManual: null,
        fallbackManualId: normalizeIdentifier(
          stripExtension(extracted.filename || "manual")
        ),
        filename: extracted.filename || "manual.pdf",
        manufacturer: "Unknown",
        category: "manual",
        model: null,
        manualType: "manual",
        manualUrl: null,
        thumbnailUrl: null,
      })

    // Structured content wins; raw pages are only a fallback.
    if (manualsWithStructuredChunks.has(sourceManual.manualId)) {
      continue
    }

    const fallbackChunks = buildExtractedFallbackChunks(extracted, sourceManual)
    if (fallbackChunks.length > 0) {
      addChunks(chunks, chunksByManualId, fallbackChunks)
    }
  }

  return {
    manuals,
    manualById,
    chunks,
    chunksByManualId,
  }
}

/**
 * Loads `manuals.json` and `chunks.json` from
 * `<platformRoot>/output/tenants/<tenantId>`.
 *
 * @returns The artifact, or `null` when no platform root or either file exists.
 */
async function loadSharedPlatformArtifact(): Promise<ManualKnowledgeArtifact | null> {
  const platformRoot = resolveManualsPlatformRoot()
  if (!platformRoot) {
    return null
  }

  const tenantId = getManualsPlatformTenantId()
  const tenantDir = join(platformRoot, "output", "tenants", tenantId)
  const manualsPath = join(tenantDir, "manuals.json")
  const chunksPath = join(tenantDir, "chunks.json")

  if (!existsSync(manualsPath) || !existsSync(chunksPath)) {
    return null
  }

  const [manualRecords, chunkRecords] = await Promise.all([
    readJsonFile<SharedPlatformManualRecord[]>(manualsPath),
    readJsonFile<SharedPlatformChunkRecord[]>(chunksPath),
  ])

  const manuals: ManualInternal[] = manualRecords.map((manual) => {
    const searchTerms = [
      manual.filename,
      manual.sourcePath,
      manual.manufacturer,
      manual.category,
      manual.model,
      manual.manualType,
      ...(manual.sourceFilenames || []),
    ]
      .filter(Boolean)
      .join(" ")

    return {
      manualId: manual.manualId,
      filename: manual.filename,
      manufacturer: manual.manufacturer,
      category: manual.category || manual.manualType || "manual",
      model: manual.model || null,
      manualType: normalizeManualType(manual.manualType),
      manualUrl: toSiteManualUrl(manual.manualUrl || manual.sourcePath || null),
      thumbnailUrl: toSiteThumbnailUrl(manual.thumbnailUrl || null),
      searchText: normalizeText(searchTerms),
      tokenSet: new Set(tokenize(searchTerms)),
      normalizedManufacturer: canonicalManufacturer(manual.manufacturer),
    }
  })

  const manualById = new Map<string, ManualInternal>()
  for (const manual of manuals) {
    manualById.set(manual.manualId, manual)
  }

  const chunks: ChunkInternal[] = chunkRecords.map((chunk) => {
    // Prefer the owning manual's metadata; fall back to the chunk's own fields.
    const manual = manualById.get(chunk.manualId) || null
    const filename =
      manual?.filename || humanizeToken(chunk.title || chunk.manualId)
    const manufacturer = manual?.manufacturer || chunk.manufacturer
    const manualUrl =
      manual?.manualUrl || toSiteManualUrl(chunk.manualUrl || null)
    const thumbnailUrl =
      manual?.thumbnailUrl || toSiteThumbnailUrl(chunk.thumbnailUrl || null)
    const searchText = normalizeText(
      [
        filename,
        manufacturer,
        chunk.model,
        chunk.sectionTitle,
        chunk.text,
        chunk.manualType,
      ]
        .filter(Boolean)
        .join(" ")
    )

    return {
      manualId: chunk.manualId,
      filename,
      manufacturer,
      model: chunk.model,
      manualType: normalizeManualType(chunk.manualType),
      pageNumber: chunk.pageNumber,
      sectionTitle: chunk.sectionTitle,
      text: chunk.text,
      manualUrl,
      thumbnailUrl,
      sourceConfidence: clamp(
        chunk.metadataConfidence ?? chunk.overallQuality ?? 0.76
      ),
      matchScore: 0,
      citation: buildCitation(filename, manufacturer, chunk.pageNumber),
      searchText,
      tokenSet: new Set(tokenize(searchText)),
    }
  })

  const chunksByManualId = new Map<string, ChunkInternal[]>()
  for (const chunk of chunks) {
    const existing = chunksByManualId.get(chunk.manualId) || []
    existing.push(chunk)
    chunksByManualId.set(chunk.manualId, existing)
  }

  return {
    manuals,
    manualById,
    chunks,
    chunksByManualId,
  }
}

/**
 * Returns the first existing directory among `MANUALS_PLATFORM_ROOT`,
 * `../manuals-platform` and `./manuals-platform` (relative to the working
 * directory), or `null` when none exists.
 */
function resolveManualsPlatformRoot() {
  const candidates = [
    process.env.MANUALS_PLATFORM_ROOT,
    join(process.cwd(), "..", "manuals-platform"),
    join(process.cwd(), "manuals-platform"),
  ].filter(Boolean) as string[]

  for (const candidate of candidates) {
    if (existsSync(candidate)) {
      return candidate
    }
  }

  return null
}

/** Tenant id from the environment, or the default when unset or blank. */
function getManualsPlatformTenantId() {
  const value =
    process.env.MANUALS_PLATFORM_TENANT_ID ||
    process.env.SITE_MANUALS_TENANT_ID ||
    DEFAULT_MANUALS_PLATFORM_TENANT_ID
  return value.trim() || DEFAULT_MANUALS_PLATFORM_TENANT_ID
}

/**
 * Maps a platform manual path to a site URL. Absolute http(s) URLs pass
 * through; a leading `manuals/` or `/manuals/` is stripped before the asset
 * URL is built.
 */
function toSiteManualUrl(value: string | null) {
  if (!value) {
    return null
  }

  if (/^https?:\/\//i.test(value)) {
    return value
  }

  const relativePath = value
    .replace(/^manuals\//i, "")
    .replace(/^\/manuals\//i, "")
  return buildManualAssetUrl(relativePath)
}

/**
 * Maps a platform thumbnail path to a site URL. Absolute http(s) URLs pass
 * through; a leading `thumbnails/` or `/thumbnails/` is stripped before the
 * asset URL is built.
 */
function toSiteThumbnailUrl(value: string | null) {
  if (!value) {
    return null
  }

  if (/^https?:\/\//i.test(value)) {
    return value
  }

  const relativePath = value
    .replace(/^thumbnails\//i, "")
    .replace(/^\/thumbnails\//i, "")
  return buildThumbnailAssetUrl(relativePath)
}

/**
 * Loads the manual catalog from Convex, falling back to a filesystem scan when
 * Convex returns nothing. Failures of either source are treated as "no
 * manuals". Entries are de-duplicated by normalized id (last one wins).
 */
async function loadManualCatalog() {
  const convexManuals = await listConvexManuals().catch(() => [] as Manual[])
  const filesystemManuals =
    convexManuals.length > 0
      ? []
      : await scanManuals().catch(() => [] as Manual[])
  const sourceManuals =
    convexManuals.length > 0 ? convexManuals : filesystemManuals

  const deduped = new Map<string, ManualInternal>()

  for (const manual of sourceManuals) {
    const manualId = normalizeIdentifier(manual.path || manual.filename)
    const manufacturer = manual.manufacturer || "Unknown"
    const filename = manual.filename || basename(manual.path)
    const category = manual.category || "manual"
    const model = guessModelFromManual(manual)
    const searchTerms = [
      filename,
      stripExtension(filename),
      manual.path,
      manufacturer,
      category,
      ...(manual.searchTerms || []),
      ...(manual.commonNames || []),
      ...aliasesForManufacturer(manufacturer),
      model || "",
    ]
      .filter(Boolean)
      .join(" ")

    deduped.set(manualId, {
      manualId,
      filename,
      manufacturer,
      category,
      model,
      manualType: normalizeManualType(category),
      manualUrl: getManualUrl(manual),
      thumbnailUrl: getThumbnailUrl(manual),
      searchText: normalizeText(searchTerms),
      tokenSet: new Set(tokenize(searchTerms)),
      normalizedManufacturer: canonicalManufacturer(manufacturer),
    })
  }

  return Array.from(deduped.values())
}

/**
 * Reads every `.json` file in the structured-manuals directory. A record
 * without `manualId` gets its filename (minus extension) as id.
 * Rejects if the directory or any file cannot be read or parsed.
 */
async function loadStructuredManualRecords() {
  const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  const entries = await readdir(directory, { withFileTypes: true })
  const files = entries
    .filter(
      (entry) => entry.isFile() && entry.name.toLowerCase().endsWith(".json")
    )
    .map((entry) => entry.name)

  return await Promise.all(
    files.map(async (filename) => {
      const path = join(directory, filename)
      const parsed = await readJsonFile<StructuredManualRecord>(path)
      return {
        ...parsed,
        manualId: parsed.manualId || stripExtension(filename),
      }
    })
  )
}

/**
 * Reads the extracted-content file and indexes its records by normalized
 * filename. Records without a filename are skipped.
 */
async function loadExtractedContentByFilename() {
  const path = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  const records = await readJsonFile<ExtractedManualRecord[]>(path)
  const map = new Map<string, ExtractedManualRecord>()

  for (const record of records) {
    if (!record.filename) {
      continue
    }
    map.set(normalizeIdentifier(record.filename), record)
  }

  return map
}

/**
 * Returns the first manual with the highest score for `prepared`, provided
 * that score reaches `minimumScore`; otherwise `null`.
 */
function findBestScoringManual(
  manuals: ManualInternal[],
  prepared: PreparedQuery,
  minimumScore: number
) {
  let best: { manual: ManualInternal; score: number } | null = null

  for (const manual of manuals) {
    const score = scoreManual(manual, prepared)
    // Strictly greater keeps the earliest manual on ties.
    if (!best || score > best.score) {
      best = { manual, score }
    }
  }

  return best && best.score >= minimumScore ? best.manual : null
}

/** Finds the catalog manual a structured record most likely describes (score >= 12). */
function matchStructuredRecordToManual(
  record: StructuredManualRecord,
  manuals: ManualInternal[]
) {
  const query = [
    record.manualId,
    record.manufacturer,
    record.model,
    record.manualType,
  ]
    .filter(Boolean)
    .join(" ")
  const prepared = prepareQuery(query, {
    manufacturer: record.manufacturer,
    model: record.model,
  })

  return findBestScoringManual(manuals, prepared, 12)
}

/**
 * Finds the catalog manual for an extracted record: an exact match on the
 * normalized filename or id first, otherwise the best-scoring manual (score >= 10).
 */
function matchExtractedRecordToManual(
  record: ExtractedManualRecord,
  manuals: ManualInternal[]
) {
  const filename = record.filename || ""
  const normalizedFilename = normalizeIdentifier(filename)

  const exactMatch = manuals.find(
    (manual) =>
      normalizeIdentifier(manual.filename) === normalizedFilename ||
      normalizeIdentifier(manual.manualId) === normalizedFilename
  )
  if (exactMatch) {
    return exactMatch
  }

  const prepared = prepareQuery(stripExtension(filename))
  return findBestScoringManual(manuals, prepared, 10)
}

/** Turns a structured record's sections, troubleshooting items and parts into chunks. */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualInternal
) {
  const chunks: ChunkInternal[] = []
  const sections = record.content?.sections || []
  const troubleshooting = record.content?.troubleshooting || []
  const parts = record.content?.partsDatabase || []

  for (const section of sections) {
    const text = cleanChunkText(section.text || "")
    if (!isUsefulChunkText(text)) {
      continue
    }

    chunks.push(
      makeChunk({
        manual,
        pageNumber: section.pageNumber ?? null,
        sectionTitle: cleanSectionTitle(section.title),
        text,
        sourceConfidence: 0.92,
      })
    )
  }

  for (const item of troubleshooting) {
    const problem = cleanChunkText(item.problem || "")
    const solution = cleanChunkText(item.solution || "")
    const text = cleanChunkText(
      [
        problem ? `Problem: ${problem}` : "",
        solution ? `Likely cause or solution: ${solution}` : "",
      ]
        .filter(Boolean)
        .join("\n")
    )

    if (!isUsefulChunkText(text)) {
      continue
    }

    chunks.push(
      makeChunk({
        manual,
        pageNumber: item.pageNumber ?? null,
        sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting",
        text,
        sourceConfidence: 0.95,
      })
    )
  }

  chunks.push(...buildPartChunks(parts, manual, 0.8))
  return chunks
}

/** Turns an extracted record's raw pages and parts lists into fallback chunks. */
function buildExtractedFallbackChunks(
  record: ExtractedManualRecord,
  manual: ManualInternal
) {
  const chunks: ChunkInternal[] = []
  const pages = record.text?.pages || []

  for (const page of pages) {
    const text = cleanChunkText(page.text || "")
    if (!isUsefulChunkText(text)) {
      continue
    }

    chunks.push(
      makeChunk({
        manual,
        pageNumber: page.pageNumber ?? null,
        sectionTitle: page.pageNumber ? `Page ${page.pageNumber}` : "Manual page",
        text,
        sourceConfidence: 0.72,
      })
    )
  }

  chunks.push(...buildExtractedPartListChunks(record.partsLists || [], manual))
  return chunks
}

/**
 * Groups parts by page into "Parts reference" chunks, keeping at most eight
 * parts per page. Parts with neither a usable number (2+ chars) nor a usable
 * description (6+ chars) are dropped. A missing page is bucketed under 0 and
 * reported as `null`.
 */
function buildPartChunks(
  parts: StructuredPart[],
  manual: ManualInternal,
  sourceConfidence: number
) {
  const partsByPage = new Map<number, string[]>()

  for (const part of parts) {
    const partNumber = cleanChunkText(part.partNumber || "")
    const description = cleanChunkText(part.description || "")

    if (partNumber.length < 2 && description.length < 6) {
      continue
    }

    const pageNumber = part.pageNumber ?? 0
    const summary = description
      ? `Part ${partNumber}: ${description}`
      : `Part ${partNumber}`
    const bucket = partsByPage.get(pageNumber) || []

    if (bucket.length < 8) {
      bucket.push(summary)
      partsByPage.set(pageNumber, bucket)
    }
  }

  return Array.from(partsByPage.entries()).map(([pageNumber, summaries]) =>
    makeChunk({
      manual,
      pageNumber: pageNumber || null,
      sectionTitle: "Parts reference",
      text: summaries.join("\n"),
      sourceConfidence,
    })
  )
}

/** Flattens part lists (parts inherit the list's page when they have none) and chunks them. */
function buildExtractedPartListChunks(
  partLists: ExtractedPartList[],
  manual: ManualInternal
) {
  const flattened: StructuredPart[] = []

  for (const partList of partLists) {
    for (const part of partList.parts || []) {
      flattened.push({
        ...part,
        pageNumber: part.pageNumber ?? partList.pageNumber,
      })
    }
  }

  return buildPartChunks(flattened, manual, 0.76)
}

/** Appends chunks to the flat list and to the per-manual index. Mutates both. */
function addChunks(
  target: ChunkInternal[],
  chunksByManualId: Map<string, ChunkInternal[]>,
  chunks: ChunkInternal[]
) {
  for (const chunk of chunks) {
    target.push(chunk)
    const existing = chunksByManualId.get(chunk.manualId) || []
    existing.push(chunk)
    chunksByManualId.set(chunk.manualId, existing)
  }
}

/**
 * Returns the matched manual, or an existing entry with the fallback id, or
 * registers a new fallback entry in both `manuals` and `manualById`.
 */
function ensureManualEntry(args: {
  manualById: Map<string, ManualInternal>
  manuals: ManualInternal[]
  matchedManual: ManualInternal | null
  fallbackManualId: string
  filename: string
  manufacturer: string
  category: string
  model: string | null
  manualType: string
  manualUrl: string | null
  thumbnailUrl: string | null
}) {
  if (args.matchedManual) {
    return args.matchedManual
  }

  const manualId = normalizeIdentifier(args.fallbackManualId)
  const existing = args.manualById.get(manualId)
  if (existing) {
    return existing
  }

  const searchTerms = [
    args.filename,
    args.manufacturer,
    args.category,
    args.model,
    args.manualType,
  ]
    .filter(Boolean)
    .join(" ")

  const fallback: ManualInternal = {
    manualId,
    filename: args.filename,
    manufacturer: args.manufacturer,
    category: args.category,
    model: args.model,
    manualType: args.manualType,
    manualUrl: args.manualUrl,
    thumbnailUrl: args.thumbnailUrl,
    searchText: normalizeText(searchTerms),
    tokenSet: new Set(tokenize(searchTerms)),
    normalizedManufacturer: canonicalManufacturer(args.manufacturer),
  }

  args.manuals.push(fallback)
  args.manualById.set(fallback.manualId, fallback)
  return fallback
}

/**
 * Whole-word phrase test on text produced by `normalizeText` (lowercase words
 * separated by single spaces). An empty phrase never matches.
 */
function containsWholePhrase(normalizedText: string, normalizedPhrase: string) {
  if (!normalizedPhrase) {
    return false
  }
  return ` ${normalizedText} `.includes(` ${normalizedPhrase} `)
}

/**
 * Normalizes a query and resolves filters into a `PreparedQuery`.
 *
 * Manufacturer aliases must appear as whole words/phrases in the query, so
 * that e.g. "vendor" does not select Vendo and "using" does not select USI.
 */
function prepareQuery(
  query: string,
  filters?: ManualKnowledgeFilters
): PreparedQuery {
  const normalized = normalizeText(query)
  const tokenSet = new Set(tokenize(normalized))
  const modelTokens = extractModelTokens(normalized)
  const manufacturerFilters = new Set<string>()

  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    if (
      aliases.some((alias) =>
        containsWholePhrase(normalized, normalizeText(alias))
      )
    ) {
      manufacturerFilters.add(canonical)
    }
  }

  if (filters?.manufacturer) {
    manufacturerFilters.add(canonicalManufacturer(filters.manufacturer))
  }

  if (filters?.model) {
    tokenSet.add(normalizeText(filters.model))
  }

  return {
    normalized,
    tokens: Array.from(tokenSet),
    modelTokens,
    manufacturerFilters: Array.from(manufacturerFilters),
    manualIdFilter: filters?.manualId
      ? normalizeIdentifier(filters.manualId)
      : null,
  }
}

/**
 * Scores a manual against a prepared query. An empty query scores 0.
 * Filters add or subtract fixed amounts; token and model-token hits add up.
 */
function scoreManual(manual: ManualInternal, query: PreparedQuery) {
  if (!query.normalized) {
    return 0
  }

  let score = 0

  if (query.manualIdFilter) {
    score += query.manualIdFilter === manual.manualId ? 28 : -12
  }

  if (query.manufacturerFilters.length > 0) {
    score += query.manufacturerFilters.includes(manual.normalizedManufacturer)
      ? 14
      : -6
  }

  if (manual.searchText.includes(query.normalized)) {
    score += 24
  }

  for (const token of query.tokens) {
    if (!token) {
      continue
    }

    if (manual.tokenSet.has(token)) {
      score += query.modelTokens.includes(token) ? 8 : 4
      continue
    }

    // Partial (substring) hit for longer tokens only.
    if (token.length >= 4 && manual.searchText.includes(token)) {
      score += 1.5
    }
  }

  for (const token of query.modelTokens) {
    if (manual.searchText.includes(token)) {
      score += 8
    }
  }

  const looksLikeFaultQuery =
    query.normalized.includes("error") ||
    query.normalized.includes("not ") ||
    query.normalized.includes("coin") ||
    query.normalized.includes("bill")
  const isRepairOrientedManual =
    manual.manualType.includes("repair") ||
    manual.manualType.includes("service") ||
    manual.manualType.includes("parts")

  if (looksLikeFaultQuery && isRepairOrientedManual) {
    score += 6
  }

  return score
}

/**
 * Scores a chunk against a prepared query, starting from the (possibly
 * discounted) score of its manual. An empty query returns `manualScore` as is.
 */
function scoreChunk(
  chunk: ChunkInternal,
  query: PreparedQuery,
  manualScore: number
) {
  if (!query.normalized) {
    return manualScore
  }

  let score = manualScore + chunk.sourceConfidence * 8

  if (query.manufacturerFilters.length > 0) {
    score += query.manufacturerFilters.includes(
      canonicalManufacturer(chunk.manufacturer)
    )
      ? 6
      : -4
  }

  if (chunk.searchText.includes(query.normalized)) {
    score += 18
  }

  for (const token of query.tokens) {
    if (!token) {
      continue
    }

    if (chunk.tokenSet.has(token)) {
      score += query.modelTokens.includes(token) ? 7 : 3
      continue
    }

    if (token.length >= 5 && chunk.searchText.includes(token)) {
      score += 1
    }
  }

  const sectionTitle = (chunk.sectionTitle || "").toLowerCase()

  const looksLikePartsQuery =
    query.normalized.includes("parts") ||
    query.normalized.includes("part") ||
    query.normalized.includes("bill") ||
    query.normalized.includes("coin")
  if (looksLikePartsQuery && sectionTitle.includes("parts")) {
    score += 6
  }

  const looksLikeFaultQuery =
    query.normalized.includes("error") ||
    query.normalized.includes("not ") ||
    query.normalized.includes("won t") ||
    query.normalized.includes("wont")
  if (looksLikeFaultQuery && sectionTitle.includes("troubleshooting")) {
    score += 5
  }

  return score
}

/** Projects an internal manual to the public candidate shape. */
function buildCandidate(manual: ManualInternal, score: number): ManualCandidate {
  return {
    manualId: manual.manualId,
    filename: manual.filename,
    manufacturer: manual.manufacturer,
    category: manual.category,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    score,
    confidence: clamp(score / 38),
  }
}

/** Creates an internal chunk for `manual`, precomputing its search fields and citation. */
function makeChunk(args: {
  manual: ManualInternal
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceConfidence: number
}) {
  const searchText = normalizeText(
    [
      args.manual.filename,
      args.manual.manufacturer,
      args.manual.model,
      args.sectionTitle,
      args.text,
    ]
      .filter(Boolean)
      .join(" ")
  )

  return {
    manualId: args.manual.manualId,
    filename: args.manual.filename,
    manufacturer: args.manual.manufacturer,
    model: args.manual.model,
    manualType: args.manual.manualType,
    pageNumber: args.pageNumber,
    sectionTitle: args.sectionTitle,
    text: args.text,
    manualUrl: args.manual.manualUrl,
    thumbnailUrl: args.manual.thumbnailUrl,
    sourceConfidence: args.sourceConfidence,
    matchScore: 0,
    citation: buildCitation(
      args.manual.filename,
      args.manual.manufacturer,
      args.pageNumber
    ),
    searchText,
    tokenSet: new Set(tokenize(searchText)),
  } satisfies ChunkInternal
}

/** Copies a chunk without its internal search fields. */
function stripChunkForResponse(chunk: ChunkInternal): ManualKnowledgeChunk {
  return {
    manualId: chunk.manualId,
    filename: chunk.filename,
    manufacturer: chunk.manufacturer,
    model: chunk.model,
    manualType: chunk.manualType,
    pageNumber: chunk.pageNumber,
    sectionTitle: chunk.sectionTitle,
    text: chunk.text,
    manualUrl: chunk.manualUrl,
    thumbnailUrl: chunk.thumbnailUrl,
    sourceConfidence: chunk.sourceConfidence,
    matchScore: chunk.matchScore,
    citation: chunk.citation,
  }
}

/** Removes soft hyphens, collapses whitespace runs to one space and trims. */
function cleanChunkText(value: string) {
  return value
    .replace(/\u00ad/g, "")
    .replace(/\s+/g, " ")
    .trim()
}

/** Cleaned section title, or `null` when nothing is left. */
function cleanSectionTitle(value?: string) {
  const cleaned = cleanChunkText(value || "")
  return cleaned || null
}

/**
 * Decides whether text is worth indexing. Text under 70 characters must
 * mention a domain keyword; longer text must contain at least 25 letters.
 *
 * Boilerplate such as "table of contents" or "contact us" is short and has no
 * domain keyword, so the first rule already rejects it.
 */
function isUsefulChunkText(text: string) {
  if (!text) {
    return false
  }

  if (text.length < 70) {
    return /error|vend|coin|bill|cool|stuck|motor|sensor|jam|door|part/i.test(
      text
    )
  }

  const alphaChars = text.replace(/[^a-z]/gi, "").length
  return alphaChars >= 25
}

/**
 * Lowercases text and reduces it to ASCII letters/digits separated by single
 * spaces.
 *
 * Combining marks produced by NFKD decomposition are removed before
 * punctuation is replaced, so accented words stay whole ("crème" -> "creme"
 * rather than "cre me").
 */
function normalizeText(value: string) {
  return value
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/[^a-zA-Z0-9]+/g, " ")
    .toLowerCase()
    .trim()
}

/** Splits normalized text into tokens, dropping 1-char tokens, stopwords and `page<N>` markers. */
function tokenize(value: string) {
  return normalizeText(value)
    .split(" ")
    .map((token) => token.trim())
    .filter(
      (token) =>
        token.length > 1 && !STOPWORDS.has(token) && !/^page\d*$/.test(token)
    )
}

/** Dash-separated slug of the value with any `.pdf`/`.json` suffix removed. */
function normalizeIdentifier(value: string) {
  return normalizeText(stripExtension(value)).replace(/\s+/g, "-")
}

/** Removes a trailing `.pdf`, then a trailing `.json` (both case-insensitive). */
function stripExtension(value: string) {
  return value.replace(/\.pdf$/i, "").replace(/\.json$/i, "")
}

/**
 * Maps a manufacturer name to its canonical key in `MANUFACTURER_ALIASES`.
 * Returns the normalized input when no entry matches.
 *
 * NOTE(review): aliases are matched as substrings here, so a short alias such
 * as "ap" also matches inside longer names — confirm that is acceptable for
 * the manufacturer values stored in the catalog.
 */
function canonicalManufacturer(value: string) {
  const normalized = normalizeText(value)

  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    if (
      canonical === normalized ||
      aliases.some((alias) => normalized.includes(normalizeText(alias)))
    ) {
      return canonical
    }
  }

  return normalized
}

/** Known aliases of the manufacturer, or just the given value when it is unknown. */
function aliasesForManufacturer(value: string) {
  const canonical = canonicalManufacturer(value)
  return MANUFACTURER_ALIASES[canonical] || [value]
}

/** First model-like token in the manual's filename, or `null`. */
function guessModelFromManual(manual: Manual) {
  const filename = normalizeText(stripExtension(manual.filename || ""))
  const modelToken = extractModelTokens(filename)[0]
  return modelToken || null
}

/**
 * Collapses a free-form manual type into "parts", "operator", "service", a
 * dash-separated slug, or "manual" when empty.
 */
function normalizeManualType(value?: string | null) {
  const normalized = normalizeText(value || "")

  if (!normalized) {
    return "manual"
  }
  if (normalized.includes("part")) {
    return "parts"
  }
  if (normalized.includes("operator")) {
    return "operator"
  }
  if (normalized.includes("service")) {
    return "service"
  }

  return normalized.replace(/\s+/g, "-")
}

/** Cleaned text, or `null` when nothing is left. */
function normalizeNullable(value?: string | null) {
  const normalized = cleanChunkText(value || "")
  return normalized || null
}

/** Unique tokens that contain a run of two or more digits (optional letter prefix). */
function extractModelTokens(value: string) {
  const matches = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9]*\b/g) || []
  return Array.from(new Set(matches))
}

function looksLikeModelTokenQuery(value: string) {
  return extractModelTokens(value).length > 0
}

/** True when the query contains any of `RISKY_MANUAL_KEYWORDS` as a substring. */
function isRiskyManualQuery(value: string) {
  const normalized = normalizeText(value)
  return RISKY_MANUAL_KEYWORDS.some((keyword) =>
    normalized.includes(normalizeText(keyword))
  )
}

/** Manual label, followed by ", page N" when a non-zero page number is given. */
function buildCitation(
  filename: string,
  manufacturer: string,
  pageNumber: number | null
) {
  return `${buildManualLabel(filename, manufacturer)}${
    pageNumber ? `, page ${pageNumber}` : ""
  }`
}

/** "<manufacturer> <humanized filename stem>", trimmed. */
function buildManualLabel(filename: string, manufacturer: string) {
  const stem = humanizeToken(stripExtension(filename))
  const prefix = manufacturer ? `${manufacturer} ` : ""
  return `${prefix}${stem}`.trim()
}

/** Replaces dashes/underscores with spaces and collapses whitespace. */
function humanizeToken(value: string) {
  return value
    .replace(/[-_]+/g, " ")
    .replace(/\s+/g, " ")
    .trim()
}

/** Comparator that sorts candidates by descending score. */
function compareByScore(left: ManualCandidate, right: ManualCandidate) {
  return right.score - left.score
}

/** Clamps a number to the range [0, 1]. */
function clamp(value: number) {
  return Math.max(0, Math.min(1, value))
}

/** Returns `value` unchanged if it fits, otherwise cuts it and appends an ellipsis. */
function truncateText(value: string, maxLength: number) {
  if (value.length <= maxLength) {
    return value
  }

  return `${value.slice(0, maxLength - 1).trimEnd()}…`
}

/**
 * Reads and parses a UTF-8 JSON file.
 *
 * The result is cast to `T` without validation; callers must tolerate
 * malformed data. Rejects when the file cannot be read or parsed.
 */
async function readJsonFile<T>(path: string) {
  return JSON.parse(await readFile(path, "utf8")) as T
}