// Rocky_Mountain_Vending/lib/manuals-knowledge.ts
//
// 1604 lines
// 39 KiB
// TypeScript

import { existsSync } from "node:fs"
import { readFile, readdir } from "node:fs/promises"
import { basename, join } from "node:path"
import { listConvexManuals } from "@/lib/convex-service"
import { scanManuals } from "@/lib/manuals"
import { getManualsDataRoot } from "@/lib/manuals-paths"
import {
buildManualAssetUrl,
buildThumbnailAssetUrl,
} from "@/lib/manuals-storage"
import type { Manual } from "@/lib/manuals-types"
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"
/** Path segments, relative to the manuals data root, of the optimized-manuals tree. */
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]
/** Path segments of the directory that is scanned for structured per-manual `.json` records. */
const STRUCTURED_MANUALS_DIR = [
...MANUALS_OPTIMIZED_ROOT,
"training_data",
"structured",
]
/** Path segments of the single JSON file that holds the extracted manual content records. */
const EXTRACTED_CONTENT_FILE = [
...MANUALS_OPTIMIZED_ROOT,
"extracted_content",
"manuals_extracted_content.json",
]
/** Tenant id used when the tenant environment variables do not provide a usable value. */
const DEFAULT_MANUALS_PLATFORM_TENANT_ID = "rocky-mountain-vending"
/**
 * Words that `tokenize` discards. Besides common English filler this includes
 * domain words ("machine", "manual", "service") that appear in most queries.
 */
const STOPWORDS = new Set([
"a",
"an",
"and",
"are",
"at",
"be",
"by",
"for",
"from",
"help",
"how",
"i",
"in",
"is",
"it",
"machine",
"manual",
"me",
"my",
"of",
"on",
"or",
"our",
"please",
"service",
"that",
"the",
"this",
"to",
"up",
"with",
])
/**
 * Phrases that make `isRiskyManualQuery` flag a query as risky. Each phrase is
 * normalized and looked up as a substring of the normalized query.
 */
const RISKY_MANUAL_KEYWORDS = [
"wiring",
"diagram",
"voltage",
"compressor",
"refrigerant",
"bypass",
"jumper",
"board level",
"schematic",
"electrical",
"rewire",
"disassemble",
"tear down",
]
/**
 * Phrases whose presence (as a substring of the normalized query) makes
 * `shouldUseManualKnowledgeForChat` treat a chat message as manual-related.
 */
const MANUAL_QUERY_HINTS = [
"manual",
"model",
"serial",
"error",
"code",
"parts",
"part",
"troubleshoot",
"troubleshooting",
"not cooling",
"not vending",
"coin",
"bill acceptor",
"bill",
"coin mech",
"validator",
"jam",
"stuck",
"door",
"display",
"keypad",
"compressor",
"motor",
"sensor",
]
/**
 * Maps each canonical manufacturer name (the key) to the aliases that should
 * resolve to it. Aliases are compared against text after `normalizeText`, so
 * punctuation variants such as "dixie-narco" and "dixie narco" are equivalent.
 * Used by `prepareQuery` and `canonicalManufacturer`.
 */
const MANUFACTURER_ALIASES: Record<string, string[]> = {
"automatic products": ["automatic products", "automatic-products", "ap"],
"coinco": ["coinco"],
"crane": [
"crane",
"national",
"national vendors",
"merchant",
"merchant series",
"shopper",
],
"dixie narco": [
"dixie",
"narco",
"dixie narco",
"dixie-narco",
"dn",
"bevmax",
],
"gpl": ["gpl", "general products"],
"mei mars": ["mei", "mars", "mei mars", "bill validator"],
"royal vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
"rowe": ["rowe"],
"seaga": ["seaga"],
"usi": ["usi", "u select it", "u-select-it", "uselectit"],
"vendo": ["vendo", "sanden"],
}
/** A manual that may answer a query, as produced by `buildCandidate`. */
export type ManualCandidate = {
manualId: string
filename: string
manufacturer: string
category: string
manualUrl: string | null
thumbnailUrl: string | null
// Raw relevance score handed to `buildCandidate` (unbounded).
score: number
// `score / 38` clamped to the range 0..1.
confidence: number
}
/** A retrievable excerpt of a manual together with the metadata needed to cite it. */
export type ManualKnowledgeChunk = {
manualId: string
filename: string
manufacturer: string
model: string | null
manualType: string
// null when the source record carried no page number.
pageNumber: number | null
sectionTitle: string | null
text: string
manualUrl: string | null
thumbnailUrl: string | null
// Trust in the source of the text; set per source kind when the chunk is built.
sourceConfidence: number
// Query-specific score; 0 on stored chunks, filled in by `retrieveManualContext`.
matchScore: number
// Human-readable label built by `buildCitation` (manual label plus optional page).
citation: string
}
/** Optional hints that `prepareQuery` folds into the prepared query. */
export type ManualKnowledgeFilters = {
// Canonicalized and added to the manufacturer filters.
manufacturer?: string | null
// Normalized to an identifier and used to boost/penalize manuals by id.
manualId?: string | null
// Normalized and added to the query tokens.
model?: string | null
}
/** Result of `retrieveManualContext`. */
export type RetrieveManualContextResult = {
// The query string exactly as passed in.
query: string
// Manual of the best chunk when it is among the candidates, otherwise the top candidate; null when there are none.
bestManual: ManualCandidate | null
// Highest-scoring manuals (the local provider returns at most four).
manualCandidates: ManualCandidate[]
// Highest-scoring chunks (the local provider returns at most five).
topChunks: ManualKnowledgeChunk[]
// True when no chunk was found or the top chunk/manual scored below the provider's thresholds.
needsClarification: boolean
// True when the query contains one of RISKY_MANUAL_KEYWORDS.
isRisky: boolean
}
/** Result of `getManualCitationContext`: the manual (if known) and chunks usable as citations. */
export type ManualCitationContext = {
manual: ManualCandidate | null
citations: ManualKnowledgeChunk[]
}
/** Compact, log-friendly digest of a retrieval attempt, built by `summarizeManualRetrieval`. */
export type ManualRetrievalSummary = {
// Whether retrieval was attempted at all.
ran: boolean
query: string
bestManualId: string | null
manualCandidateIds: string[]
topChunkCitations: string[]
// null when retrieval did not run or produced no result.
needsClarification: boolean | null
// null when retrieval did not run or produced no result.
isRisky: boolean | null
// Message of the reported error, or null when none (or when it was neither an Error nor a string).
error: string | null
}
/** Contract for a source of manual knowledge; implemented in this file by `LocalManualKnowledgeProvider`. */
export interface ManualKnowledgeProvider {
// Ranks manuals against a free-text query.
findManualCandidates(query: string): Promise<ManualCandidate[]>
// Ranks manuals and chunks against a query, optionally narrowed by filters.
retrieveManualContext(
query: string,
filters?: ManualKnowledgeFilters
): Promise<RetrieveManualContextResult>
// Looks up a manual by id and returns chunks for it, optionally restricted to one page.
getManualCitationContext(
manualId: string,
pageNumber?: number
): Promise<ManualCitationContext>
}
/** One titled section of a manual; every field is optional because the data is parsed from JSON. */
type StructuredSection = {
title?: string
pageNumber?: number
text?: string
}
/** One problem/solution pair from a structured manual record. */
type StructuredTroubleshooting = {
problem?: string
solution?: string
pageNumber?: number
}
/** One entry of a parts list. */
type StructuredPart = {
partNumber?: string
description?: string
pageNumber?: number
}
/** Shape read from each `.json` file in the structured manuals directory. */
type StructuredManualRecord = {
// Falls back to the file name (without extension) when absent; see `loadStructuredManualRecords`.
manualId?: string
manufacturer?: string
model?: string
manualType?: string
content?: {
sections?: StructuredSection[]
troubleshooting?: StructuredTroubleshooting[]
partsDatabase?: StructuredPart[]
}
}
/** Text of a single page from the extracted-content file. */
type ExtractedPage = {
pageNumber?: number
text?: string
wordCount?: number
}
/** A parts list found on a page; its page number is the fallback for parts that carry none. */
type ExtractedPartList = {
pageNumber?: number
parts?: StructuredPart[]
}
/** One entry of the extracted-content JSON file; entries without `filename` are skipped when loading. */
type ExtractedManualRecord = {
filename?: string
sections?: StructuredSection[]
partsLists?: ExtractedPartList[]
text?: {
fullText?: string
pages?: ExtractedPage[]
}
}
/** Manual entry as read from the shared platform's per-tenant `manuals.json`. */
type SharedPlatformManualRecord = {
manualId: string
filename: string
manufacturer: string
category: string | null
model: string | null
manualType: string
// Used as the manual URL source when `manualUrl` is absent.
sourcePath?: string | null
manualUrl?: string | null
thumbnailUrl?: string | null
// Additional file names folded into the manual's search text.
sourceFilenames?: string[]
}
/** Chunk entry as read from the shared platform's per-tenant `chunks.json`. */
type SharedPlatformChunkRecord = {
manualId: string
// Used to label the chunk when its manual is not present in `manuals.json`.
title?: string
manufacturer: string
model: string | null
manualType: string
pageNumber: number | null
sectionTitle: string | null
text: string
manualUrl?: string | null
thumbnailUrl?: string | null
// Preferred source for the chunk's sourceConfidence.
metadataConfidence?: number
// Used for sourceConfidence when `metadataConfidence` is absent.
overallQuality?: number
}
/** A query after normalization, produced once by `prepareQuery` and reused for every scored item. */
type PreparedQuery = {
// Whole query after `normalizeText`.
normalized: string
// De-duplicated tokens of the query (plus the normalized model filter, if given).
tokens: string[]
// Tokens that look like model numbers; see `extractModelTokens`.
modelTokens: string[]
// Canonical manufacturer names detected in the query or supplied as a filter.
manufacturerFilters: string[]
// Normalized manual id filter, or null when none was supplied.
manualIdFilter: string | null
}
/** Manual metadata plus the precomputed search fields used for scoring. */
type ManualInternal = {
manualId: string
filename: string
manufacturer: string
category: string
model: string | null
manualType: string
manualUrl: string | null
thumbnailUrl: string | null
// Normalized concatenation of the manual's searchable fields.
searchText: string
// Tokens of the searchable fields, for exact token lookups.
tokenSet: Set<string>
// Result of `canonicalManufacturer` for this manual's manufacturer.
normalizedManufacturer: string
}
/** A chunk plus its precomputed search fields; the search fields are removed by `stripChunkForResponse`. */
type ChunkInternal = Omit<ManualKnowledgeChunk, "matchScore"> & {
searchText: string
tokenSet: Set<string>
matchScore: number
}
/** Everything the local provider needs in memory: manuals and chunks, each as a list and indexed by manual id. */
type ManualKnowledgeArtifact = {
manuals: ManualInternal[]
manualById: Map<string, ManualInternal>
chunks: ChunkInternal[]
chunksByManualId: Map<string, ChunkInternal[]>
}
/** A provider that additionally exposes `resetCache`; checked for at runtime by `resetManualKnowledgeCache`. */
type ResettableManualKnowledgeProvider = ManualKnowledgeProvider & {
resetCache(): void
}
/** Module-wide provider instance; stays null until it is first requested. */
let defaultProvider: ManualKnowledgeProvider | null = null

/**
 * Returns the shared manual knowledge provider, creating a
 * `LocalManualKnowledgeProvider` on the first call and reusing it afterwards.
 */
export function getManualKnowledgeProvider(): ManualKnowledgeProvider {
  if (defaultProvider) {
    return defaultProvider
  }
  const created = new LocalManualKnowledgeProvider()
  defaultProvider = created
  return created
}
/** Convenience wrapper: ranks manuals against `query` using the shared provider. */
export async function findManualCandidates(query: string) {
  const provider = getManualKnowledgeProvider()
  return await provider.findManualCandidates(query)
}
/**
 * Convenience wrapper: retrieves manual candidates and matching chunks for
 * `query` (optionally narrowed by `filters`) using the shared provider.
 */
export async function retrieveManualContext(
  query: string,
  filters?: ManualKnowledgeFilters
) {
  const provider = getManualKnowledgeProvider()
  return await provider.retrieveManualContext(query, filters)
}
/**
 * Convenience wrapper: fetches the citation context for one manual
 * (optionally limited to `pageNumber`) using the shared provider.
 */
export async function getManualCitationContext(
  manualId: string,
  pageNumber?: number
) {
  const provider = getManualKnowledgeProvider()
  return await provider.getManualCitationContext(manualId, pageNumber)
}
/**
 * Drops the shared provider's cached data, if a provider has been created and
 * it exposes a `resetCache` function. Does nothing otherwise.
 */
export function resetManualKnowledgeCache() {
  if (!defaultProvider || !("resetCache" in defaultProvider)) {
    return
  }
  const resettable = defaultProvider as ResettableManualKnowledgeProvider
  if (typeof resettable.resetCache === "function") {
    resettable.resetCache()
  }
}
/**
 * Condenses a retrieval attempt into a small summary object.
 *
 * @param args.ran - Whether retrieval was attempted.
 * @param args.query - The query that was (or would have been) used.
 * @param args.result - The retrieval result, if any.
 * @param args.error - Anything thrown; only `Error` instances and strings are reported.
 * @returns Ids/citations of at most four candidates and four chunks, the
 *   clarification and risk flags (null when unknown), and the error message.
 */
export function summarizeManualRetrieval(args: {
  ran: boolean
  query: string
  result?: RetrieveManualContextResult | null
  error?: unknown
}): ManualRetrievalSummary {
  let error: string | null = null
  if (args.error instanceof Error) {
    error = args.error.message
  } else if (typeof args.error === "string") {
    error = args.error
  }

  if (!args.ran) {
    // Nothing was retrieved, so every result-derived field is empty/unknown.
    return {
      ran: false,
      query: args.query,
      bestManualId: null,
      manualCandidateIds: [],
      topChunkCitations: [],
      needsClarification: null,
      isRisky: null,
      error,
    }
  }

  const result = args.result
  const candidates = result?.manualCandidates || []
  const chunks = result?.topChunks || []
  const manualCandidateIds = candidates
    .slice(0, 4)
    .map((candidate) => candidate.manualId)
  const topChunkCitations = chunks.slice(0, 4).map((chunk) => chunk.citation)

  return {
    ran: true,
    query: args.query,
    bestManualId: result?.bestManual?.manualId || null,
    manualCandidateIds,
    topChunkCitations,
    needsClarification: result?.needsClarification ?? null,
    isRisky: result?.isRisky ?? null,
    error,
  }
}
/**
 * Decides whether a chat message should trigger manual retrieval.
 *
 * Returns false for a query that normalizes to nothing. Otherwise returns true
 * when the query contains a model-like token or any of MANUAL_QUERY_HINTS.
 *
 * NOTE(review): the intent is inspected, but every branch ends up returning the
 * same hint check, so `intent` currently has no effect on the result. Confirm
 * whether a manual/repair/parts intent was meant to be sufficient on its own.
 *
 * @param intent - Detected chat intent, if any.
 * @param query - The user's message.
 */
export function shouldUseManualKnowledgeForChat(
  intent: string | null | undefined,
  query: string
) {
  const normalizedQuery = normalizeText(query)
  if (!normalizedQuery) {
    return false
  }

  const mentionsModel = looksLikeModelTokenQuery(normalizedQuery)
  const hasManualHints =
    mentionsModel ||
    MANUAL_QUERY_HINTS.some((hint) => normalizedQuery.includes(hint))

  const normalizedIntent = normalizeText(intent || "")
  const intentIsManualRelated = ["manual", "repair", "parts"].some((keyword) =>
    normalizedIntent.includes(keyword)
  )
  if (intentIsManualRelated) {
    return hasManualHints
  }

  return mentionsModel ? true : hasManualHints
}
/**
 * Renders a retrieval result as newline-separated instructions and excerpts
 * for inclusion in a prompt.
 *
 * The output always starts with usage and safety guidance, lists up to three
 * candidate manuals, and then either asks for more identification (when no
 * excerpts were found) or lists up to three excerpts, each cut to 420 characters.
 */
export function formatManualContextForPrompt(
  result: RetrieveManualContextResult
) {
  const hasExcerpts = result.topChunks.length > 0
  const excerptGuidance = hasExcerpts
    ? "- Use only the excerpts below for any manuals, parts, or troubleshooting reply."
    : "- No reliable manual excerpt was found."
  const safetyGuidance = result.isRisky
    ? "- The question looks technical or risky. Stay high-level and safe, and do not provide procedural repair steps."
    : "- Stay limited to simple identification, likely issue category, and very basic safe checks."

  const lines = ["Manual knowledge context:", excerptGuidance, safetyGuidance]

  if (result.manualCandidates.length > 0) {
    lines.push("Likely manual candidates:")
    result.manualCandidates.slice(0, 3).forEach((candidate) => {
      lines.push(
        `- ${buildManualLabel(candidate.filename, candidate.manufacturer)}`
      )
    })
  }

  if (!hasExcerpts) {
    // Without excerpts there is nothing to ground an answer on.
    lines.push(
      "- Ask for the brand on the front, model sticker, or a clear photo/video, and offer texting it in for the team to review."
    )
    return lines.join("\n")
  }

  if (result.needsClarification) {
    lines.push(
      "- Confidence is limited. Ask for the brand on the front, the model sticker, or a clear photo/video before sounding certain."
    )
  }

  lines.push("Grounded excerpts:")
  result.topChunks.slice(0, 3).forEach((chunk) => {
    lines.push(`- ${chunk.citation}: ${truncateText(chunk.text, 420)}`)
  })

  return lines.join("\n")
}
/**
 * Provider that answers queries from an in-memory artifact built by
 * `buildArtifact`. The artifact is built on first use and then reused until
 * `resetCache` is called or the build fails.
 */
class LocalManualKnowledgeProvider implements ManualKnowledgeProvider {
  /** In-flight or completed artifact build; null when nothing is cached. */
  private artifactPromise: Promise<ManualKnowledgeArtifact> | null = null

  /** Returns up to six manuals with a positive score for `query`, best first. */
  async findManualCandidates(query: string) {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query)
    return artifact.manuals
      .map((manual) => buildCandidate(manual, scoreManual(manual, prepared)))
      .filter((candidate) => candidate.score > 0)
      .sort(compareByScore)
      .slice(0, 6)
  }

  /**
   * Scores every manual and chunk against `query` and returns the best of each.
   *
   * Chunks inherit their manual's score (scaled by 0.35 when the manual is not
   * among the ten best), must score above 10 to be kept, and at most five are
   * returned. `needsClarification` is set when there is no chunk, the best
   * chunk scores below 24, or the best manual's confidence is below 0.48.
   */
  async retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult> {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query, filters)

    const manualCandidates = artifact.manuals
      .map((manual) => ({ manual, score: scoreManual(manual, prepared) }))
      .filter((entry) => entry.score > 0)
      .sort((left, right) => right.score - left.score)

    // Index candidates by manual id once instead of scanning the candidate
    // list for every chunk. The first entry per id wins, which matches what a
    // linear search over the sorted list would return.
    const candidateByManualId = new Map<
      string,
      { manual: ManualInternal; score: number }
    >()
    for (const entry of manualCandidates) {
      if (!candidateByManualId.has(entry.manual.manualId)) {
        candidateByManualId.set(entry.manual.manualId, entry)
      }
    }

    const leadingManualIds = new Set(
      manualCandidates.slice(0, 10).map((entry) => entry.manual.manualId)
    )

    const topChunks = artifact.chunks
      .map((chunk) => {
        const manualScore = candidateByManualId.get(chunk.manualId)?.score ?? 0
        // Chunks of manuals outside the leading set only get a fraction of
        // their manual's score.
        const inheritedScore = leadingManualIds.has(chunk.manualId)
          ? manualScore
          : manualScore * 0.35
        return { chunk, score: scoreChunk(chunk, prepared, inheritedScore) }
      })
      .filter((entry) => entry.score > 10)
      .sort((left, right) => right.score - left.score)
      .slice(0, 5)
      .map(({ chunk, score }) => ({
        ...stripChunkForResponse(chunk),
        matchScore: score,
      }))

    const leadCandidate = manualCandidates[0]
    const leadCandidateIsWeak = leadCandidate
      ? buildCandidate(leadCandidate.manual, leadCandidate.score).confidence <
        0.48
      : true
    const needsClarification =
      topChunks.length === 0 ||
      topChunks[0].matchScore < 24 ||
      leadCandidateIsWeak

    // Prefer the manual the best chunk came from; fall back to the best manual.
    const manualOfBestChunk =
      topChunks.length > 0
        ? candidateByManualId.get(topChunks[0].manualId)
        : undefined
    const bestEntry = manualOfBestChunk ?? leadCandidate
    const bestManual = bestEntry
      ? buildCandidate(bestEntry.manual, bestEntry.score)
      : null

    return {
      query,
      bestManual,
      manualCandidates: manualCandidates
        .slice(0, 4)
        .map(({ manual, score }) => buildCandidate(manual, score)),
      topChunks,
      needsClarification,
      isRisky: isRiskyManualQuery(query),
    }
  }

  /**
   * Returns the manual with the given id (or null) and up to five of its
   * chunks, restricted to `pageNumber` when one is supplied.
   */
  async getManualCitationContext(manualId: string, pageNumber?: number) {
    const artifact = await this.getArtifact()
    const manual = artifact.manualById.get(manualId) || null
    const chunks = artifact.chunksByManualId.get(manualId) || []
    const citations = chunks
      .filter((chunk) =>
        typeof pageNumber === "number" ? chunk.pageNumber === pageNumber : true
      )
      .slice(0, 5)
      .map(stripChunkForResponse)
    return {
      manual: manual ? buildCandidate(manual, 1) : null,
      citations,
    }
  }

  /**
   * Returns the cached artifact, starting a build when none is cached.
   * A failed build is not kept: the cache is cleared so a later call retries
   * instead of replaying the same rejection forever.
   */
  private async getArtifact() {
    let pending = this.artifactPromise
    if (!pending) {
      const started = buildArtifact()
      started.catch(() => {
        // Only clear the slot if it still holds this failed build.
        if (this.artifactPromise === started) {
          this.artifactPromise = null
        }
      })
      this.artifactPromise = started
      pending = started
    }
    return await pending
  }

  /** Forgets the cached artifact so the next call rebuilds it. */
  resetCache() {
    this.artifactPromise = null
  }
}
/**
 * Builds the in-memory knowledge artifact.
 *
 * Uses the shared platform artifact when one is available. Otherwise combines
 * the manual catalog with structured records and, for manuals that produced no
 * structured chunks, page-level chunks from the extracted-content file. Records
 * that match no catalog manual get a fallback manual entry.
 */
async function buildArtifact(): Promise<ManualKnowledgeArtifact> {
  const sharedArtifact = await loadSharedPlatformArtifact()
  if (sharedArtifact) {
    return sharedArtifact
  }

  // The three sources do not depend on each other, so load them concurrently.
  const [manuals, extractedByFilename, structuredRecords] = await Promise.all([
    loadManualCatalog(),
    loadExtractedContentByFilename(),
    loadStructuredManualRecords(),
  ])

  const manualById = new Map<string, ManualInternal>()
  for (const manual of manuals) {
    manualById.set(manual.manualId, manual)
  }

  const chunks: ChunkInternal[] = []
  const chunksByManualId = new Map<string, ChunkInternal[]>()
  const manualsWithStructuredChunks = new Set<string>()

  for (const record of structuredRecords) {
    const matchedManual = matchStructuredRecordToManual(record, manuals)
    const manual = ensureManualEntry({
      manualById,
      manuals,
      matchedManual,
      fallbackManualId: normalizeIdentifier(record.manualId || "unknown-manual"),
      filename: matchedManual?.filename || `${record.manualId || "manual"}.pdf`,
      manufacturer:
        matchedManual?.manufacturer ||
        humanizeToken(record.manufacturer || "Unknown"),
      category: matchedManual?.category || record.manualType || "manual",
      model: normalizeNullable(record.model),
      manualType: normalizeManualType(record.manualType),
      manualUrl: matchedManual?.manualUrl || null,
      thumbnailUrl: matchedManual?.thumbnailUrl || null,
    })
    const structuredChunks = buildStructuredChunks(record, manual)
    if (structuredChunks.length > 0) {
      manualsWithStructuredChunks.add(manual.manualId)
      addChunks(chunks, chunksByManualId, structuredChunks)
    }
  }

  for (const extracted of extractedByFilename.values()) {
    // A matched manual is used as-is; only unmatched records need a fallback entry.
    const sourceManual =
      matchExtractedRecordToManual(extracted, manuals) ??
      ensureManualEntry({
        manualById,
        manuals,
        matchedManual: null,
        fallbackManualId: normalizeIdentifier(
          stripExtension(extracted.filename || "manual")
        ),
        filename: extracted.filename || "manual.pdf",
        manufacturer: "Unknown",
        category: "manual",
        model: null,
        manualType: "manual",
        manualUrl: null,
        thumbnailUrl: null,
      })

    // Structured chunks take precedence over raw page text for the same manual.
    if (manualsWithStructuredChunks.has(sourceManual.manualId)) {
      continue
    }
    const fallbackChunks = buildExtractedFallbackChunks(extracted, sourceManual)
    if (fallbackChunks.length > 0) {
      addChunks(chunks, chunksByManualId, fallbackChunks)
    }
  }

  return {
    manuals,
    manualById,
    chunks,
    chunksByManualId,
  }
}
/**
 * Loads a prebuilt artifact from the shared manuals platform, if present.
 *
 * Looks for `output/tenants/<tenantId>/manuals.json` and `chunks.json` under
 * the platform root. Returns null when the root or either file is missing.
 */
async function loadSharedPlatformArtifact(): Promise<ManualKnowledgeArtifact | null> {
  const platformRoot = resolveManualsPlatformRoot()
  if (!platformRoot) {
    return null
  }

  const tenantDir = join(
    platformRoot,
    "output",
    "tenants",
    getManualsPlatformTenantId()
  )
  const manualsPath = join(tenantDir, "manuals.json")
  const chunksPath = join(tenantDir, "chunks.json")
  const bothFilesPresent = existsSync(manualsPath) && existsSync(chunksPath)
  if (!bothFilesPresent) {
    return null
  }

  const [manualRecords, chunkRecords] = await Promise.all([
    readJsonFile<SharedPlatformManualRecord[]>(manualsPath),
    readJsonFile<SharedPlatformChunkRecord[]>(chunksPath),
  ])

  const manualById = new Map<string, ManualInternal>()
  const manuals = manualRecords.map((record): ManualInternal => {
    const searchable = [
      record.filename,
      record.sourcePath,
      record.manufacturer,
      record.category,
      record.model,
      record.manualType,
      ...(record.sourceFilenames || []),
    ]
      .filter(Boolean)
      .join(" ")

    return {
      manualId: record.manualId,
      filename: record.filename,
      manufacturer: record.manufacturer,
      category: record.category || record.manualType || "manual",
      model: record.model || null,
      manualType: normalizeManualType(record.manualType),
      manualUrl: toSiteManualUrl(record.manualUrl || record.sourcePath || null),
      thumbnailUrl: toSiteThumbnailUrl(record.thumbnailUrl || null),
      searchText: normalizeText(searchable),
      tokenSet: new Set(tokenize(searchable)),
      normalizedManufacturer: canonicalManufacturer(record.manufacturer),
    }
  })
  for (const manual of manuals) {
    manualById.set(manual.manualId, manual)
  }

  const chunksByManualId = new Map<string, ChunkInternal[]>()
  const chunks = chunkRecords.map((record): ChunkInternal => {
    // Prefer the manual's own metadata; fall back to what the chunk carries.
    const owner = manualById.get(record.manualId) || null
    const filename =
      owner?.filename || humanizeToken(record.title || record.manualId)
    const manufacturer = owner?.manufacturer || record.manufacturer
    const manualUrl =
      owner?.manualUrl || toSiteManualUrl(record.manualUrl || null)
    const thumbnailUrl =
      owner?.thumbnailUrl || toSiteThumbnailUrl(record.thumbnailUrl || null)

    const searchableParts = [
      filename,
      manufacturer,
      record.model,
      record.sectionTitle,
      record.text,
      record.manualType,
    ]
    const searchText = normalizeText(searchableParts.filter(Boolean).join(" "))
    const confidence =
      record.metadataConfidence ?? record.overallQuality ?? 0.76

    return {
      manualId: record.manualId,
      filename,
      manufacturer,
      model: record.model,
      manualType: normalizeManualType(record.manualType),
      pageNumber: record.pageNumber,
      sectionTitle: record.sectionTitle,
      text: record.text,
      manualUrl,
      thumbnailUrl,
      sourceConfidence: clamp(confidence),
      matchScore: 0,
      citation: buildCitation(filename, manufacturer, record.pageNumber),
      searchText,
      tokenSet: new Set(tokenize(searchText)),
    }
  })
  for (const chunk of chunks) {
    const bucket = chunksByManualId.get(chunk.manualId)
    if (bucket) {
      bucket.push(chunk)
    } else {
      chunksByManualId.set(chunk.manualId, [chunk])
    }
  }

  return { manuals, manualById, chunks, chunksByManualId }
}
/**
 * Returns the first existing manuals-platform directory, or null when none
 * exists. Checked in order: the MANUALS_PLATFORM_ROOT environment variable,
 * `../manuals-platform`, and `./manuals-platform` relative to the working directory.
 */
function resolveManualsPlatformRoot() {
  const searchOrder = [
    process.env.MANUALS_PLATFORM_ROOT,
    join(process.cwd(), "..", "manuals-platform"),
    join(process.cwd(), "manuals-platform"),
  ]
  for (const location of searchOrder) {
    // Unset or empty entries are skipped rather than probed.
    if (location && existsSync(location)) {
      return location
    }
  }
  return null
}
/**
 * Returns the tenant id for the shared manuals platform.
 *
 * Uses the first of MANUALS_PLATFORM_TENANT_ID and SITE_MANUALS_TENANT_ID that
 * is non-blank after trimming, and the default tenant id otherwise. Each
 * variable is trimmed before it is considered, so a whitespace-only primary
 * variable no longer hides a valid secondary one.
 */
function getManualsPlatformTenantId() {
  const configured = [
    process.env.MANUALS_PLATFORM_TENANT_ID,
    process.env.SITE_MANUALS_TENANT_ID,
  ]
  for (const value of configured) {
    const trimmed = value?.trim()
    if (trimmed) {
      return trimmed
    }
  }
  return DEFAULT_MANUALS_PLATFORM_TENANT_ID
}
/**
 * Converts a stored manual location into a site URL.
 * Empty values give null, absolute http(s) URLs pass through unchanged, and
 * anything else has a leading `manuals/` (then `/manuals/`) prefix removed
 * before being handed to `buildManualAssetUrl`.
 */
function toSiteManualUrl(value: string | null) {
  if (!value) {
    return null
  }
  const isAbsoluteUrl = /^https?:\/\//i.test(value)
  if (isAbsoluteUrl) {
    return value
  }
  const withoutBarePrefix = value.replace(/^manuals\//i, "")
  const withoutSlashPrefix = withoutBarePrefix.replace(/^\/manuals\//i, "")
  return buildManualAssetUrl(withoutSlashPrefix)
}
/**
 * Converts a stored thumbnail location into a site URL.
 * Empty values give null, absolute http(s) URLs pass through unchanged, and
 * anything else has a leading `thumbnails/` (then `/thumbnails/`) prefix
 * removed before being handed to `buildThumbnailAssetUrl`.
 */
function toSiteThumbnailUrl(value: string | null) {
  if (!value) {
    return null
  }
  const isAbsoluteUrl = /^https?:\/\//i.test(value)
  if (isAbsoluteUrl) {
    return value
  }
  const withoutBarePrefix = value.replace(/^thumbnails\//i, "")
  const withoutSlashPrefix = withoutBarePrefix.replace(/^\/thumbnails\//i, "")
  return buildThumbnailAssetUrl(withoutSlashPrefix)
}
/**
 * Loads the manual catalog and converts it to internal records.
 *
 * Manuals come from Convex; the filesystem scan is used only when Convex
 * returns nothing. A failure of either source is treated as an empty list.
 * Entries that normalize to the same manual id overwrite earlier ones.
 */
async function loadManualCatalog() {
  let sourceManuals = await listConvexManuals().catch(() => [] as Manual[])
  if (sourceManuals.length === 0) {
    sourceManuals = await scanManuals().catch(() => [] as Manual[])
  }

  const byManualId = new Map<string, ManualInternal>()
  for (const manual of sourceManuals) {
    const manualId = normalizeIdentifier(manual.path || manual.filename)
    const manufacturer = manual.manufacturer || "Unknown"
    const filename = manual.filename || basename(manual.path)
    const category = manual.category || "manual"
    const model = guessModelFromManual(manual)

    // Everything a user might type to find this manual, joined into one string.
    const searchable = [
      filename,
      stripExtension(filename),
      manual.path,
      manufacturer,
      category,
      ...(manual.searchTerms || []),
      ...(manual.commonNames || []),
      ...aliasesForManufacturer(manufacturer),
      model || "",
    ]
      .filter(Boolean)
      .join(" ")

    const entry: ManualInternal = {
      manualId,
      filename,
      manufacturer,
      category,
      model,
      manualType: normalizeManualType(category),
      manualUrl: getManualUrl(manual),
      thumbnailUrl: getThumbnailUrl(manual),
      searchText: normalizeText(searchable),
      tokenSet: new Set(tokenize(searchable)),
      normalizedManufacturer: canonicalManufacturer(manufacturer),
    }
    byManualId.set(manualId, entry)
  }

  return Array.from(byManualId.values())
}
/**
 * Reads every `.json` file in the structured manuals directory.
 *
 * Each record gets a `manualId`, taken from the file content or, failing that,
 * from the file name without its extension. When the directory does not exist
 * a warning is logged and an empty list is returned, so the rest of the
 * knowledge base can still be built. A file that cannot be read or parsed
 * still rejects.
 */
async function loadStructuredManualRecords(): Promise<
  Array<StructuredManualRecord & { manualId: string }>
> {
  const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  if (!existsSync(directory)) {
    console.warn(
      `[manuals-knowledge] Structured manuals directory not found: ${directory}`
    )
    return []
  }

  const entries = await readdir(directory, { withFileTypes: true })
  const files = entries
    .filter(
      (entry) => entry.isFile() && entry.name.toLowerCase().endsWith(".json")
    )
    .map((entry) => entry.name)

  return await Promise.all(
    files.map(async (filename) => {
      const parsed = await readJsonFile<StructuredManualRecord>(
        join(directory, filename)
      )
      return {
        ...parsed,
        manualId: parsed.manualId || stripExtension(filename),
      }
    })
  )
}
/**
 * Reads the extracted-content file and indexes its records by normalized
 * file name. Records without a `filename` are skipped.
 *
 * When the file does not exist, or does not contain a JSON array, a warning is
 * logged and an empty map is returned so the rest of the knowledge base can
 * still be built. A file that exists but is not valid JSON still rejects.
 */
async function loadExtractedContentByFilename() {
  const byFilename = new Map<string, ExtractedManualRecord>()
  const path = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  if (!existsSync(path)) {
    console.warn(`[manuals-knowledge] Extracted content file not found: ${path}`)
    return byFilename
  }

  const records = await readJsonFile<ExtractedManualRecord[]>(path)
  if (!Array.isArray(records)) {
    console.warn(
      `[manuals-knowledge] Extracted content file is not a JSON array: ${path}`
    )
    return byFilename
  }

  for (const record of records) {
    if (!record.filename) {
      continue
    }
    byFilename.set(normalizeIdentifier(record.filename), record)
  }
  return byFilename
}
/**
 * Finds the catalog manual that best matches a structured record.
 *
 * The record's id, manufacturer, model and type are scored against every
 * manual as if they were a query. Returns the highest-scoring manual (the
 * earliest one on ties) when its score is at least 12, otherwise null.
 */
function matchStructuredRecordToManual(
  record: StructuredManualRecord,
  manuals: ManualInternal[]
) {
  const descriptor = [
    record.manualId,
    record.manufacturer,
    record.model,
    record.manualType,
  ]
    .filter(Boolean)
    .join(" ")
  const prepared = prepareQuery(descriptor, {
    manufacturer: record.manufacturer,
    model: record.model,
  })

  let best: { manual: ManualInternal; score: number } | null = null
  for (const manual of manuals) {
    const score = scoreManual(manual, prepared)
    if (best === null || score > best.score) {
      best = { manual, score }
    }
  }

  if (best === null || best.score < 12) {
    return null
  }
  return best.manual
}
/**
 * Finds the catalog manual an extracted-content record belongs to.
 *
 * An exact match on normalized file name or manual id wins. Otherwise the file
 * name (without extension) is scored against every manual as a query, and the
 * highest-scoring manual (the earliest one on ties) is returned when its score
 * is at least 10. Returns null when nothing qualifies.
 */
function matchExtractedRecordToManual(
  record: ExtractedManualRecord,
  manuals: ManualInternal[]
) {
  const filename = record.filename || ""
  const wantedId = normalizeIdentifier(filename)

  for (const manual of manuals) {
    const sameFilename = normalizeIdentifier(manual.filename) === wantedId
    if (sameFilename || normalizeIdentifier(manual.manualId) === wantedId) {
      return manual
    }
  }

  const prepared = prepareQuery(stripExtension(filename))
  let best: { manual: ManualInternal; score: number } | null = null
  for (const manual of manuals) {
    const score = scoreManual(manual, prepared)
    if (best === null || score > best.score) {
      best = { manual, score }
    }
  }

  if (best === null || best.score < 10) {
    return null
  }
  return best.manual
}
/**
 * Turns a structured record into chunks for `manual`: one per useful section
 * (confidence 0.92), one per useful troubleshooting entry (0.95), followed by
 * the parts chunks (0.8). Text judged not useful by `isUsefulChunkText` is
 * skipped.
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualInternal
) {
  const content = record.content
  const chunks: ChunkInternal[] = []

  for (const section of content?.sections || []) {
    const text = cleanChunkText(section.text || "")
    if (isUsefulChunkText(text)) {
      chunks.push(
        makeChunk({
          manual,
          pageNumber: section.pageNumber ?? null,
          sectionTitle: cleanSectionTitle(section.title),
          text,
          sourceConfidence: 0.92,
        })
      )
    }
  }

  for (const entry of content?.troubleshooting || []) {
    const problem = cleanChunkText(entry.problem || "")
    const solution = cleanChunkText(entry.solution || "")
    const labelled: string[] = []
    if (problem) {
      labelled.push(`Problem: ${problem}`)
    }
    if (solution) {
      labelled.push(`Likely cause or solution: ${solution}`)
    }
    const text = cleanChunkText(labelled.join("\n"))
    if (isUsefulChunkText(text)) {
      chunks.push(
        makeChunk({
          manual,
          pageNumber: entry.pageNumber ?? null,
          sectionTitle: problem
            ? `Troubleshooting: ${problem}`
            : "Troubleshooting",
          text,
          sourceConfidence: 0.95,
        })
      )
    }
  }

  for (const partChunk of buildPartChunks(
    content?.partsDatabase || [],
    manual,
    0.8
  )) {
    chunks.push(partChunk)
  }
  return chunks
}
/**
 * Turns an extracted-content record into chunks for `manual`: one per useful
 * page of text (confidence 0.72), followed by chunks for its parts lists.
 */
function buildExtractedFallbackChunks(
  record: ExtractedManualRecord,
  manual: ManualInternal
) {
  const chunks: ChunkInternal[] = []

  for (const page of record.text?.pages || []) {
    const text = cleanChunkText(page.text || "")
    if (isUsefulChunkText(text)) {
      chunks.push(
        makeChunk({
          manual,
          pageNumber: page.pageNumber ?? null,
          sectionTitle: page.pageNumber
            ? `Page ${page.pageNumber}`
            : "Manual page",
          text,
          sourceConfidence: 0.72,
        })
      )
    }
  }

  for (const partChunk of buildExtractedPartListChunks(
    record.partsLists || [],
    manual
  )) {
    chunks.push(partChunk)
  }
  return chunks
}
/**
 * Groups parts by page and builds one "Parts reference" chunk per page.
 *
 * Parts whose number is shorter than 2 characters and whose description is
 * shorter than 6 characters are ignored. At most eight parts are kept per
 * page. Parts without a page number are grouped together and produce a chunk
 * with a null page number.
 *
 * @param parts - Parts to summarize.
 * @param manual - Manual the chunks belong to.
 * @param sourceConfidence - Confidence assigned to every produced chunk.
 */
function buildPartChunks(
  parts: StructuredPart[],
  manual: ManualInternal,
  sourceConfidence: number
) {
  const summariesByPage = new Map<number, string[]>()

  for (const part of parts) {
    const partNumber = cleanChunkText(part.partNumber || "")
    const description = cleanChunkText(part.description || "")
    if (partNumber.length < 2 && description.length < 6) {
      continue
    }

    // Avoid the malformed "Part : description" label when there is no part number.
    let summary: string
    if (!description) {
      summary = `Part ${partNumber}`
    } else if (partNumber) {
      summary = `Part ${partNumber}: ${description}`
    } else {
      summary = `Part: ${description}`
    }

    // Page 0 is the bucket for parts without a page number.
    const pageNumber = part.pageNumber ?? 0
    const bucket = summariesByPage.get(pageNumber) || []
    if (bucket.length < 8) {
      bucket.push(summary)
      summariesByPage.set(pageNumber, bucket)
    }
  }

  return Array.from(summariesByPage.entries()).map(([pageNumber, summaries]) =>
    makeChunk({
      manual,
      pageNumber: pageNumber || null,
      sectionTitle: "Parts reference",
      text: summaries.join("\n"),
      sourceConfidence,
    })
  )
}
/**
 * Flattens extracted parts lists into a single list of parts and builds parts
 * chunks from it (confidence 0.76). A part without its own page number
 * inherits the page number of the list it came from.
 */
function buildExtractedPartListChunks(
  partLists: ExtractedPartList[],
  manual: ManualInternal
) {
  const allParts = partLists.flatMap((list) =>
    (list.parts || []).map(
      (part): StructuredPart => ({
        ...part,
        pageNumber: part.pageNumber ?? list.pageNumber,
      })
    )
  )
  return buildPartChunks(allParts, manual, 0.76)
}
/**
 * Appends `chunks` to `target` and to the per-manual lists in
 * `chunksByManualId`, creating a list for a manual id on first use.
 * Both `target` and `chunksByManualId` are modified in place.
 */
function addChunks(
  target: ChunkInternal[],
  chunksByManualId: Map<string, ChunkInternal[]>,
  chunks: ChunkInternal[]
) {
  chunks.forEach((chunk) => {
    target.push(chunk)
    const bucket = chunksByManualId.get(chunk.manualId)
    if (bucket) {
      bucket.push(chunk)
    } else {
      chunksByManualId.set(chunk.manualId, [chunk])
    }
  })
}
/**
 * Returns the manual a record should be attached to, creating one if needed.
 *
 * Resolution order: the already matched manual, then an existing manual whose
 * id equals the normalized fallback id, then a newly created fallback manual.
 * A newly created manual is added to both `manuals` and `manualById`.
 */
function ensureManualEntry(args: {
  manualById: Map<string, ManualInternal>
  manuals: ManualInternal[]
  matchedManual: ManualInternal | null
  fallbackManualId: string
  filename: string
  manufacturer: string
  category: string
  model: string | null
  manualType: string
  manualUrl: string | null
  thumbnailUrl: string | null
}) {
  const { matchedManual, manualById, manuals } = args
  if (matchedManual) {
    return matchedManual
  }

  const manualId = normalizeIdentifier(args.fallbackManualId)
  const known = manualById.get(manualId)
  if (known) {
    return known
  }

  // The same descriptive fields feed both the search text and the token set.
  const searchable = [
    args.filename,
    args.manufacturer,
    args.category,
    args.model,
    args.manualType,
  ]
    .filter(Boolean)
    .join(" ")

  const created: ManualInternal = {
    manualId,
    filename: args.filename,
    manufacturer: args.manufacturer,
    category: args.category,
    model: args.model,
    manualType: args.manualType,
    manualUrl: args.manualUrl,
    thumbnailUrl: args.thumbnailUrl,
    searchText: normalizeText(searchable),
    tokenSet: new Set(tokenize(searchable)),
    normalizedManufacturer: canonicalManufacturer(args.manufacturer),
  }
  manuals.push(created)
  manualById.set(created.manualId, created)
  return created
}
/**
 * Reports whether `alias` occurs in `text` as a word or word prefix: it must
 * start at the beginning of a word and be followed by the end of the text, a
 * space, or a digit (so "dn" matches "dn 501" and "dn501e" but not "dnx").
 * Both arguments are expected to be normalized already.
 */
function queryMentionsAlias(text: string, alias: string): boolean {
  if (!alias) {
    return false
  }
  let start = text.indexOf(alias)
  while (start !== -1) {
    const startsWord = start === 0 || text[start - 1] === " "
    const following = text[start + alias.length]
    const endsWord =
      following === undefined ||
      following === " " ||
      (following >= "0" && following <= "9")
    if (startsWord && endsWord) {
      return true
    }
    start = text.indexOf(alias, start + 1)
  }
  return false
}

/**
 * Normalizes a query once so it can be scored against many manuals and chunks.
 *
 * Manufacturer filters are collected from aliases mentioned in the query and
 * from `filters.manufacturer`. Aliases are matched on word boundaries rather
 * than as raw substrings, so short aliases such as "ap" or "national" are not
 * triggered by words like "trapped" or "international".
 *
 * @param query - Free-text query.
 * @param filters - Optional manufacturer, model and manual id hints.
 */
function prepareQuery(query: string, filters?: ManualKnowledgeFilters): PreparedQuery {
  const normalized = normalizeText(query)
  const tokenSet = new Set(tokenize(normalized))
  const modelTokens = extractModelTokens(normalized)

  const manufacturerFilters = new Set<string>()
  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    if (aliases.some((alias) => queryMentionsAlias(normalized, normalizeText(alias)))) {
      manufacturerFilters.add(canonical)
    }
  }
  if (filters?.manufacturer) {
    manufacturerFilters.add(canonicalManufacturer(filters.manufacturer))
  }

  if (filters?.model) {
    tokenSet.add(normalizeText(filters.model))
  }

  return {
    normalized,
    tokens: Array.from(tokenSet),
    modelTokens,
    manufacturerFilters: Array.from(manufacturerFilters),
    manualIdFilter: filters?.manualId
      ? normalizeIdentifier(filters.manualId)
      : null,
  }
}
/**
 * Scores how well a manual matches a prepared query; higher is better and the
 * result may be negative. An empty query always scores 0.
 *
 * Contributions: manual id filter (+28 match, -12 mismatch), manufacturer
 * filter (+14 / -6), whole query found in the search text (+24), each query
 * token that is an exact manual token (+8 for model tokens, +4 otherwise) or,
 * for tokens of 4+ characters, a substring of the search text (+1.5), each
 * model token found in the search text (+8), and +6 when the query describes a
 * fault or payment issue and the manual is a repair, service or parts manual.
 */
function scoreManual(manual: ManualInternal, query: PreparedQuery) {
  const { normalized, tokens, modelTokens, manufacturerFilters, manualIdFilter } =
    query
  if (!normalized) {
    return 0
  }

  let score = 0

  if (manualIdFilter) {
    score += manualIdFilter === manual.manualId ? 28 : -12
  }

  if (manufacturerFilters.length > 0) {
    const sameMaker = manufacturerFilters.includes(manual.normalizedManufacturer)
    score += sameMaker ? 14 : -6
  }

  if (manual.searchText.includes(normalized)) {
    score += 24
  }

  for (const token of tokens) {
    if (!token) {
      continue
    }
    if (manual.tokenSet.has(token)) {
      score += modelTokens.includes(token) ? 8 : 4
    } else if (token.length >= 4 && manual.searchText.includes(token)) {
      score += 1.5
    }
  }

  for (const modelToken of modelTokens) {
    if (manual.searchText.includes(modelToken)) {
      score += 8
    }
  }

  const describesFault = ["error", "not ", "coin", "bill"].some((marker) =>
    normalized.includes(marker)
  )
  const isServiceStyleManual = ["repair", "service", "parts"].some((kind) =>
    manual.manualType.includes(kind)
  )
  if (describesFault && isServiceStyleManual) {
    score += 6
  }

  return score
}
/**
 * Scores how well a chunk matches a prepared query. An empty query returns
 * `manualScore` unchanged.
 *
 * Starts from `manualScore` plus eight times the chunk's source confidence,
 * then adds: manufacturer filter (+6 match, -4 mismatch), whole query found in
 * the chunk (+18), each query token that is an exact chunk token (+7 for model
 * tokens, +3 otherwise) or, for tokens of 5+ characters, a substring of the
 * chunk (+1), +6 for parts/payment queries on a section titled with "parts",
 * and +5 for fault queries on a section titled with "troubleshooting".
 *
 * @param manualScore - Score inherited from the chunk's manual.
 */
function scoreChunk(
  chunk: ChunkInternal,
  query: PreparedQuery,
  manualScore: number
) {
  const queryText = query.normalized
  if (!queryText) {
    return manualScore
  }

  let score = manualScore + chunk.sourceConfidence * 8

  if (query.manufacturerFilters.length > 0) {
    const chunkMaker = canonicalManufacturer(chunk.manufacturer)
    score += query.manufacturerFilters.includes(chunkMaker) ? 6 : -4
  }

  if (chunk.searchText.includes(queryText)) {
    score += 18
  }

  for (const token of query.tokens) {
    if (!token) {
      continue
    }
    if (chunk.tokenSet.has(token)) {
      score += query.modelTokens.includes(token) ? 7 : 3
    } else if (token.length >= 5 && chunk.searchText.includes(token)) {
      score += 1
    }
  }

  const sectionTitle = (chunk.sectionTitle || "").toLowerCase()

  const asksAboutParts = ["parts", "part", "bill", "coin"].some((marker) =>
    queryText.includes(marker)
  )
  if (asksAboutParts && sectionTitle.includes("parts")) {
    score += 6
  }

  const describesFault = ["error", "not ", "won t", "wont"].some((marker) =>
    queryText.includes(marker)
  )
  if (describesFault && sectionTitle.includes("troubleshooting")) {
    score += 5
  }

  return score
}
/**
 * Builds the public candidate view of a manual for a given score.
 * Confidence is the score divided by 38, clamped to 0..1.
 */
function buildCandidate(manual: ManualInternal, score: number): ManualCandidate {
  const { manualId, filename, manufacturer, category, manualUrl, thumbnailUrl } =
    manual
  const confidence = clamp(score / 38)
  return {
    manualId,
    filename,
    manufacturer,
    category,
    manualUrl,
    thumbnailUrl,
    score,
    confidence,
  }
}
/**
 * Creates an internal chunk for `manual`, copying the manual's metadata and
 * precomputing the search text, token set and citation. `matchScore` starts at 0.
 */
function makeChunk(args: {
  manual: ManualInternal
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceConfidence: number
}) {
  const { manual, pageNumber, sectionTitle, text, sourceConfidence } = args

  const searchableParts = [
    manual.filename,
    manual.manufacturer,
    manual.model,
    sectionTitle,
    text,
  ]
  const searchText = normalizeText(searchableParts.filter(Boolean).join(" "))
  const citation = buildCitation(manual.filename, manual.manufacturer, pageNumber)

  return {
    manualId: manual.manualId,
    filename: manual.filename,
    manufacturer: manual.manufacturer,
    model: manual.model,
    manualType: manual.manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    sourceConfidence,
    matchScore: 0,
    citation,
    searchText,
    tokenSet: new Set(tokenize(searchText)),
  } satisfies ChunkInternal
}
/**
 * Copies the public fields of an internal chunk into a new object, leaving out
 * the internal `searchText` and `tokenSet`.
 */
function stripChunkForResponse(chunk: ChunkInternal): ManualKnowledgeChunk {
  const {
    manualId,
    filename,
    manufacturer,
    model,
    manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl,
    thumbnailUrl,
    sourceConfidence,
    matchScore,
    citation,
  } = chunk
  return {
    manualId,
    filename,
    manufacturer,
    model,
    manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl,
    thumbnailUrl,
    sourceConfidence,
    matchScore,
    citation,
  }
}
/**
 * Tidies raw text: removes soft hyphens, collapses every run of whitespace to
 * a single space, and trims both ends.
 */
function cleanChunkText(value: string) {
  const withoutSoftHyphens = value.replace(/\u00ad/g, "")
  const singleSpaced = withoutSoftHyphens.replace(/\s+/g, " ")
  return singleSpaced.trim()
}
/** Cleans a section title with `cleanChunkText`; returns null when nothing is left. */
function cleanSectionTitle(value?: string) {
  const title = cleanChunkText(value || "")
  return title ? title : null
}
/**
 * Decides whether cleaned text is worth indexing as a chunk.
 *
 * Rejects empty text and text that is only a "table of contents" or
 * "contact us" heading. Text shorter than 70 characters is kept only when it
 * mentions a fault or component keyword. Longer text is kept when it has at
 * least 25 letters.
 */
function isUsefulChunkText(text: string) {
  if (!text) {
    return false
  }
  // Checked first: these headings are shorter than 70 characters, so placing
  // the check after the short-text branch would make it unreachable.
  if (/^(table of contents|contact us)$/i.test(text)) {
    return false
  }
  if (text.length < 70) {
    return /error|vend|coin|bill|cool|stuck|motor|sensor|jam|door|part/i.test(
      text
    )
  }
  const letterCount = text.replace(/[^a-z]/gi, "").length
  return letterCount >= 25
}
/**
 * Normalizes text for matching: lower-case ASCII letters and digits separated
 * by single spaces, with no leading or trailing space.
 *
 * Accented letters are reduced to their base letter. Compatibility
 * decomposition (NFKD) splits them into a base letter plus combining marks,
 * and the marks are removed before other non-alphanumerics are turned into
 * spaces; otherwise an accent inside a word would split the word in two.
 */
function normalizeText(value: string) {
  return value
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/[^a-zA-Z0-9]+/g, " ")
    .toLowerCase()
    .trim()
}
/**
 * Splits text into search tokens: normalized words longer than one character
 * that are neither stopwords nor of the form "page" followed by optional digits.
 */
function tokenize(value: string) {
  const tokens: string[] = []
  for (const piece of normalizeText(value).split(" ")) {
    const token = piece.trim()
    if (token.length <= 1) {
      continue
    }
    if (STOPWORDS.has(token) || /^page\d*$/.test(token)) {
      continue
    }
    tokens.push(token)
  }
  return tokens
}
/**
 * Builds a stable identifier from a file name or id: the extension is removed,
 * the rest is normalized, and spaces become hyphens.
 */
function normalizeIdentifier(value: string) {
  const stem = stripExtension(value)
  const words = normalizeText(stem)
  return words.replace(/\s+/g, "-")
}
/**
 * Removes a trailing `.pdf` and then a trailing `.json` (both case-insensitive).
 * Other extensions are left untouched.
 */
function stripExtension(value: string) {
  const withoutPdf = value.replace(/\.pdf$/i, "")
  return withoutPdf.replace(/\.json$/i, "")
}
/**
 * Reports whether `alias` occurs in `name` as a word or word prefix: it must
 * start at the beginning of a word and be followed by the end of the text, a
 * space, or a digit. Both arguments are expected to be normalized already.
 */
function manufacturerNameHasAlias(name: string, alias: string): boolean {
  if (!alias) {
    return false
  }
  let start = name.indexOf(alias)
  while (start !== -1) {
    const startsWord = start === 0 || name[start - 1] === " "
    const following = name[start + alias.length]
    const endsWord =
      following === undefined ||
      following === " " ||
      (following >= "0" && following <= "9")
    if (startsWord && endsWord) {
      return true
    }
    start = name.indexOf(alias, start + 1)
  }
  return false
}

/**
 * Maps a manufacturer name to its canonical form.
 *
 * Returns the first MANUFACTURER_ALIASES key that equals the normalized name
 * or has an alias appearing in it. Aliases are matched on word boundaries
 * rather than as raw substrings, so a short alias such as "national" does not
 * claim a name like "International". Names that match nothing are returned
 * normalized.
 */
function canonicalManufacturer(value: string) {
  const normalized = normalizeText(value)
  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    if (canonical === normalized) {
      return canonical
    }
    if (
      aliases.some((alias) =>
        manufacturerNameHasAlias(normalized, normalizeText(alias))
      )
    ) {
      return canonical
    }
  }
  return normalized
}
/**
 * Returns the known aliases for a manufacturer, or a one-element list holding
 * the name as given when the manufacturer has no entry in MANUFACTURER_ALIASES.
 */
function aliasesForManufacturer(value: string) {
  const knownAliases = MANUFACTURER_ALIASES[canonicalManufacturer(value)]
  return knownAliases ? knownAliases : [value]
}
/**
 * Guesses a model from a manual's file name: the first model-like token found
 * in the name (without extension), or null when there is none.
 */
function guessModelFromManual(manual: Manual) {
  const stem = stripExtension(manual.filename || "")
  const [firstModelToken] = extractModelTokens(normalizeText(stem))
  return firstModelToken || null
}
/**
 * Maps a free-form manual type or category to a short type label.
 * Empty input gives "manual". Text containing "part", "operator" or "service"
 * (checked in that order) gives "parts", "operator" or "service". Anything
 * else is returned normalized with hyphens instead of spaces.
 */
function normalizeManualType(value?: string | null) {
  const normalized = normalizeText(value || "")
  if (!normalized) {
    return "manual"
  }

  const knownTypes: Array<[needle: string, label: string]> = [
    ["part", "parts"],
    ["operator", "operator"],
    ["service", "service"],
  ]
  for (const [needle, label] of knownTypes) {
    if (normalized.includes(needle)) {
      return label
    }
  }

  return normalized.replace(/\s+/g, "-")
}
/** Cleans an optional string with `cleanChunkText`; returns null when nothing is left. */
function normalizeNullable(value?: string | null) {
  const cleaned = cleanChunkText(value || "")
  return cleaned ? cleaned : null
}
/**
 * Extracts distinct model-like tokens from text, in order of first appearance.
 * A model-like token is a word made of optional letters, at least two digits,
 * and any further letters or digits (for example "501e" or "rvv500").
 */
function extractModelTokens(value: string) {
  const found = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9]*\b/g)
  if (!found) {
    return []
  }
  return Array.from(new Set(found))
}
/** Reports whether the text contains at least one model-like token. */
function looksLikeModelTokenQuery(value: string) {
  const modelTokens = extractModelTokens(value)
  return modelTokens.length > 0
}
/**
 * Reports whether a query mentions any of RISKY_MANUAL_KEYWORDS. Both the
 * query and each keyword are normalized, and keywords match as substrings.
 */
function isRiskyManualQuery(value: string) {
  const normalizedQuery = normalizeText(value)
  for (const keyword of RISKY_MANUAL_KEYWORDS) {
    if (normalizedQuery.includes(normalizeText(keyword))) {
      return true
    }
  }
  return false
}
/**
 * Builds a citation label: the manual label, followed by ", page N" when a
 * non-zero page number is known.
 */
function buildCitation(
  filename: string,
  manufacturer: string,
  pageNumber: number | null
) {
  const label = buildManualLabel(filename, manufacturer)
  if (!pageNumber) {
    return label
  }
  return `${label}, page ${pageNumber}`
}
/**
 * Builds a readable manual label: the manufacturer (when given) followed by
 * the file name without extension, with hyphens and underscores as spaces.
 */
function buildManualLabel(filename: string, manufacturer: string) {
  const stem = humanizeToken(stripExtension(filename))
  const label = manufacturer ? `${manufacturer} ${stem}` : stem
  return label.trim()
}
/**
 * Makes an identifier readable: runs of hyphens/underscores become a space,
 * whitespace runs collapse to one space, and both ends are trimmed.
 */
function humanizeToken(value: string) {
  const spaced = value.replace(/[-_]+/g, " ")
  const collapsed = spaced.replace(/\s+/g, " ")
  return collapsed.trim()
}
/** Sort comparator that orders candidates by descending score. */
function compareByScore(left: ManualCandidate, right: ManualCandidate) {
  const difference = right.score - left.score
  return difference
}
/** Limits a number to the range 0..1. */
function clamp(value: number) {
  const cappedAtOne = Math.min(1, value)
  return Math.max(0, cappedAtOne)
}
/**
 * Shortens text to at most `maxLength` characters.
 *
 * Text that already fits is returned unchanged. Longer text is cut to
 * `maxLength - 1` characters, stripped of trailing whitespace, and finished
 * with an ellipsis so readers can tell it was truncated. A non-positive
 * `maxLength` gives an empty string.
 */
function truncateText(value: string, maxLength: number) {
  if (value.length <= maxLength) {
    return value
  }
  if (maxLength <= 0) {
    return ""
  }
  // One character is reserved for the ellipsis.
  return `${value.slice(0, maxLength - 1).trimEnd()}\u2026`
}
/**
 * Reads a UTF-8 file and parses it as JSON.
 * The parsed value is returned as `T` without any runtime validation.
 * Rejects when the file cannot be read or is not valid JSON.
 */
async function readJsonFile<T>(path: string) {
  const raw = await readFile(path, "utf8")
  return JSON.parse(raw) as T
}