import { mkdir, readFile, readdir, writeFile } from "node:fs/promises"
import { basename, join } from "node:path"
import { scanManuals } from "@/lib/manuals"
import { getManualsDataRoot } from "@/lib/manuals-paths"
import type { Manual } from "@/lib/manuals-types"
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"

// Path segments, joined onto getManualsDataRoot() by the loaders in this file.
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]
const STRUCTURED_MANUALS_DIR = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "structured",
]
const STRUCTURED_MANUALS_INDEX_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "all_structured_data.json",
]
const EXTRACTED_CONTENT_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "extracted_content",
  "manuals_extracted_content.json",
]

// NOTE(review): not referenced in the visible part of the file; presumably
// consumed by the tokenizer defined further down — confirm.
const STOPWORDS = new Set([
  "a",
  "an",
  "and",
  "are",
  "at",
  "be",
  "by",
  "for",
  "from",
  "in",
  "is",
  "it",
  "manual",
  "machine",
  "of",
  "on",
  "or",
  "service",
  "the",
  "to",
  "with",
])

// Any of these in a chunk's text marks the chunk as risky (see createChunk).
const TECH_RISK_KEYWORDS = [
  "wiring",
  "diagram",
  "schematic",
  "electrical",
  "voltage",
  "jumper",
  "compressor",
  "refrigerant",
  "bypass",
  "board level",
]

// Keyword lists driving chunk labelling in deriveChunkLabels.
const MARKETING_KEYWORDS = [
  "increase sales",
  "more profits",
  "contact us",
  "operator can double up",
  "your employees",
  "productivity",
  "variety",
  "brochure",
]
const SPECS_KEYWORDS = [
  "dimensions",
  "height:",
  "width:",
  "depth:",
  "shipping weight",
  "electrical:",
  "listings:",
  "capacity",
  "voltage",
]
const TROUBLESHOOTING_KEYWORDS = [
  "probable cause",
  "solution",
  "troubleshooting",
  "not accepting",
  "will not vend",
  "check fuse",
  "error code",
]
const OPERATOR_KEYWORDS = [
  "user guide",
  "operators guide",
  "operation",
  "programming",
  "setup guide",
  "how to",
]
const PARTS_KEYWORDS = [
  "parts manual",
  "parts reference",
  "part number",
  "parts list",
  "exploded view",
]

// Display name -> lower-case aliases. Entry order is preserved from the
// original in case alias lookup (defined outside this view) is order-sensitive.
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  "AP": ["ap", "automatic products", "automatic-products", "snackshop"],
  "Other": ["other", "unknown", "bill mechs", "coin mechs"],
  "Coinco": ["coinco"],
  "Crane": [
    "crane",
    "national vendors",
    "national",
    "merchant",
    "merchant series",
  ],
  "Dixie-Narco": ["dixie", "narco", "dixie narco", "dixie-narco", "bevmax"],
  "GPL": ["gpl", "general products"],
  "MEI Mars": ["mei", "mars", "bill validator"],
  "Royal Vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  "Rowe": ["rowe"],
  "Seaga": ["seaga"],
  "USI": ["usi", "u select it", "u-select-it", "uselectit"],
  "Vendo": ["vendo", "sanden"],
}

/** Retrieval audience a chunk or manual may be served to. */
export type ManualsQdrantProfile = "public_safe" | "internal_tech"

/** Content labels assigned to a chunk by deriveChunkLabels. */
export type ManualsQdrantChunkLabel =
  | "brochure"
  | "flowchart"
  | "operator"
  | "parts"
  | "service"
  | "specs"
  | "toc"
  | "troubleshooting"
  | "wiring"
  | "general"

/** Embedding decision for a chunk or manual. */
export type ManualsEmbeddingTier =
  | "high_confidence"
  | "fallback"
  | "exclude"

/** Per-manual summary produced by finalizeManual. */
export type ManualsQdrantManual = {
  manualId: string
  title: string
  manufacturer: string
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null
  manualUrl: string | null
  thumbnailUrl: string | null
  sourceFilenames: string[]
  sourceRecordCount: number
  metadataConfidence: number
  parseQuality: number
  duplicateRisk: number
  chunkCount: number
  highConfidenceChunkCount: number
  profiles: ManualsQdrantProfile[]
  embeddingTier: ManualsEmbeddingTier
  flags: string[]
}

/** One retrievable unit of manual text plus its scoring metadata. */
export type ManualsQdrantChunk = {
  chunkId: string
  manualId: string
  title: string
  manufacturer: string
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceFilename: string | null
  sourceKind:
    | "ocr_page"
    | "parts_database"
    | "structured_section"
    | "troubleshooting"
  labels: ManualsQdrantChunkLabel[]
  manualUrl: string | null
  thumbnailUrl: string | null
  metadataConfidence: number
  textQuality: number
  overallQuality: number
  embeddingTier: ManualsEmbeddingTier
  profiles: ManualsQdrantProfile[]
  isRisky: boolean
  flags: string[]
}

/** Aggregate counts computed by buildCorpusStats. */
export type ManualsQdrantCorpusStats = {
  catalogManuals: number
  structuredRecords: number
  extractedRecords: number
  normalizedManuals: number
  chunkCount: number
  highConfidenceChunks: number
  fallbackChunks: number
  excludedChunks: number
  manualsByManufacturer: Record<string, number>
  chunksByLabel: Record<string, number>
  profileCounts: Record<ManualsQdrantProfile, number>
}

/** Full build output: stats, manual summaries and all chunks. */
export type ManualsQdrantCorpus = {
  generatedAt: string
  stats: ManualsQdrantCorpusStats
  manuals: ManualsQdrantManual[]
  chunks: ManualsQdrantChunk[]
}

/** A query plus the expectations checked by evaluateManualsQdrantCorpus. */
export type ManualsQdrantEvaluationCase = {
  id: string
  query: string
  profile: ManualsQdrantProfile
  expectedManufacturer?: string
  expectedChunkLabels?: ManualsQdrantChunkLabel[]
  disallowedChunkLabels?: ManualsQdrantChunkLabel[]
}

export type ManualsQdrantSearchResult = {
  chunk: ManualsQdrantChunk
  score: number
}

export type ManualsQdrantEvaluationResult = {
  cases: Array<{
    id: string
    query: string
    profile: ManualsQdrantProfile
    // null when the case declares no expected manufacturer.
    passedTop3Manufacturer: boolean | null
    passedTop5Label: boolean
    passedDisallowedCheck: boolean
    topManufacturers: string[]
    topLabels: ManualsQdrantChunkLabel[]
  }>
  summary: {
    totalCases: number
    top3ManufacturerPasses: number
    labelPasses: number
    disallowedPasses: number
  }
}

// Shapes of the JSON inputs. Every field is optional because the files are
// read without validation.
type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}

type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}

type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}

type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  sourceFilename?: string
  metadata?: {
    pageCount?: number
  }
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
    // Values are only ever stringified (see buildStructuredChunks).
    specifications?: Record<string, unknown>
  }
}

type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}

type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}

type ExtractedManualRecord = {
  filename?: string
  filepath?: string
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
  partsLists?: ExtractedPartList[]
  sections?: StructuredSection[]
}

type CatalogManual = {
  manual: Manual
  manufacturerCanonical: string
  modelGuess: string | null
  searchText: string
tokenSet: Set<string>
}

// Mutable per-manual state while records are merged; finalizeManual turns it
// into a ManualsQdrantManual by computing the omitted fields.
type ManualAccumulator = Omit<
  ManualsQdrantManual,
  | "metadataConfidence"
  | "parseQuality"
  | "duplicateRisk"
  | "chunkCount"
  | "highConfidenceChunkCount"
  | "profiles"
  | "embeddingTier"
  | "flags"
> & {
  metadataConfidenceTotal: number
  metadataConfidenceSamples: number
  hasStructured: boolean
  hasTroubleshooting: boolean
  hasOcrText: boolean
  chunks: ManualsQdrantChunk[]
  flagsSet: Set<string>
}

const DEFAULT_EVAL_CASES: ManualsQdrantEvaluationCase[] = [
  {
    id: "rvv-660-service",
    query: "RVV 660 service manual",
    profile: "internal_tech",
    expectedManufacturer: "Royal Vendors",
  },
  {
    id: "narco-bevmax-cooling",
    query: "Narco bevmax not cooling",
    profile: "public_safe",
    expectedManufacturer: "Dixie-Narco",
    expectedChunkLabels: ["service", "troubleshooting"],
  },
  {
    id: "coin-mech-dollars",
    query: "coin mech not accepting dollars",
    profile: "public_safe",
    expectedChunkLabels: ["troubleshooting", "parts"],
    disallowedChunkLabels: ["brochure"],
  },
  {
    id: "royal-coins",
    query: "Royal machine not accepting coins",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    expectedChunkLabels: ["troubleshooting"],
  },
  {
    id: "wiring-risky",
    query: "Royal wiring diagram voltage issue",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    disallowedChunkLabels: ["wiring"],
  },
  {
    id: "ambiguous-bad-query",
    query: "manual for strange mystery vendor",
    profile: "public_safe",
    disallowedChunkLabels: ["brochure"],
  },
]

/** Returns the built-in evaluation cases (the shared array, not a copy). */
export function getDefaultManualsQdrantEvaluationCases() {
  return DEFAULT_EVAL_CASES
}

let manualsQdrantCorpusPromise: Promise<ManualsQdrantCorpus> | null = null

/**
 * Returns the memoized corpus build, starting one if none is cached.
 *
 * A failed build is evicted from the cache so the next call retries instead
 * of replaying the same rejection. The caller of the failing build still
 * receives the rejection.
 */
export function getManualsQdrantCorpus() {
  if (manualsQdrantCorpusPromise) {
    return manualsQdrantCorpusPromise
  }

  const pending = buildManualsQdrantCorpus()
  manualsQdrantCorpusPromise = pending
  void pending.catch(() => {
    // Only evict our own entry; a reset + rebuild may have replaced it.
    if (manualsQdrantCorpusPromise === pending) {
      manualsQdrantCorpusPromise = null
    }
  })
  return pending
}

/** Drops the memoized corpus so the next getManualsQdrantCorpus() rebuilds. */
export function resetManualsQdrantCorpusCache() {
  manualsQdrantCorpusPromise = null
}

/**
 * Builds the corpus from the catalog, the structured records and the
 * extracted (OCR) records.
 *
 * Structured records are processed first. An extracted record whose manual
 * already has structured content contributes only its `hasOcrText` flag.
 * Manuals are sorted by `manualId` and chunks by `chunkId`.
 */
export async function buildManualsQdrantCorpus(): Promise<ManualsQdrantCorpus> {
  // The three inputs are independent, so load them concurrently.
  const [catalogManuals, structuredRecords, extractedRecords] =
    await Promise.all([
      loadCatalogManuals(),
      loadStructuredRecords(),
      loadExtractedRecords(),
    ])
  const manuals = new Map<string, ManualAccumulator>()
  const chunkDedup = new Set<string>()

  for (const record of structuredRecords) {
    const catalogMatch = matchCatalogManual(
      [record.manualId, record.manufacturer, record.model]
        .filter(Boolean)
        .join(" "),
      catalogManuals,
      {
        manufacturerHint: record.manufacturer || null,
        modelHint: record.model || null,
      }
    )
    const filenameHint =
      catalogMatch?.manual.filename ||
      record.sourceFilename ||
      `${record.manualId || "manual"}.pdf`
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename: filenameHint,
      recordManufacturer: record.manufacturer || null,
      recordModel: record.model || null,
      manualTypeHint: detectManualType(
        `${record.manualType || ""} ${record.manualId || ""}`
      ),
      categoryHint: catalogMatch?.manual.category || null,
      metadataConfidence: catalogMatch ? 0.86 : 0.32,
      sourceRecordId: record.sourceFilename || record.manualId || filenameHint,
    })

    manual.hasStructured = true
    manual.hasTroubleshooting ||= Boolean(record.content?.troubleshooting?.length)

    for (const chunk of buildStructuredChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }

  for (const record of extractedRecords) {
    const filename =
      record.filename || basename(record.filepath || "manual.pdf")
    const catalogMatch = matchCatalogManual(filename, catalogManuals)
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename,
      recordManufacturer: null,
      recordModel: null,
      manualTypeHint: detectManualType(filename),
      categoryHint: catalogMatch?.manual.category || null,
      // Highest confidence when the catalog filename matches exactly.
      metadataConfidence:
        catalogMatch &&
        normalizeIdentifier(catalogMatch.manual.filename) ===
          normalizeIdentifier(filename)
          ? 0.96
          : catalogMatch
            ? 0.78
            : 0.36,
      sourceRecordId: record.filename || record.filepath || "unknown-extracted",
    })

    manual.hasOcrText ||= hasUsefulOcrText(record)

    // Prefer structured/manual chunks where they exist and use OCR pages only
    // as a fallback corpus for manuals we could not parse structurally.
    if (manual.hasStructured) {
      continue
    }

    for (const chunk of buildExtractedChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }

  const finalizedManuals = Array.from(manuals.values())
    .map(finalizeManual)
    .sort((left, right) => left.manualId.localeCompare(right.manualId))
  const finalizedChunks = finalizedManuals
    .flatMap((manual) => manual.chunks)
    .sort((left, right) => left.chunkId.localeCompare(right.chunkId))
  const stats = buildCorpusStats({
    catalogManuals,
    structuredRecords,
    extractedRecords,
    manuals: finalizedManuals,
    chunks: finalizedChunks,
  })

  return {
    generatedAt: new Date().toISOString(),
    stats,
    // finalizeManual also returns each manual's chunks; drop them so the
    // summaries match ManualsQdrantManual and chunks are not duplicated.
    manuals: finalizedManuals.map(
      ({ chunks: _chunks, ...summary }) => summary
    ),
    chunks: finalizedChunks,
  }
}

/**
 * Keyword search over the corpus.
 *
 * Considers only chunks that are not excluded, belong to the requested
 * profile and have non-blank text, and returns the best positive-scoring ones.
 *
 * @param options.limit Maximum number of results (default 5).
 * @param options.profile Profile to search (default "internal_tech").
 */
export function searchManualsQdrantCorpus(
  corpus: ManualsQdrantCorpus,
  query: string,
  options?: {
    limit?: number
    profile?: ManualsQdrantProfile
  }
): ManualsQdrantSearchResult[] {
  const limit = options?.limit ?? 5
  const profile = options?.profile ?? "internal_tech"
  const normalizedQuery = normalizeText(query)
  const queryTokens = tokenize(normalizedQuery)
  const queryLower = normalizedQuery.toLowerCase()

  return corpus.chunks
    .filter((chunk) => {
      return (
        chunk.embeddingTier !== "exclude" &&
        chunk.profiles.includes(profile) &&
        chunk.text.trim().length > 0
      )
    })
    .map((chunk) => ({
      chunk,
      score: scoreChunkForQuery(chunk, queryTokens, queryLower),
    }))
    .filter((entry) => entry.score > 0)
    .sort((left, right) => right.score - left.score)
    .slice(0, limit)
}

/**
 * Runs each case through searchManualsQdrantCorpus (top 5 results) and checks
 * its manufacturer / label expectations.
 */
export function evaluateManualsQdrantCorpus(
  corpus: ManualsQdrantCorpus,
  cases: ManualsQdrantEvaluationCase[] = DEFAULT_EVAL_CASES
): ManualsQdrantEvaluationResult {
  const results = cases.map((evaluationCase) => {
    const topResults = searchManualsQdrantCorpus(corpus, evaluationCase.query, {
      limit: 5,
      profile: evaluationCase.profile,
    })
    const topManufacturers = Array.from(
      new Set(topResults.map((result) => result.chunk.manufacturer))
    )
    const topLabels = Array.from(
      new Set(topResults.flatMap((result) => result.chunk.labels))
    )

    return {
      id:
evaluationCase.id,
      query: evaluationCase.query,
      profile: evaluationCase.profile,
      // Checked against the first three distinct manufacturers of the top 5.
      passedTop3Manufacturer: evaluationCase.expectedManufacturer
        ? topManufacturers
            .slice(0, 3)
            .includes(evaluationCase.expectedManufacturer)
        : null,
      passedTop5Label: evaluationCase.expectedChunkLabels
        ? evaluationCase.expectedChunkLabels.some((label) =>
            topLabels.includes(label)
          )
        : true,
      passedDisallowedCheck: evaluationCase.disallowedChunkLabels
        ? !topLabels.some((label) =>
            evaluationCase.disallowedChunkLabels?.includes(label)
          )
        : true,
      topManufacturers,
      topLabels,
    }
  })

  return {
    cases: results,
    summary: {
      totalCases: results.length,
      // `!== false` also counts cases with no expected manufacturer (null).
      top3ManufacturerPasses: results.filter(
        (result) => result.passedTop3Manufacturer !== false
      ).length,
      labelPasses: results.filter((result) => result.passedTop5Label).length,
      disallowedPasses: results.filter(
        (result) => result.passedDisallowedCheck
      ).length,
    },
  }
}

/**
 * Builds a fresh corpus (bypassing the memoized one), evaluates it with the
 * default cases and writes the JSON artifacts.
 *
 * @param args.outputDir Target directory; defaults to
 *   `<cwd>/output/manuals-qdrant`. Created if missing.
 * @returns The output directory together with the corpus and evaluation.
 */
export async function writeManualsQdrantArtifacts(args?: {
  outputDir?: string
}) {
  const outputDir =
    args?.outputDir || join(process.cwd(), "output", "manuals-qdrant")
  const corpus = await buildManualsQdrantCorpus()
  const evaluation = evaluateManualsQdrantCorpus(corpus)
  const internalTechChunks = corpus.chunks.filter((chunk) =>
    chunk.profiles.includes("internal_tech")
  )
  const publicSafeChunks = corpus.chunks.filter((chunk) =>
    chunk.profiles.includes("public_safe")
  )
  const highConfidenceChunks = corpus.chunks.filter(
    (chunk) => chunk.embeddingTier === "high_confidence"
  )

  // File name -> payload, in the order the files are written.
  const artifacts: Array<[string, unknown]> = [
    [
      "summary.json",
      {
        generatedAt: corpus.generatedAt,
        stats: corpus.stats,
        evaluation: evaluation.summary,
      },
    ],
    ["manuals.json", corpus.manuals],
    ["chunks.json", corpus.chunks],
    ["chunks-internal-tech.json", internalTechChunks],
    ["chunks-public-safe.json", publicSafeChunks],
    ["chunks-high-confidence.json", highConfidenceChunks],
    ["evaluation-cases.json", DEFAULT_EVAL_CASES],
    ["evaluation-report.json", evaluation],
  ]

  await mkdir(outputDir, { recursive: true })
  // Written one at a time so only one serialized payload is held at once.
  for (const [filename, payload] of artifacts) {
    await writeFile(join(outputDir, filename), JSON.stringify(payload, null, 2))
  }

  return {
    outputDir,
    corpus,
    evaluation,
  }
}

/**
 * Scans the manuals catalog and precomputes, per manual, the canonical
 * manufacturer, a model guess, normalized search text and its token set.
 */
async function loadCatalogManuals() {
  const manuals = await scanManuals()
  const catalog = manuals.map((manual) => {
    const title = [
      manual.filename,
      manual.manufacturer,
      manual.category,
      ...(manual.searchTerms || []),
      ...(manual.commonNames || []),
    ]
      .filter(Boolean)
      .join(" ")

    return {
      manual,
      manufacturerCanonical: normalizeManufacturer(manual.manufacturer),
      modelGuess: extractModel(title),
      searchText: normalizeText(title),
      tokenSet: new Set(tokenize(title)),
    } satisfies CatalogManual
  })

  return catalog
}

/**
 * Loads the structured manual records.
 *
 * Reads every `*.json` file in the structured directory and the combined
 * index file. When the index is non-empty its records are returned, each
 * paired with a matching per-file name where one exists; otherwise the
 * per-file records are returned.
 */
async function loadStructuredRecords() {
  const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  const entries = await readdir(directory, { withFileTypes: true })
  const files = entries
    .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith(".json"))
    .map((entry) => entry.name)

  const records = await Promise.all(
    files.map(async (filename) => {
      const parsed = await readJsonFile<StructuredManualRecord>(
        join(directory, filename)
      )
      return {
        ...parsed,
        sourceFilename: filename,
        manualId: parsed.manualId || stripExtension(filename),
      }
    })
  )

  const indexRecords = await readJsonFile<StructuredManualRecord[]>(
    join(getManualsDataRoot(), ...STRUCTURED_MANUALS_INDEX_FILE)
  )

  if (indexRecords.length === 0) {
    return records
  }

  // Match key -> queue of per-file names sharing that key.
  const recordsByKey = new Map<string, string[]>()
  for (const record of records) {
    const key = getStructuredRecordMatchKey(record)
    const existing = recordsByKey.get(key) || []
    existing.push(record.sourceFilename || `${record.manualId || "structured"}.json`)
    recordsByKey.set(key, existing)
  }

  return indexRecords.map((record, index) => {
    const key =
getStructuredRecordMatchKey(record)
    const matchingFilenames = recordsByKey.get(key) || []

    return {
      ...record,
      // Each per-file name is consumed once; unmatched index records get a
      // synthetic name derived from the manual id or their position.
      sourceFilename:
        matchingFilenames.shift() ||
        `${normalizeIdentifier(record.manualId || `structured-record-${index + 1}`)}.json`,
    }
  })
}

/** Loads the extracted (OCR) manual records from the extracted-content file. */
async function loadExtractedRecords() {
  return await readJsonFile<ExtractedManualRecord[]>(
    join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  )
}

/**
 * Finds the catalog manual that best matches a free-form query.
 *
 * An exact match on the normalized, extension-less filename wins outright.
 * Otherwise each manual is scored from the manufacturer hint, the model hint,
 * a whole-query substring match and per-token matches.
 *
 * @returns The best-scoring manual, or null when the best score is below 10.
 *   Ties go to the earlier catalog entry.
 */
function matchCatalogManual(
  rawQuery: string,
  catalogManuals: CatalogManual[],
  hints?: {
    manufacturerHint?: string | null
    modelHint?: string | null
  }
) {
  const normalizedQuery = normalizeText(rawQuery)
  const tokens = tokenize(normalizedQuery)
  const manufacturerHint = hints?.manufacturerHint
    ? normalizeManufacturer(hints.manufacturerHint)
    : null
  const modelHint = hints?.modelHint
    ? normalizeIdentifier(hints.modelHint)
    : null

  // Query-derived values are the same for every catalog entry: compute once.
  const queryStem = normalizeIdentifier(stripExtension(rawQuery))
  const queryLower = normalizedQuery.toLowerCase()
  const modelHintSpaced = modelHint ? modelHint.replace(/-/g, " ") : null

  const exactStemMatch = catalogManuals.find(
    (catalogManual) =>
      normalizeIdentifier(stripExtension(catalogManual.manual.filename)) ===
      queryStem
  )
  if (exactStemMatch) {
    return exactStemMatch
  }

  let best: { catalogManual: CatalogManual; score: number } | null = null
  for (const catalogManual of catalogManuals) {
    let score = 0

    if (manufacturerHint) {
      score += catalogManual.manufacturerCanonical === manufacturerHint ? 16 : -4
    }

    if (modelHint && modelHintSpaced !== null) {
      if (catalogManual.modelGuess === modelHint) {
        score += 14
      } else if (catalogManual.searchText.includes(modelHintSpaced)) {
        score += 8
      }
    }

    if (normalizedQuery && catalogManual.searchText.includes(queryLower)) {
      score += 20
    }

    for (const token of tokens) {
      if (catalogManual.tokenSet.has(token)) {
        score += 4
      } else if (
        token.length >= 4 &&
        catalogManual.searchText.includes(token)
      ) {
        score += 1.5
      }
    }

    // Strict `>` keeps the first of equally scored entries.
    if (best === null || score > best.score) {
      best = { catalogManual, score }
    }
  }

  return best !== null && best.score >= 10 ? best.catalogManual : null
}

/**
 * Returns the accumulator for the manual a record belongs to, creating it on
 * first sight.
 *
 * The canonical manual id comes from the catalog match (if any), the
 * manufacturer, the model, the manual type and the filename. For an existing
 * accumulator the new source is merged in: filenames, record count,
 * confidence sample, and any missing category / model / manufacturer.
 */
function getOrCreateManualAccumulator(args: {
  manuals: Map<string, ManualAccumulator>
  catalogMatch: CatalogManual | null
  filename: string
  recordManufacturer: string | null
  recordModel: string | null
  manualTypeHint: string
  categoryHint: string | null
  metadataConfidence: number
  sourceRecordId: string
}) {
  const manual = args.catalogMatch?.manual
  const manufacturer = humanizeManufacturer(
    manual?.manufacturer || args.recordManufacturer || "Other"
  )
  const model =
    args.recordModel && !isPlaceholderValue(args.recordModel)
      ? sanitizeModel(args.recordModel)
      : args.catalogMatch?.modelGuess || extractModel(args.filename)
  const manualType = args.manualTypeHint || detectManualType(args.filename)
  const manualId = buildCanonicalManualId({
    catalogManual: manual || null,
    manufacturer,
    model,
    manualType,
    filename: args.filename,
  })
  // A source record id is tracked as a filename only when it is a .json name.
  const jsonSourceIds = args.sourceRecordId.toLowerCase().endsWith(".json")
    ? [args.sourceRecordId]
    : []

  const existing = args.manuals.get(manualId)
  if (existing) {
    existing.sourceFilenames = Array.from(
      new Set([...existing.sourceFilenames, args.filename, ...jsonSourceIds])
    )
    existing.sourceRecordCount += 1
    existing.metadataConfidenceTotal += args.metadataConfidence
    existing.metadataConfidenceSamples += 1

    if (args.categoryHint && !existing.category) {
      existing.category = args.categoryHint
    }
    if (args.recordModel && !existing.model && !isPlaceholderValue(args.recordModel)) {
      existing.model = sanitizeModel(args.recordModel)
    }
    if (args.recordManufacturer && existing.manufacturer === "Other") {
      existing.manufacturer = humanizeManufacturer(args.recordManufacturer)
      existing.manufacturerCanonical = normalizeManufacturer(existing.manufacturer)
    }

    existing.flagsSet.add(
      args.sourceRecordId === manualId
        ? "merged-duplicate-source"
        : "merged-source"
    )
    return existing
  }

  const created: ManualAccumulator = {
    manualId,
    title: humanizeTitle(stripExtension(manual?.filename || args.filename)),
    manufacturer,
    manufacturerCanonical: normalizeManufacturer(manufacturer),
    model: model || null,
    manualType,
    category: args.categoryHint,
    manualUrl: manual ? getManualUrl(manual) : null,
    thumbnailUrl: manual ? getThumbnailUrl(manual) : null,
    sourceFilenames: Array.from(new Set([args.filename, ...jsonSourceIds])),
    sourceRecordCount: 1,
    metadataConfidenceTotal: args.metadataConfidence,
    metadataConfidenceSamples: 1,
    flagsSet: new Set<string>(
      args.catalogMatch ? [] : ["catalog-match-missing"]
    ),
    hasStructured: false,
    hasTroubleshooting: false,
    hasOcrText: false,
    chunks: [],
  }

  if (!args.catalogMatch && isPlaceholderValue(args.recordManufacturer || "")) {
    created.flagsSet.add("metadata-manufacturer-placeholder")
  }

  args.manuals.set(manualId, created)
  return created
}

/**
 * Turns one structured record into chunks: one per non-empty section, one per
 * troubleshooting entry, one per page of parts, and one for specifications.
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []

  for (const section of record.content?.sections || []) {
    const text = cleanText(section.text || "")
    if (!text) {
      continue
    }

    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: section.pageNumber ?? null,
        sectionTitle: cleanText(section.title || "") || null,
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "structured_section",
      })
    )
  }

  for (const item of record.content?.troubleshooting || []) {
    const problem = cleanText(item.problem || "")
    const solution = cleanText(item.solution || "")
    const text = cleanText(
      [
        problem ? `Problem: ${problem}` : "",
        solution ? `Likely cause or solution: ${solution}` : "",
      ]
        .filter(Boolean)
        .join("\n")
    )
    if (!text) {
      continue
    }

    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: item.pageNumber ?? null,
        sectionTitle: problem ?
`Troubleshooting: ${problem}` : "Troubleshooting",
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "troubleshooting",
      })
    )
  }

  // Page number -> part lines. Parts without a page are grouped under 0,
  // which is written out as a null page number below.
  const partsByPage = new Map<number, string[]>()
  for (const item of record.content?.partsDatabase || []) {
    const partNumber = cleanText(item.partNumber || "")
    const description = cleanText(item.description || "")
    if (partNumber.length < 2 && description.length < 4) {
      continue
    }

    const pageNumber = item.pageNumber ?? 0
    const parts = partsByPage.get(pageNumber) || []
    parts.push(formatPartReferenceLine(partNumber, description))
    partsByPage.set(pageNumber, parts)
  }

  for (const [pageNumber, parts] of partsByPage.entries()) {
    chunks.push(
      createChunk({
        manual,
        // NOTE(review): only the first 12 parts of a page are kept; the rest
        // are dropped — confirm this cap is intended.
        text: parts.slice(0, 12).join("\n"),
        pageNumber: pageNumber || null,
        sectionTitle: "Parts reference",
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "parts_database",
      })
    )
  }

  if (record.content?.specifications) {
    const specsText = cleanText(
      Object.entries(record.content.specifications)
        .map(([key, value]) => `${humanizeTitle(key)}: ${String(value)}`)
        .join("\n")
    )
    if (specsText) {
      chunks.push(
        createChunk({
          manual,
          text: specsText,
          pageNumber: null,
          sectionTitle: "Specifications",
          sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
          sourceKind: "structured_section",
        })
      )
    }
  }

  return chunks
}

/**
 * Formats one part as a text line.
 *
 * Without a part number the line is `Part: <description>` rather than the
 * malformed `Part : <description>`. Returns "" when both inputs are empty.
 */
function formatPartReferenceLine(partNumber: string, description: string) {
  if (partNumber && description) {
    return `Part ${partNumber}: ${description}`
  }
  if (description) {
    return `Part: ${description}`
  }
  return partNumber ? `Part ${partNumber}` : ""
}

/**
 * Turns one extracted (OCR) record into chunks: one per page that has cleaned
 * text and a non-zero word count, and one per non-empty parts list.
 */
function buildExtractedChunks(
  record: ExtractedManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []

  for (const page of record.text?.pages || []) {
    const text = cleanText(page.text || "")
    // Pages with a missing or zero wordCount are skipped even if text exists.
    if (!text || (page.wordCount || 0) === 0) {
      continue
    }

    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: page.pageNumber ?? null,
        sectionTitle: page.pageNumber ? `Page ${page.pageNumber}` : "OCR page",
        sourceFilename: record.filename || null,
        sourceKind: "ocr_page",
      })
    )
  }

  for (const list of record.partsLists || []) {
    const parts = (list.parts || [])
      .map((part) =>
        formatPartReferenceLine(
          cleanText(part.partNumber || ""),
          cleanText(part.description || "")
        )
      )
      .filter(Boolean)
      .slice(0, 12)
    if (parts.length === 0) {
      continue
    }

    chunks.push(
      createChunk({
        manual,
        text: parts.join("\n"),
        pageNumber: list.pageNumber ?? null,
        sectionTitle: "Parts reference",
        sourceFilename: record.filename || null,
        sourceKind: "parts_database",
      })
    )
  }

  return chunks
}

/**
 * Appends a chunk to its manual unless an equivalent chunk was already seen.
 *
 * Equivalence is manual id + page + normalized section title + the normalized
 * first 180 characters of text. Collapsed duplicates flag the manual with
 * "duplicate-chunk-collapsed".
 */
function addChunkToManual(
  manual: ManualAccumulator,
  chunk: ManualsQdrantChunk,
  chunkDedup: Set<string>
) {
  const dedupKey = [
    chunk.manualId,
    chunk.pageNumber ?? "na",
    normalizeIdentifier(chunk.sectionTitle || ""),
    normalizeIdentifier(chunk.text.slice(0, 180)),
  ].join("::")
  if (chunkDedup.has(dedupKey)) {
    manual.flagsSet.add("duplicate-chunk-collapsed")
    return
  }

  chunkDedup.add(dedupKey)
  manual.chunks.push(chunk)
}

/**
 * Computes the derived manual fields (confidence, parse quality, duplicate
 * risk, tier, profiles, flags) and re-stamps every chunk with the manual's
 * final metadata, which may have changed after the chunk was created.
 */
function finalizeManual(manual: ManualAccumulator): ManualsQdrantManual & {
  chunks: ManualsQdrantChunk[]
} {
  const metadataConfidence = clamp(
    manual.metadataConfidenceTotal / manual.metadataConfidenceSamples
  )
  // Reaches 1 once five or more source records were merged into the manual.
  const duplicateRisk = clamp((manual.sourceRecordCount - 1) / 4)
  const highConfidenceChunkCount = manual.chunks.filter(
    (chunk) => chunk.embeddingTier === "high_confidence"
  ).length
  const parseQuality = clamp(
    metadataConfidence * 0.4 +
      (manual.hasStructured ? 0.2 : 0) +
      (manual.hasTroubleshooting ? 0.15 : 0) +
      (manual.hasOcrText ? 0.1 : 0) +
      clamp(highConfidenceChunkCount / 8) * 0.25 -
      duplicateRisk * 0.15
  )
  const embeddingTier: ManualsEmbeddingTier =
    parseQuality >= 0.72 && highConfidenceChunkCount > 0
      ? "high_confidence"
      : parseQuality >= 0.46 && manual.chunks.length > 0
        ? "fallback"
        : "exclude"
  const profiles = buildProfiles({
    labels: Array.from(new Set(manual.chunks.flatMap((chunk) => chunk.labels))),
    embeddingTier,
    overallQuality: parseQuality,
    isRisky: manual.chunks.some((chunk) => chunk.isRisky),
  })

  const finalizedChunks = manual.chunks.map((chunk) => {
    return {
      ...chunk,
      manufacturer: manual.manufacturer,
      manufacturerCanonical: manual.manufacturerCanonical,
      model: manual.model,
      manualType: manual.manualType,
      category: manual.category,
      manualUrl: manual.manualUrl,
      thumbnailUrl: manual.thumbnailUrl,
      profiles: buildProfiles({
        labels: chunk.labels,
        embeddingTier: chunk.embeddingTier,
        overallQuality: chunk.overallQuality,
        isRisky: chunk.isRisky,
      }),
    }
  })

  return {
    manualId: manual.manualId,
    title: manual.title,
    manufacturer: manual.manufacturer,
    manufacturerCanonical: manual.manufacturerCanonical,
    model: manual.model,
    manualType: manual.manualType,
    category: manual.category,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    sourceFilenames: Array.from(new Set(manual.sourceFilenames)).sort(),
    sourceRecordCount: manual.sourceRecordCount,
    metadataConfidence,
    parseQuality,
    duplicateRisk,
    chunkCount: finalizedChunks.length,
    highConfidenceChunkCount,
    profiles,
    embeddingTier,
    flags: Array.from(manual.flagsSet).sort(),
    chunks: finalizedChunks,
  }
}

/**
 * Aggregates corpus-level counts: input sizes, chunks per embedding tier,
 * manuals per manufacturer, chunks per label and chunks per profile.
 */
function buildCorpusStats(args: {
  catalogManuals: CatalogManual[]
  structuredRecords: StructuredManualRecord[]
  extractedRecords: ExtractedManualRecord[]
  manuals: Array<ManualsQdrantManual>
  chunks: ManualsQdrantChunk[]
}): ManualsQdrantCorpusStats {
  const manualsByManufacturer: Record<string, number> = {}
  const chunksByLabel: Record<string, number> = {}
  const profileCounts: Record<ManualsQdrantProfile, number> = {
    public_safe: 0,
    internal_tech: 0,
  }

  for (const manual of args.manuals) {
    manualsByManufacturer[manual.manufacturer] =
      (manualsByManufacturer[manual.manufacturer] || 0) + 1
  }

  for (const chunk of args.chunks) {
    for (const label of chunk.labels) {
      chunksByLabel[label] = (chunksByLabel[label] || 0) + 1
    }
    for (const profile of chunk.profiles) {
profileCounts[profile] += 1
    }
  }

  return {
    catalogManuals: args.catalogManuals.length,
    structuredRecords: args.structuredRecords.length,
    extractedRecords: args.extractedRecords.length,
    normalizedManuals: args.manuals.length,
    chunkCount: args.chunks.length,
    highConfidenceChunks: args.chunks.filter(
      (chunk) => chunk.embeddingTier === "high_confidence"
    ).length,
    fallbackChunks: args.chunks.filter(
      (chunk) => chunk.embeddingTier === "fallback"
    ).length,
    excludedChunks: args.chunks.filter(
      (chunk) => chunk.embeddingTier === "exclude"
    ).length,
    manualsByManufacturer,
    chunksByLabel,
    profileCounts,
  }
}

/**
 * Builds one chunk for a manual: cleans the text, derives labels, quality
 * scores, risk, embedding tier, profiles and flags, and snapshots the
 * manual's current metadata onto the chunk.
 */
function createChunk(args: {
  manual: ManualAccumulator
  text: string
  pageNumber: number | null
  sectionTitle: string | null
  sourceFilename: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
}): ManualsQdrantChunk {
  const cleanedText = cleanText(args.text)
  const labels = deriveChunkLabels({
    text: cleanedText,
    sectionTitle: args.sectionTitle,
    sourceKind: args.sourceKind,
    manualType: args.manual.manualType,
  })
  // Running average of the manual's confidence samples at creation time.
  const metadataConfidence = clamp(
    args.manual.metadataConfidenceTotal / args.manual.metadataConfidenceSamples
  )
  const textQuality = scoreTextQuality(cleanedText, labels)
  const overallQuality = clamp(textQuality * 0.65 + metadataConfidence * 0.35)

  let isRisky = labels.includes("wiring")
  if (!isRisky) {
    // Normalize the chunk text once rather than once per keyword.
    const normalizedChunkText = normalizeText(cleanedText)
    isRisky = TECH_RISK_KEYWORDS.some((keyword) =>
      normalizedChunkText.includes(normalizeText(keyword))
    )
  }

  const embeddingTier = deriveEmbeddingTier({
    labels,
    overallQuality,
    sourceKind: args.sourceKind,
    isRisky,
  })

  return {
    // NOTE(review): the id uses the first 80 characters of text while the
    // dedup key in addChunkToManual uses 180, so two chunks that differ only
    // after character 80 are both kept with the same chunkId — confirm
    // whether ids must be unique downstream.
    chunkId: normalizeIdentifier(
      `${args.manual.manualId} ${args.pageNumber ?? "na"} ${args.sectionTitle || ""} ${cleanedText.slice(0, 80)}`
    ),
    manualId: args.manual.manualId,
    title: args.manual.title,
    manufacturer: args.manual.manufacturer,
    manufacturerCanonical: args.manual.manufacturerCanonical,
    model: args.manual.model,
    manualType: args.manual.manualType,
    category: args.manual.category,
    pageNumber: args.pageNumber,
    sectionTitle: args.sectionTitle,
    text: cleanedText,
    sourceFilename: args.sourceFilename,
    sourceKind: args.sourceKind,
    labels,
    manualUrl: args.manual.manualUrl,
    thumbnailUrl: args.manual.thumbnailUrl,
    metadataConfidence,
    textQuality,
    overallQuality,
    embeddingTier,
    profiles: buildProfiles({
      labels,
      embeddingTier,
      overallQuality,
      isRisky,
    }),
    isRisky,
    flags: buildChunkFlags(cleanedText, labels, overallQuality),
  }
}

/**
 * Scores a chunk against a query.
 *
 * Starts from the chunk's quality (x10), adds points for token matches and
 * for query intent that agrees with the chunk's labels, and subtracts points
 * for brochure, table-of-contents and flowchart chunks.
 */
function scoreChunkForQuery(
  chunk: ManualsQdrantChunk,
  queryTokens: string[],
  queryLower: string
) {
  const chunkText = normalizeText(
    [
      chunk.title,
      chunk.manufacturer,
      chunk.model,
      chunk.sectionTitle,
      chunk.text,
      ...chunk.labels,
    ]
      .filter(Boolean)
      .join(" ")
  )
  const chunkTokens = new Set(tokenize(chunkText))
  let score = chunk.overallQuality * 10

  for (const token of queryTokens) {
    if (chunkTokens.has(token)) {
      score += 3.5
    } else if (token.length >= 4 && chunkText.includes(token)) {
      score += 1
    }
  }

  if (
    (queryLower.includes("error") ||
      queryLower.includes("not ") ||
      queryLower.includes("wont") ||
      queryLower.includes("won t")) &&
    chunk.labels.includes("troubleshooting")
  ) {
    score += 10
  }

  // "part" also matches "parts", so no separate check is needed.
  if (
    (queryLower.includes("part") ||
      queryLower.includes("coin") ||
      queryLower.includes("bill")) &&
    chunk.labels.includes("parts")
  ) {
    score += 7
  }

  if (
    (queryLower.includes("manual") || queryLower.includes("service")) &&
    chunk.labels.includes("service")
  ) {
    score += 5
  }

  if (queryLower.includes("wiring") && chunk.labels.includes("wiring")) {
    score += 6
  }

  if (chunk.labels.includes("brochure")) {
    score -= 5
  }

  if (chunk.labels.includes("toc") || chunk.labels.includes("flowchart")) {
    score -= 8
  }

  return score
}

/**
 * Derives the sorted label set of a chunk from its source kind, the manual
 * type, and keywords found in the section title, text and manual type.
 * Falls back to ["general"] when nothing matches.
 */
function deriveChunkLabels(args: {
  text: string
  sectionTitle: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
  manualType: string
}): ManualsQdrantChunkLabel[] {
  const labels = new Set<ManualsQdrantChunkLabel>()
  const haystack = normalizeText(
    [args.sectionTitle, args.text, args.manualType].filter(Boolean).join(" ")
  )
  // True when the haystack contains any keyword (normalized the same way).
  const mentionsAny = (keywords: string[]) =>
    keywords.some((keyword) => haystack.includes(normalizeText(keyword)))

  if (
    args.sourceKind === "troubleshooting" ||
    mentionsAny(TROUBLESHOOTING_KEYWORDS)
  ) {
    // Troubleshooting content is always also labelled as service content.
    labels.add("troubleshooting")
    labels.add("service")
  }

  if (args.sourceKind === "parts_database" || mentionsAny(PARTS_KEYWORDS)) {
    labels.add("parts")
  }

  if (args.manualType === "operator" || mentionsAny(OPERATOR_KEYWORDS)) {
    labels.add("operator")
  }

  if (
    args.manualType === "service" ||
    haystack.includes("technical manual") ||
    haystack.includes("repair")
  ) {
    labels.add("service")
  }

  if (mentionsAny(SPECS_KEYWORDS)) {
    labels.add("specs")
  }

  if (
    haystack.includes("table of contents") ||
    haystack.includes("list of figures") ||
    haystack.startsWith("contents")
  ) {
    labels.add("toc")
  }

  if (
    haystack.includes("flow chart") ||
    haystack.includes("flowchart") ||
    looksLikeFlowchart(args.text)
  ) {
    labels.add("flowchart")
  }

  if (
    haystack.includes("wiring") ||
    haystack.includes("electrical") ||
    haystack.includes("schematic") ||
    haystack.includes("voltage")
  ) {
    labels.add("wiring")
  }

  if (args.manualType === "brochure" || mentionsAny(MARKETING_KEYWORDS)) {
    labels.add("brochure")
  }

  if (labels.size === 0) {
    labels.add("general")
  }

  return Array.from(labels).sort()
}

/**
 * Chooses a chunk's embedding tier. Rules are checked in order:
 * low quality / toc / flowchart are excluded; brochures are at best
 * "fallback"; weak OCR pages and weak risky chunks are "fallback"; everything
 * else is "high_confidence" from 0.64 quality upward.
 */
function deriveEmbeddingTier(args: {
  labels: ManualsQdrantChunkLabel[]
  overallQuality: number
  sourceKind: ManualsQdrantChunk["sourceKind"]
  isRisky: boolean
}): ManualsEmbeddingTier {
  if (
    args.overallQuality < 0.34 ||
    args.labels.includes("toc") ||
    args.labels.includes("flowchart")
  ) {
    return "exclude"
  }

  if (args.labels.includes("brochure")) {
    return args.overallQuality >= 0.62 ? "fallback" : "exclude"
  }

  if (
    args.sourceKind === "ocr_page" &&
    args.overallQuality < 0.58 &&
    !args.labels.includes("troubleshooting")
  ) {
    return "fallback"
  }

  if (args.isRisky && args.overallQuality < 0.7) {
    return "fallback"
  }

  return args.overallQuality >= 0.64 ? "high_confidence" : "fallback"
}

/**
 * Decides which profiles may see a chunk or manual.
 *
 * Excluded items get no profile. "internal_tech" takes anything that is not
 * brochure or toc. "public_safe" additionally requires non-risky content,
 * quality of at least 0.56 and no flowchart or wiring label.
 */
function buildProfiles(args: {
  labels: ManualsQdrantChunkLabel[]
  embeddingTier: ManualsEmbeddingTier
  overallQuality: number
  isRisky: boolean
}): ManualsQdrantProfile[] {
  if (args.embeddingTier === "exclude") {
    return []
  }

  const profiles = new Set<ManualsQdrantProfile>()
  if (!args.labels.includes("brochure") && !args.labels.includes("toc")) {
    profiles.add("internal_tech")
  }

  if (
    !args.isRisky &&
    args.overallQuality >= 0.56 &&
    !args.labels.includes("brochure") &&
    !args.labels.includes("flowchart") &&
    !args.labels.includes("toc") &&
    !args.labels.includes("wiring")
  ) {
    profiles.add("public_safe")
  }

  return Array.from(profiles).sort()
}

/** Returns the sorted diagnostic flags for a chunk. */
function buildChunkFlags(
  text: string,
  labels: ManualsQdrantChunkLabel[],
  overallQuality: number
) {
  const flags = new Set<string>()
  if (overallQuality < 0.5) {
    flags.add("low-quality")
  }
  if (labels.includes("brochure")) {
    flags.add("marketing-heavy")
  }
  if (labels.includes("wiring")) {
    flags.add("risky-technical")
  }
  if (looksLikeOcrGarbage(text)) {
    flags.add("ocr-noisy")
  }
  return Array.from(flags).sort()
}

/**
 * Heuristic text quality: weighted mix of letter ratio, token count and
 * sentence punctuation, plus a bonus for text that does not look like OCR
 * garbage, then adjusted for troubleshooting / brochure labels and for many
 * upper-case words.
 */
function scoreTextQuality(
  text: string,
  labels: ManualsQdrantChunkLabel[]
) {
  const alphaChars = text.replace(/[^a-z]/gi, "").length
  // `|| 1` avoids dividing by zero for whitespace-only text.
  const allChars = text.replace(/\s+/g, "").length || 1
  const alphaRatio = alphaChars / allChars
  const tokenCount = tokenize(text).length
  const uppercaseBursts = (text.match(/\b[A-Z]{4,}\b/g) || []).length
  const sentenceLike = (text.match(/[.!?]/g) || []).length

  let score =
    clamp(alphaRatio) * 0.35 +
    clamp(tokenCount / 120) * 0.3 +
    clamp(sentenceLike / 8) * 0.15 +
    (looksLikeOcrGarbage(text) ?
0 : 0.2) if (labels.includes("troubleshooting")) { score += 0.12 } if (labels.includes("brochure")) { score -= 0.1 } if (uppercaseBursts > 18) { score -= 0.12 } return clamp(score) } function detectManualType(value: string) { const normalized = normalizeText(value) if ( normalized.includes("brochure") || normalized.includes("product notice") || normalized.includes("warranty") ) { return "brochure" } if (normalized.includes("parts")) { return "parts" } if (normalized.includes("operator") || normalized.includes("user guide")) { return "operator" } if ( normalized.includes("service") || normalized.includes("repair") || normalized.includes("technical") ) { return "service" } return "manual" } function hasUsefulOcrText(record: ExtractedManualRecord) { const words = (record.text?.pages || []).reduce( (sum, page) => sum + (page.wordCount || 0), 0 ) return words > 0 } function looksLikeFlowchart(text: string) { const normalized = text.replace(/\s+/g, " ").trim() return ( normalized.includes("* # #") || normalized.includes("press selection number") || normalized.split("\n").filter((line) => /^[*#A-Z0-9 ()/-]+$/.test(line.trim())) .length > 8 ) } function looksLikeOcrGarbage(text: string) { const normalized = text.replace(/\s+/g, " ").trim() const weirdChars = (normalized.match(/[^\x20-\x7E\n\r\t]/g) || []).length const singleLetterBursts = (normalized.match(/\b[A-Z](?:\s+[A-Z]){4,}\b/g) || []) .length return weirdChars > 6 || singleLetterBursts > 0 } function extractModel(value: string) { const matches = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9-]*\b/g) || [] return matches[0] ? 
sanitizeModel(matches[0]) : null } function sanitizeModel(value: string) { const normalized = normalizeIdentifier(value).replace(/^unknown-?/, "") return normalized || null } function normalizeManufacturer(value: string | null | undefined): string { const normalized = normalizeText(value || "") for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) { if ( canonical.toLowerCase() === normalized || aliases.some((alias) => normalized.includes(normalizeText(alias))) ) { return canonical } } if (!normalized || isPlaceholderValue(normalized) || /^\d/.test(normalized)) { return "Other" } return toTitleCase(normalized) } function humanizeManufacturer(value: string): string { return normalizeManufacturer(value) } function humanizeTitle(value: string) { return value .replace(/[-_]+/g, " ") .replace(/\s+/g, " ") .trim() } function isPlaceholderValue(value: string) { const normalized = normalizeText(value) return ( !normalized || normalized === "unknown" || normalized === "manual" || /^\d+$/.test(normalized) ) } function cleanText(value: string) { return value.replace(/\u00ad/g, "").replace(/\s+/g, " ").trim() } function normalizeText(value: string) { return value .normalize("NFKD") .replace(/[^a-zA-Z0-9]+/g, " ") .toLowerCase() .trim() } function tokenize(value: string) { return normalizeText(value) .split(" ") .map((token) => token.trim()) .filter( (token) => token.length > 1 && !STOPWORDS.has(token) && !/^\d+$/.test(token) ) } function normalizeIdentifier(value: string) { return normalizeText(stripExtension(value)).replace(/\s+/g, "-") } function stripExtension(value: string) { return value.replace(/\.pdf$/i, "").replace(/\.json$/i, "") } function getStructuredRecordMatchKey(record: StructuredManualRecord) { return [ normalizeIdentifier(record.manualId || ""), normalizeManufacturer(record.manufacturer), sanitizeModel(record.model || "") || "unknown", detectManualType(record.manualType || ""), ].join("::") } function buildCanonicalManualId(args: { 
catalogManual: Manual | null manufacturer: string model: string | null manualType: string filename: string }) { if (args.catalogManual) { return normalizeIdentifier(args.catalogManual.path || args.catalogManual.filename) } const normalizedManufacturer = normalizeManufacturer(args.manufacturer) const hasReliableIdentity = normalizedManufacturer !== "Other" || Boolean(args.model) if (hasReliableIdentity) { return normalizeIdentifier( `${normalizedManufacturer} ${args.model || "unknown"} ${args.manualType}` ) } return normalizeIdentifier(`${args.filename} ${args.manualType}`) } function toTitleCase(value: string) { return value .split(" ") .filter(Boolean) .map((part) => part.charAt(0).toUpperCase() + part.slice(1)) .join(" ") } function clamp(value: number) { return Math.max(0, Math.min(1, value)) } async function readJsonFile(path: string) { return JSON.parse(await readFile(path, "utf8")) as T }