// Source: Rocky_Mountain_Vending/lib/manuals-qdrant-corpus.ts
// (1734 lines, 44 KiB, TypeScript)

import { mkdir, readFile, readdir, writeFile } from "node:fs/promises"
import { basename, join } from "node:path"
import { scanManuals } from "@/lib/manuals"
import { getManualsDataRoot } from "@/lib/manuals-paths"
import type { Manual } from "@/lib/manuals-types"
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"
// Location of the optimized manuals dataset, expressed as path segments
// relative to the manuals data root (joined with node:path at load time).
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]
// Directory holding one structured JSON record per parsed manual.
const STRUCTURED_MANUALS_DIR = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "structured",
]
// Aggregated index of every structured record in a single JSON file.
const STRUCTURED_MANUALS_INDEX_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "all_structured_data.json",
]
// Raw OCR/extraction dump for all manuals (page text, parts lists, sections).
const EXTRACTED_CONTENT_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "extracted_content",
  "manuals_extracted_content.json",
]
// Tokens dropped during tokenization/matching. Besides common English
// function words this also removes domain words ("manual", "machine",
// "service") that appear in nearly every document and therefore carry no
// discriminating signal for retrieval or catalog matching.
const STOPWORDS = new Set([
  "a",
  "an",
  "and",
  "are",
  "at",
  "be",
  "by",
  "for",
  "from",
  "in",
  "is",
  "it",
  "manual",
  "machine",
  "of",
  "on",
  "or",
  "service",
  "the",
  "to",
  "with",
])
// Terms whose presence marks a chunk as technically risky (electrical /
// refrigeration detail); such chunks are flagged `isRisky` and kept out of
// the public-facing profile.
const TECH_RISK_KEYWORDS = [
  "wiring",
  "diagram",
  "schematic",
  "electrical",
  "voltage",
  "jumper",
  "compressor",
  "refrigerant",
  "bypass",
  "board level",
]
// Sales-brochure language; used to label (and down-rank) marketing content.
const MARKETING_KEYWORDS = [
  "increase sales",
  "more profits",
  "contact us",
  "operator can double up",
  "your employees",
  "productivity",
  "variety",
  "brochure",
]
// Phrases typical of specification tables (dimensions, ratings, capacity).
const SPECS_KEYWORDS = [
  "dimensions",
  "height:",
  "width:",
  "depth:",
  "shipping weight",
  "electrical:",
  "listings:",
  "capacity",
  "voltage",
]
// Phrases typical of troubleshooting / fault-diagnosis sections.
const TROUBLESHOOTING_KEYWORDS = [
  "probable cause",
  "solution",
  "troubleshooting",
  "not accepting",
  "will not vend",
  "check fuse",
  "error code",
]
// Phrases typical of operator / end-user documentation.
const OPERATOR_KEYWORDS = [
  "user guide",
  "operators guide",
  "operation",
  "programming",
  "setup guide",
  "how to",
]
// Phrases typical of parts catalogs and exploded-view references.
const PARTS_KEYWORDS = [
  "parts manual",
  "parts reference",
  "part number",
  "parts list",
  "exploded view",
]
// Canonical manufacturer name -> lowercase aliases/spellings observed in
// filenames and record metadata (brand lines like "bevmax" included).
// Presumably consumed by normalizeManufacturer (defined later in this
// file) to fold variants onto one canonical key — confirm against that
// helper's implementation.
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  "AP": [
    "ap",
    "automatic products",
    "automatic-products",
    "snackshop",
  ],
  "Other": ["other", "unknown", "bill mechs", "coin mechs"],
  "Coinco": ["coinco"],
  "Crane": [
    "crane",
    "national vendors",
    "national",
    "merchant",
    "merchant series",
  ],
  "Dixie-Narco": ["dixie", "narco", "dixie narco", "dixie-narco", "bevmax"],
  "GPL": ["gpl", "general products"],
  "MEI Mars": ["mei", "mars", "bill validator"],
  "Royal Vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  "Rowe": ["rowe"],
  "Seaga": ["seaga"],
  "USI": ["usi", "u select it", "u-select-it", "uselectit"],
  "Vendo": ["vendo", "sanden"],
}
/**
 * Visibility profile for retrieval: `public_safe` excludes risky technical
 * content; `internal_tech` is the unrestricted technician view.
 */
export type ManualsQdrantProfile = "public_safe" | "internal_tech"
/** Content label assigned to a chunk (a chunk may carry several). */
export type ManualsQdrantChunkLabel =
  | "brochure"
  | "flowchart"
  | "operator"
  | "parts"
  | "service"
  | "specs"
  | "toc"
  | "troubleshooting"
  | "wiring"
  | "general"
/** Embedding eligibility: embed, embed-as-fallback, or skip entirely. */
export type ManualsEmbeddingTier =
  | "high_confidence"
  | "fallback"
  | "exclude"
/** One normalized manual, aggregated from catalog + structured + OCR sources. */
export type ManualsQdrantManual = {
  manualId: string
  title: string
  manufacturer: string
  // Normalized manufacturer key used for matching (see MANUFACTURER_ALIASES).
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null
  manualUrl: string | null
  thumbnailUrl: string | null
  // All source files (catalog PDFs and/or JSON records) merged into this manual.
  sourceFilenames: string[]
  sourceRecordCount: number
  // 0..1 scores computed during finalization.
  metadataConfidence: number
  parseQuality: number
  duplicateRisk: number
  chunkCount: number
  highConfidenceChunkCount: number
  profiles: ManualsQdrantProfile[]
  embeddingTier: ManualsEmbeddingTier
  // Diagnostic flags accumulated during the build (sorted, deduplicated).
  flags: string[]
}
/** One retrievable text chunk, denormalized with its manual's metadata. */
export type ManualsQdrantChunk = {
  chunkId: string
  manualId: string
  title: string
  manufacturer: string
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceFilename: string | null
  // Where the text came from in the extraction pipeline.
  sourceKind: "ocr_page" | "parts_database" | "structured_section" | "troubleshooting"
  labels: ManualsQdrantChunkLabel[]
  manualUrl: string | null
  thumbnailUrl: string | null
  // 0..1 quality scores; overallQuality blends text and metadata quality.
  metadataConfidence: number
  textQuality: number
  overallQuality: number
  embeddingTier: ManualsEmbeddingTier
  profiles: ManualsQdrantProfile[]
  // True when the chunk contains electrically/technically risky content.
  isRisky: boolean
  flags: string[]
}
/** Aggregate counts describing a built corpus. */
export type ManualsQdrantCorpusStats = {
  catalogManuals: number
  structuredRecords: number
  extractedRecords: number
  normalizedManuals: number
  chunkCount: number
  highConfidenceChunks: number
  fallbackChunks: number
  excludedChunks: number
  manualsByManufacturer: Record<string, number>
  chunksByLabel: Record<string, number>
  profileCounts: Record<ManualsQdrantProfile, number>
}
/** The full built corpus: stats plus finalized manuals and chunks. */
export type ManualsQdrantCorpus = {
  generatedAt: string
  stats: ManualsQdrantCorpusStats
  manuals: ManualsQdrantManual[]
  chunks: ManualsQdrantChunk[]
}
/** One retrieval smoke-test case: a query plus expectations on the results. */
export type ManualsQdrantEvaluationCase = {
  id: string
  query: string
  profile: ManualsQdrantProfile
  expectedManufacturer?: string
  expectedChunkLabels?: ManualsQdrantChunkLabel[]
  disallowedChunkLabels?: ManualsQdrantChunkLabel[]
}
/** A scored search hit. */
export type ManualsQdrantSearchResult = {
  chunk: ManualsQdrantChunk
  score: number
}
/** Per-case outcomes plus summary pass counts for an evaluation run. */
export type ManualsQdrantEvaluationResult = {
  cases: Array<{
    id: string
    query: string
    profile: ManualsQdrantProfile
    // null means the case declared no expected manufacturer (not applicable).
    passedTop3Manufacturer: boolean | null
    passedTop5Label: boolean
    passedDisallowedCheck: boolean
    topManufacturers: string[]
    topLabels: ManualsQdrantChunkLabel[]
  }>
  summary: {
    totalCases: number
    top3ManufacturerPasses: number
    labelPasses: number
    disallowedPasses: number
  }
}
// --- Shapes of the on-disk JSON records (all fields optional: the
// extraction pipeline's output is not guaranteed complete). ---

/** A titled text section from a structured manual record. */
type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}
/** A problem/solution pair from a structured troubleshooting table. */
type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}
/** One parts-catalog entry. */
type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}
/** One per-manual structured JSON record (training_data/structured/*.json). */
type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  // Set by the loader to the JSON filename the record came from.
  sourceFilename?: string
  metadata?: {
    pageCount?: number
  }
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
    specifications?: Record<string, unknown>
  }
}
/** One OCR'd page from the extracted-content dump. */
type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}
/** A parts list detected on a page of an extracted manual. */
type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}
/** One manual's entry in the extracted-content dump. */
type ExtractedManualRecord = {
  filename?: string
  filepath?: string
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
  partsLists?: ExtractedPartList[]
  sections?: StructuredSection[]
}
/** A catalog manual with precomputed matching metadata. */
type CatalogManual = {
  manual: Manual
  manufacturerCanonical: string
  modelGuess: string | null
  // Normalized haystack built from filename, manufacturer, category, etc.
  searchText: string
  tokenSet: Set<string>
}
/**
 * In-progress manual during the corpus build: the finalized fields are
 * replaced by running totals/sets until finalizeManual() computes them.
 */
type ManualAccumulator = Omit<
  ManualsQdrantManual,
  | "metadataConfidence"
  | "parseQuality"
  | "duplicateRisk"
  | "chunkCount"
  | "highConfidenceChunkCount"
  | "profiles"
  | "embeddingTier"
  | "flags"
> & {
  metadataConfidenceTotal: number
  metadataConfidenceSamples: number
  hasStructured: boolean
  hasTroubleshooting: boolean
  hasOcrText: boolean
  chunks: ManualsQdrantChunk[]
  flagsSet: Set<string>
}
// Built-in retrieval smoke tests. Each case pins an expected manufacturer
// and/or chunk labels among the top search hits; the "wiring-risky" case
// additionally asserts that wiring content is kept out of the public_safe
// profile, and "ambiguous-bad-query" guards against brochure spam on
// low-signal queries.
const DEFAULT_EVAL_CASES: ManualsQdrantEvaluationCase[] = [
  {
    id: "rvv-660-service",
    query: "RVV 660 service manual",
    profile: "internal_tech",
    expectedManufacturer: "Royal Vendors",
  },
  {
    id: "narco-bevmax-cooling",
    query: "Narco bevmax not cooling",
    profile: "public_safe",
    expectedManufacturer: "Dixie-Narco",
    expectedChunkLabels: ["service", "troubleshooting"],
  },
  {
    id: "coin-mech-dollars",
    query: "coin mech not accepting dollars",
    profile: "public_safe",
    expectedChunkLabels: ["troubleshooting", "parts"],
    disallowedChunkLabels: ["brochure"],
  },
  {
    id: "royal-coins",
    query: "Royal machine not accepting coins",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    expectedChunkLabels: ["troubleshooting"],
  },
  {
    id: "wiring-risky",
    query: "Royal wiring diagram voltage issue",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    disallowedChunkLabels: ["wiring"],
  },
  {
    id: "ambiguous-bad-query",
    query: "manual for strange mystery vendor",
    profile: "public_safe",
    disallowedChunkLabels: ["brochure"],
  },
]
/**
 * Returns the built-in retrieval evaluation cases.
 *
 * A shallow copy is returned so callers can reorder or append cases
 * without mutating the shared module-level defaults (the original
 * implementation leaked the mutable array itself).
 */
export function getDefaultManualsQdrantEvaluationCases() {
  return [...DEFAULT_EVAL_CASES]
}
// Module-level memo: building the corpus scans and parses the entire
// manuals dataset, so the in-flight promise is shared across callers.
let manualsQdrantCorpusPromise: Promise<ManualsQdrantCorpus> | null = null

/**
 * Returns the lazily-built, cached manuals corpus. Concurrent callers
 * share one build.
 *
 * If the build fails, the cached promise is cleared so a later call can
 * retry instead of replaying the cached rejection forever (the previous
 * implementation poisoned the cache on failure).
 */
export function getManualsQdrantCorpus() {
  if (!manualsQdrantCorpusPromise) {
    const pending = buildManualsQdrantCorpus()
    manualsQdrantCorpusPromise = pending
    pending.catch(() => {
      // Only clear the cache if the failed build is still the cached one
      // (a reset + rebuild may have replaced it in the meantime).
      if (manualsQdrantCorpusPromise === pending) {
        manualsQdrantCorpusPromise = null
      }
    })
  }
  return manualsQdrantCorpusPromise
}

/** Drops the cached corpus so the next call rebuilds from disk. */
export function resetManualsQdrantCorpusCache() {
  manualsQdrantCorpusPromise = null
}
/**
 * Builds the full corpus by merging three sources: the scanned manual
 * catalog, per-manual structured JSON records, and the OCR extraction
 * dump. Structured records are processed FIRST so that `hasStructured`
 * is set before the extracted pass runs — the extracted pass only
 * contributes OCR chunks to manuals that got no structured chunks.
 * Returns finalized manuals and chunks, each sorted by id for stable
 * output, plus aggregate stats.
 */
export async function buildManualsQdrantCorpus(): Promise<ManualsQdrantCorpus> {
  const catalogManuals = await loadCatalogManuals()
  const structuredRecords = await loadStructuredRecords()
  const extractedRecords = await loadExtractedRecords()
  const manuals = new Map<string, ManualAccumulator>()
  // Corpus-wide dedup set shared by both passes (see addChunkToManual).
  const chunkDedup = new Set<string>()
  // Pass 1: structured records (highest-quality source).
  for (const record of structuredRecords) {
    const catalogMatch = matchCatalogManual(
      [record.manualId, record.manufacturer, record.model]
        .filter(Boolean)
        .join(" "),
      catalogManuals,
      {
        manufacturerHint: record.manufacturer || null,
        modelHint: record.model || null,
      }
    )
    const filenameHint =
      catalogMatch?.manual.filename || record.sourceFilename || `${record.manualId || "manual"}.pdf`
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename: filenameHint,
      recordManufacturer: record.manufacturer || null,
      recordModel: record.model || null,
      manualTypeHint: detectManualType(
        `${record.manualType || ""} ${record.manualId || ""}`
      ),
      categoryHint: catalogMatch?.manual.category || null,
      // Confidence is much higher when the record maps to a catalog entry.
      metadataConfidence: catalogMatch ? 0.86 : 0.32,
      sourceRecordId: record.sourceFilename || record.manualId || filenameHint,
    })
    manual.hasStructured = true
    manual.hasTroubleshooting ||= Boolean(record.content?.troubleshooting?.length)
    for (const chunk of buildStructuredChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }
  // Pass 2: OCR extraction dump.
  for (const record of extractedRecords) {
    const filename = record.filename || basename(record.filepath || "manual.pdf")
    const catalogMatch = matchCatalogManual(filename, catalogManuals)
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename,
      recordManufacturer: null,
      recordModel: null,
      manualTypeHint: detectManualType(filename),
      categoryHint: catalogMatch?.manual.category || null,
      // 0.96 for an exact filename match, 0.78 for a fuzzy catalog match,
      // 0.36 when the manual is unknown to the catalog.
      metadataConfidence:
        catalogMatch && normalizeIdentifier(catalogMatch.manual.filename) ===
          normalizeIdentifier(filename)
          ? 0.96
          : catalogMatch
            ? 0.78
            : 0.36,
      sourceRecordId: record.filename || record.filepath || "unknown-extracted",
    })
    manual.hasOcrText ||= hasUsefulOcrText(record)
    // Prefer structured/manual chunks where they exist and use OCR pages only
    // as a fallback corpus for manuals we could not parse structurally.
    if (manual.hasStructured) {
      continue
    }
    for (const chunk of buildExtractedChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }
  // Finalize: compute quality scores/tiers, then sort for deterministic output.
  const finalizedManuals = Array.from(manuals.values())
    .map(finalizeManual)
    .sort((left, right) => left.manualId.localeCompare(right.manualId))
  const finalizedChunks = finalizedManuals
    .flatMap((manual) => manual.chunks)
    .sort((left, right) => left.chunkId.localeCompare(right.chunkId))
  const stats = buildCorpusStats({
    catalogManuals,
    structuredRecords,
    extractedRecords,
    manuals: finalizedManuals,
    chunks: finalizedChunks,
  })
  return {
    generatedAt: new Date().toISOString(),
    stats,
    manuals: finalizedManuals,
    chunks: finalizedChunks,
  }
}
/**
 * Lexical (token-overlap) search over the corpus chunks.
 *
 * Only chunks that are embeddable (not `exclude`), visible to the
 * requested profile, and non-empty are considered. Returns the
 * highest-scoring chunks, best first, capped at `limit` (default 5).
 */
export function searchManualsQdrantCorpus(
  corpus: ManualsQdrantCorpus,
  query: string,
  options?: {
    limit?: number
    profile?: ManualsQdrantProfile
  }
): ManualsQdrantSearchResult[] {
  const maxResults = options?.limit ?? 5
  const activeProfile = options?.profile ?? "internal_tech"
  const normalizedQuery = normalizeText(query)
  const queryTokens = tokenize(normalizedQuery)
  const loweredQuery = normalizedQuery.toLowerCase()
  const matches: ManualsQdrantSearchResult[] = []
  for (const chunk of corpus.chunks) {
    // Skip chunks that are excluded from embedding, hidden from this
    // profile, or effectively empty.
    if (chunk.embeddingTier === "exclude") {
      continue
    }
    if (!chunk.profiles.includes(activeProfile)) {
      continue
    }
    if (chunk.text.trim().length === 0) {
      continue
    }
    const score = scoreChunkForQuery(chunk, queryTokens, loweredQuery)
    if (score > 0) {
      matches.push({ chunk, score })
    }
  }
  matches.sort((left, right) => right.score - left.score)
  return matches.slice(0, maxResults)
}
export function evaluateManualsQdrantCorpus(
corpus: ManualsQdrantCorpus,
cases: ManualsQdrantEvaluationCase[] = DEFAULT_EVAL_CASES
): ManualsQdrantEvaluationResult {
const results = cases.map((evaluationCase) => {
const topResults = searchManualsQdrantCorpus(corpus, evaluationCase.query, {
limit: 5,
profile: evaluationCase.profile,
})
const topManufacturers = Array.from(
new Set(topResults.map((result) => result.chunk.manufacturer))
)
const topLabels = Array.from(
new Set(topResults.flatMap((result) => result.chunk.labels))
)
return {
id: evaluationCase.id,
query: evaluationCase.query,
profile: evaluationCase.profile,
passedTop3Manufacturer: evaluationCase.expectedManufacturer
? topManufacturers
.slice(0, 3)
.includes(evaluationCase.expectedManufacturer)
: null,
passedTop5Label: evaluationCase.expectedChunkLabels
? evaluationCase.expectedChunkLabels.some((label) =>
topLabels.includes(label)
)
: true,
passedDisallowedCheck: evaluationCase.disallowedChunkLabels
? !topLabels.some((label) =>
evaluationCase.disallowedChunkLabels?.includes(label)
)
: true,
topManufacturers,
topLabels,
}
})
return {
cases: results,
summary: {
totalCases: results.length,
top3ManufacturerPasses: results.filter(
(result) => result.passedTop3Manufacturer !== false
).length,
labelPasses: results.filter((result) => result.passedTop5Label).length,
disallowedPasses: results.filter(
(result) => result.passedDisallowedCheck
).length,
},
}
}
/**
 * Builds the corpus, runs the default evaluation, and writes every
 * corpus/evaluation artifact as pretty-printed JSON under `outputDir`
 * (default: `<cwd>/output/manuals-qdrant`).
 *
 * Returns the output directory plus the in-memory corpus and evaluation
 * so callers can inspect results without re-reading the files.
 */
export async function writeManualsQdrantArtifacts(args?: {
  outputDir?: string
}) {
  const outputDir = args?.outputDir || join(process.cwd(), "output", "manuals-qdrant")
  const corpus = await buildManualsQdrantCorpus()
  const evaluation = evaluateManualsQdrantCorpus(corpus)
  const internalTechChunks = corpus.chunks.filter((chunk) =>
    chunk.profiles.includes("internal_tech")
  )
  const publicSafeChunks = corpus.chunks.filter((chunk) =>
    chunk.profiles.includes("public_safe")
  )
  const highConfidenceChunks = corpus.chunks.filter(
    (chunk) => chunk.embeddingTier === "high_confidence"
  )
  await mkdir(outputDir, { recursive: true })
  // Serialize every artifact the same way (pretty-printed, 2-space indent).
  const writeJson = (filename: string, payload: unknown) =>
    writeFile(join(outputDir, filename), JSON.stringify(payload, null, 2))
  // The artifact files are independent of one another, so write them
  // concurrently instead of awaiting each write in sequence.
  await Promise.all([
    writeJson("summary.json", {
      generatedAt: corpus.generatedAt,
      stats: corpus.stats,
      evaluation: evaluation.summary,
    }),
    writeJson("manuals.json", corpus.manuals),
    writeJson("chunks.json", corpus.chunks),
    writeJson("chunks-internal-tech.json", internalTechChunks),
    writeJson("chunks-public-safe.json", publicSafeChunks),
    writeJson("chunks-high-confidence.json", highConfidenceChunks),
    writeJson("evaluation-cases.json", DEFAULT_EVAL_CASES),
    writeJson("evaluation-report.json", evaluation),
  ])
  return {
    outputDir,
    corpus,
    evaluation,
  }
}
/**
 * Loads the manual catalog and precomputes the matching metadata used by
 * matchCatalogManual: canonical manufacturer, a model guess, a normalized
 * search haystack, and its token set.
 */
async function loadCatalogManuals() {
  const manuals = await scanManuals()
  return manuals.map((manual) => {
    // Build the haystack from every searchable field on the catalog entry.
    const haystackParts = [
      manual.filename,
      manual.manufacturer,
      manual.category,
      ...(manual.searchTerms || []),
      ...(manual.commonNames || []),
    ].filter(Boolean)
    const title = haystackParts.join(" ")
    return {
      manual,
      manufacturerCanonical: normalizeManufacturer(manual.manufacturer),
      modelGuess: extractModel(title),
      searchText: normalizeText(title),
      tokenSet: new Set(tokenize(title)),
    } satisfies CatalogManual
  })
}
/**
 * Loads structured manual records. Reads the per-file records from the
 * structured directory, then — if the aggregated index file is non-empty —
 * returns the INDEX records instead, each annotated with the filename of
 * a matching per-file record so provenance is preserved.
 *
 * NOTE(review): this throws if the structured directory or index file is
 * missing — presumably the dataset is guaranteed present; confirm whether
 * a missing-directory fallback is wanted.
 */
async function loadStructuredRecords() {
  const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  const entries = await readdir(directory, { withFileTypes: true })
  const files = entries
    .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith(".json"))
    .map((entry) => entry.name)
  const records = await Promise.all(
    files.map(async (filename) => {
      const parsed = await readJsonFile<StructuredManualRecord>(
        join(directory, filename)
      )
      return {
        ...parsed,
        sourceFilename: filename,
        manualId: parsed.manualId || stripExtension(filename),
      }
    })
  )
  const indexRecords = await readJsonFile<StructuredManualRecord[]>(
    join(getManualsDataRoot(), ...STRUCTURED_MANUALS_INDEX_FILE)
  )
  // Empty index: fall back to the per-file records alone.
  if (indexRecords.length === 0) {
    return records
  }
  // Group per-file record filenames by match key. Several records can share
  // a key; the arrays stored here are consumed one entry at a time below.
  const recordsByKey = new Map<string, string[]>()
  for (const record of records) {
    const key = getStructuredRecordMatchKey(record)
    const existing = recordsByKey.get(key) || []
    existing.push(record.sourceFilename || `${record.manualId || "structured"}.json`)
    recordsByKey.set(key, existing)
  }
  return indexRecords.map((record, index) => {
    const key = getStructuredRecordMatchKey(record)
    const matchingFilenames = recordsByKey.get(key) || []
    return {
      ...record,
      // shift() mutates the array stored in the map, so multiple index
      // records with the same key each consume a DISTINCT filename; once
      // exhausted, a synthetic filename is generated instead.
      sourceFilename:
        matchingFilenames.shift() ||
        `${normalizeIdentifier(record.manualId || `structured-record-${index + 1}`)}.json`,
    }
  })
}
/** Reads the OCR/extraction dump for all manuals from the data root. */
async function loadExtractedRecords() {
  const dumpPath = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  return readJsonFile<ExtractedManualRecord[]>(dumpPath)
}
/**
 * Fuzzy-matches a free-form query (filename, id, or metadata string)
 * against the catalog.
 *
 * An exact filename-stem match wins immediately. Otherwise every catalog
 * entry is scored on manufacturer/model hints, substring containment, and
 * token overlap; the best-scoring entry is returned only if it reaches a
 * minimum score of 10, else null.
 */
function matchCatalogManual(
  rawQuery: string,
  catalogManuals: CatalogManual[],
  hints?: {
    manufacturerHint?: string | null
    modelHint?: string | null
  }
) {
  const normalizedQuery = normalizeText(rawQuery)
  const queryTokens = tokenize(normalizedQuery)
  const manufacturerHint = hints?.manufacturerHint
    ? normalizeManufacturer(hints.manufacturerHint)
    : null
  const modelHint = hints?.modelHint ? normalizeIdentifier(hints.modelHint) : null
  // Fast path: exact filename-stem equality.
  const queryStem = normalizeIdentifier(stripExtension(rawQuery))
  for (const candidate of catalogManuals) {
    if (normalizeIdentifier(stripExtension(candidate.manual.filename)) === queryStem) {
      return candidate
    }
  }
  // Scored path: track the best candidate (first one wins on ties, which
  // matches the stable-sort behavior of the scoring approach).
  let best: CatalogManual | null = null
  let bestScore = -Infinity
  for (const candidate of catalogManuals) {
    let score = 0
    if (manufacturerHint) {
      // Reward an exact canonical-manufacturer match, penalize a mismatch.
      score += candidate.manufacturerCanonical === manufacturerHint ? 16 : -4
    }
    if (modelHint) {
      if (candidate.modelGuess === modelHint) {
        score += 14
      } else if (candidate.searchText.includes(modelHint.replace(/-/g, " "))) {
        score += 8
      }
    }
    if (
      normalizedQuery &&
      candidate.searchText.includes(normalizedQuery.toLowerCase())
    ) {
      score += 20
    }
    for (const token of queryTokens) {
      if (candidate.tokenSet.has(token)) {
        score += 4
      } else if (token.length >= 4 && candidate.searchText.includes(token)) {
        score += 1.5
      }
    }
    if (score > bestScore) {
      bestScore = score
      best = candidate
    }
  }
  return best && bestScore >= 10 ? best : null
}
/**
 * Finds or creates the accumulator for the manual a source record belongs
 * to, keyed by a canonical manual id. When the id already exists, the new
 * source is MERGED into the existing accumulator: filenames are unioned,
 * confidence samples accumulated, and missing category/model/manufacturer
 * fields backfilled from the record. Returns the (shared, mutable)
 * accumulator.
 */
function getOrCreateManualAccumulator(args: {
  manuals: Map<string, ManualAccumulator>
  catalogMatch: CatalogManual | null
  filename: string
  recordManufacturer: string | null
  recordModel: string | null
  manualTypeHint: string
  categoryHint: string | null
  metadataConfidence: number
  sourceRecordId: string
}) {
  const manual = args.catalogMatch?.manual
  // Catalog metadata wins over record metadata; "Other" is the fallback.
  const manufacturer = humanizeManufacturer(
    manual?.manufacturer || args.recordManufacturer || "Other"
  )
  // Model preference: record value (unless a placeholder like "unknown"),
  // then the catalog guess, then whatever can be extracted from the filename.
  const model =
    args.recordModel && !isPlaceholderValue(args.recordModel)
      ? sanitizeModel(args.recordModel)
      : args.catalogMatch?.modelGuess || extractModel(args.filename)
  const manualType = args.manualTypeHint || detectManualType(args.filename)
  const manualId = buildCanonicalManualId({
    catalogManual: manual || null,
    manufacturer,
    model,
    manualType,
    filename: args.filename,
  })
  const existing = args.manuals.get(manualId)
  if (existing) {
    // Merge this source into the already-known manual.
    existing.sourceFilenames = Array.from(
      new Set([
        ...existing.sourceFilenames,
        args.filename,
        // Only JSON source-record ids are tracked as filenames.
        ...(args.sourceRecordId.toLowerCase().endsWith(".json")
          ? [args.sourceRecordId]
          : []),
      ])
    )
    existing.sourceRecordCount += 1
    existing.metadataConfidenceTotal += args.metadataConfidence
    existing.metadataConfidenceSamples += 1
    // Backfill fields the accumulator is still missing.
    if (args.categoryHint && !existing.category) {
      existing.category = args.categoryHint
    }
    if (args.recordModel && !existing.model && !isPlaceholderValue(args.recordModel)) {
      existing.model = sanitizeModel(args.recordModel)
    }
    if (args.recordManufacturer && existing.manufacturer === "Other") {
      existing.manufacturer = humanizeManufacturer(args.recordManufacturer)
      existing.manufacturerCanonical = normalizeManufacturer(existing.manufacturer)
    }
    existing.flagsSet.add(
      args.sourceRecordId === manualId ? "merged-duplicate-source" : "merged-source"
    )
    return existing
  }
  // First time this manual id is seen: create a fresh accumulator.
  const created: ManualAccumulator = {
    manualId,
    title: humanizeTitle(stripExtension(manual?.filename || args.filename)),
    manufacturer,
    manufacturerCanonical: normalizeManufacturer(manufacturer),
    model: model || null,
    manualType,
    category: args.categoryHint,
    manualUrl: manual ? getManualUrl(manual) : null,
    thumbnailUrl: manual ? getThumbnailUrl(manual) : null,
    sourceFilenames: Array.from(
      new Set([
        args.filename,
        ...(args.sourceRecordId.toLowerCase().endsWith(".json")
          ? [args.sourceRecordId]
          : []),
      ])
    ),
    sourceRecordCount: 1,
    metadataConfidenceTotal: args.metadataConfidence,
    metadataConfidenceSamples: 1,
    flagsSet: new Set(
      args.catalogMatch ? [] : ["catalog-match-missing"]
    ),
    hasStructured: false,
    hasTroubleshooting: false,
    hasOcrText: false,
    chunks: [],
  }
  if (!args.catalogMatch && isPlaceholderValue(args.recordManufacturer || "")) {
    created.flagsSet.add("metadata-manufacturer-placeholder")
  }
  args.manuals.set(manualId, created)
  return created
}
/**
 * Converts one structured manual record into chunks: one per non-empty
 * section, one per troubleshooting entry (problem + solution combined),
 * one per page of the parts database (capped at 12 parts), and one for
 * the specifications map if present.
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []
  for (const section of record.content?.sections || []) {
    const text = cleanText(section.text || "")
    if (!text) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: section.pageNumber ?? null,
        sectionTitle: cleanText(section.title || "") || null,
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "structured_section",
      })
    )
  }
  for (const item of record.content?.troubleshooting || []) {
    const problem = cleanText(item.problem || "")
    const solution = cleanText(item.solution || "")
    // Combine problem and solution into one retrievable passage.
    const text = cleanText(
      [
        problem ? `Problem: ${problem}` : "",
        solution ? `Likely cause or solution: ${solution}` : "",
      ]
        .filter(Boolean)
        .join("\n")
    )
    if (!text) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: item.pageNumber ?? null,
        sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting",
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "troubleshooting",
      })
    )
  }
  // Group parts by page so each page yields at most one chunk. Page 0 is
  // used as the bucket for entries without a page number.
  const partsByPage = new Map<number, string[]>()
  for (const item of record.content?.partsDatabase || []) {
    const partNumber = cleanText(item.partNumber || "")
    const description = cleanText(item.description || "")
    // Skip entries with neither a usable part number nor a description.
    if (partNumber.length < 2 && description.length < 4) {
      continue
    }
    const pageNumber = item.pageNumber ?? 0
    const parts = partsByPage.get(pageNumber) || []
    parts.push(description ? `Part ${partNumber}: ${description}` : `Part ${partNumber}`)
    partsByPage.set(pageNumber, parts)
  }
  for (const [pageNumber, parts] of partsByPage.entries()) {
    chunks.push(
      createChunk({
        manual,
        // Cap at 12 parts per page to keep chunks embedding-sized.
        text: parts.slice(0, 12).join("\n"),
        // The page-0 bucket maps back to a null page number here.
        pageNumber: pageNumber || null,
        sectionTitle: "Parts reference",
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "parts_database",
      })
    )
  }
  if (record.content?.specifications) {
    const specsText = cleanText(
      Object.entries(record.content.specifications)
        .map(([key, value]) => `${humanizeTitle(key)}: ${String(value)}`)
        .join("\n")
    )
    if (specsText) {
      chunks.push(
        createChunk({
          manual,
          text: specsText,
          pageNumber: null,
          sectionTitle: "Specifications",
          sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
          sourceKind: "structured_section",
        })
      )
    }
  }
  return chunks
}
/**
 * Converts one OCR-extracted manual record into chunks: one per page with
 * usable text, plus one per detected parts list (capped at 12 entries).
 */
function buildExtractedChunks(
  record: ExtractedManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []
  for (const page of record.text?.pages || []) {
    const text = cleanText(page.text || "")
    // NOTE(review): `page.wordCount || 0` means a page with text but a
    // MISSING wordCount field is also skipped — confirm that the extractor
    // always populates wordCount, otherwise this silently drops pages.
    if (!text || (page.wordCount || 0) === 0) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: page.pageNumber ?? null,
        sectionTitle: page.pageNumber ? `Page ${page.pageNumber}` : "OCR page",
        sourceFilename: record.filename || null,
        sourceKind: "ocr_page",
      })
    )
  }
  for (const list of record.partsLists || []) {
    // Format each part entry; drop empties and cap at 12 per list.
    const parts = (list.parts || [])
      .map((part) => {
        const partNumber = cleanText(part.partNumber || "")
        const description = cleanText(part.description || "")
        return description
          ? `Part ${partNumber}: ${description}`
          : partNumber
            ? `Part ${partNumber}`
            : ""
      })
      .filter(Boolean)
      .slice(0, 12)
    if (parts.length === 0) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text: parts.join("\n"),
        pageNumber: list.pageNumber ?? null,
        sectionTitle: "Parts reference",
        sourceFilename: record.filename || null,
        sourceKind: "parts_database",
      })
    )
  }
  return chunks
}
/**
 * Appends a chunk to its manual unless an equivalent chunk — same manual,
 * page, normalized section title, and first 180 chars of text — was
 * already registered anywhere in the corpus. Collapsed duplicates are
 * recorded as a manual-level flag.
 */
function addChunkToManual(
  manual: ManualAccumulator,
  chunk: ManualsQdrantChunk,
  chunkDedup: Set<string>
) {
  const keyParts = [
    chunk.manualId,
    chunk.pageNumber ?? "na",
    normalizeIdentifier(chunk.sectionTitle || ""),
    normalizeIdentifier(chunk.text.slice(0, 180)),
  ]
  const dedupKey = keyParts.join("::")
  if (!chunkDedup.has(dedupKey)) {
    chunkDedup.add(dedupKey)
    manual.chunks.push(chunk)
    return
  }
  manual.flagsSet.add("duplicate-chunk-collapsed")
}
/**
 * Converts an accumulator into a finalized manual: averages the collected
 * confidence samples, scores parse quality from a weighted blend of
 * signals, assigns an embedding tier, and rewrites every chunk with the
 * manual's final (possibly backfilled) metadata and recomputed profiles.
 */
function finalizeManual(manual: ManualAccumulator): ManualsQdrantManual & {
  chunks: ManualsQdrantChunk[]
} {
  const metadataConfidence = clamp(
    manual.metadataConfidenceTotal / manual.metadataConfidenceSamples
  )
  // More merged source records -> higher risk the manual is a duplicate
  // (saturates at 5 records).
  const duplicateRisk = clamp((manual.sourceRecordCount - 1) / 4)
  const highConfidenceChunkCount = manual.chunks.filter(
    (chunk) => chunk.embeddingTier === "high_confidence"
  ).length
  // Weighted blend: metadata 40%, structured/troubleshooting/OCR presence
  // bonuses, chunk-quality bonus (saturating at 8 chunks), minus a
  // duplicate-risk penalty.
  const parseQuality = clamp(
    metadataConfidence * 0.4 +
      (manual.hasStructured ? 0.2 : 0) +
      (manual.hasTroubleshooting ? 0.15 : 0) +
      (manual.hasOcrText ? 0.1 : 0) +
      clamp(highConfidenceChunkCount / 8) * 0.25 -
      duplicateRisk * 0.15
  )
  const embeddingTier: ManualsEmbeddingTier =
    parseQuality >= 0.72 && highConfidenceChunkCount > 0
      ? "high_confidence"
      : parseQuality >= 0.46 && manual.chunks.length > 0
        ? "fallback"
        : "exclude"
  // Manual-level profiles derived from the union of all chunk labels.
  const profiles = buildProfiles({
    labels: Array.from(new Set(manual.chunks.flatMap((chunk) => chunk.labels))),
    embeddingTier,
    overallQuality: parseQuality,
    isRisky: manual.chunks.some((chunk) => chunk.isRisky),
  })
  // Re-stamp each chunk with the manual's final metadata (which may have
  // been backfilled after the chunk was created) and recompute profiles.
  const finalizedChunks = manual.chunks.map((chunk) => {
    return {
      ...chunk,
      manufacturer: manual.manufacturer,
      manufacturerCanonical: manual.manufacturerCanonical,
      model: manual.model,
      manualType: manual.manualType,
      category: manual.category,
      manualUrl: manual.manualUrl,
      thumbnailUrl: manual.thumbnailUrl,
      profiles: buildProfiles({
        labels: chunk.labels,
        embeddingTier: chunk.embeddingTier,
        overallQuality: chunk.overallQuality,
        isRisky: chunk.isRisky,
      }),
    }
  })
  return {
    manualId: manual.manualId,
    title: manual.title,
    manufacturer: manual.manufacturer,
    manufacturerCanonical: manual.manufacturerCanonical,
    model: manual.model,
    manualType: manual.manualType,
    category: manual.category,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    sourceFilenames: Array.from(new Set(manual.sourceFilenames)).sort(),
    sourceRecordCount: manual.sourceRecordCount,
    metadataConfidence,
    parseQuality,
    duplicateRisk,
    chunkCount: finalizedChunks.length,
    highConfidenceChunkCount,
    profiles,
    embeddingTier,
    flags: Array.from(manual.flagsSet).sort(),
    chunks: finalizedChunks,
  }
}
/**
 * Computes aggregate counts for a built corpus: record/manual/chunk
 * totals, manuals per manufacturer, chunks per label and per profile,
 * and chunks per embedding tier.
 */
function buildCorpusStats(args: {
  catalogManuals: CatalogManual[]
  structuredRecords: StructuredManualRecord[]
  extractedRecords: ExtractedManualRecord[]
  manuals: Array<ManualsQdrantManual & { chunks: ManualsQdrantChunk[] }>
  chunks: ManualsQdrantChunk[]
}): ManualsQdrantCorpusStats {
  const manualsByManufacturer: Record<string, number> = {}
  const chunksByLabel: Record<string, number> = {}
  const profileCounts: Record<ManualsQdrantProfile, number> = {
    public_safe: 0,
    internal_tech: 0,
  }
  let highConfidenceChunks = 0
  let fallbackChunks = 0
  let excludedChunks = 0
  for (const manual of args.manuals) {
    const maker = manual.manufacturer
    manualsByManufacturer[maker] = (manualsByManufacturer[maker] || 0) + 1
  }
  // Single pass over chunks: tally labels, profiles, and embedding tiers.
  for (const chunk of args.chunks) {
    for (const label of chunk.labels) {
      chunksByLabel[label] = (chunksByLabel[label] || 0) + 1
    }
    for (const profile of chunk.profiles) {
      profileCounts[profile] += 1
    }
    if (chunk.embeddingTier === "high_confidence") {
      highConfidenceChunks += 1
    }
    if (chunk.embeddingTier === "fallback") {
      fallbackChunks += 1
    }
    if (chunk.embeddingTier === "exclude") {
      excludedChunks += 1
    }
  }
  return {
    catalogManuals: args.catalogManuals.length,
    structuredRecords: args.structuredRecords.length,
    extractedRecords: args.extractedRecords.length,
    normalizedManuals: args.manuals.length,
    chunkCount: args.chunks.length,
    highConfidenceChunks,
    fallbackChunks,
    excludedChunks,
    manualsByManufacturer,
    chunksByLabel,
    profileCounts,
  }
}
/**
 * Builds one chunk from cleaned text plus its manual's current metadata:
 * derives content labels, scores text/metadata/overall quality, marks
 * risky (electrical/refrigeration) content, assigns an embedding tier,
 * and computes the visibility profiles. The chunk id is a normalized
 * composite of manual id, page, section, and a text prefix.
 */
function createChunk(args: {
  manual: ManualAccumulator
  text: string
  pageNumber: number | null
  sectionTitle: string | null
  sourceFilename: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
}): ManualsQdrantChunk {
  const cleanedText = cleanText(args.text)
  const labels = deriveChunkLabels({
    text: cleanedText,
    sectionTitle: args.sectionTitle,
    sourceKind: args.sourceKind,
    manualType: args.manual.manualType,
  })
  // Running average of the manual's confidence samples collected so far.
  const metadataConfidence = clamp(
    args.manual.metadataConfidenceTotal / args.manual.metadataConfidenceSamples
  )
  const textQuality = scoreTextQuality(cleanedText, labels)
  // Overall quality weights text quality over metadata confidence.
  const overallQuality = clamp(textQuality * 0.65 + metadataConfidence * 0.35)
  const isRisky =
    labels.includes("wiring") ||
    TECH_RISK_KEYWORDS.some((keyword) =>
      normalizeText(cleanedText).includes(normalizeText(keyword))
    )
  const embeddingTier = deriveEmbeddingTier({
    labels,
    overallQuality,
    sourceKind: args.sourceKind,
    isRisky,
  })
  return {
    // First 80 chars of text are folded in so near-identical sections on
    // the same page still get distinct ids.
    chunkId: normalizeIdentifier(
      `${args.manual.manualId} ${args.pageNumber ?? "na"} ${args.sectionTitle || ""} ${cleanedText.slice(0, 80)}`
    ),
    manualId: args.manual.manualId,
    title: args.manual.title,
    manufacturer: args.manual.manufacturer,
    manufacturerCanonical: args.manual.manufacturerCanonical,
    model: args.manual.model,
    manualType: args.manual.manualType,
    category: args.manual.category,
    pageNumber: args.pageNumber,
    sectionTitle: args.sectionTitle,
    text: cleanedText,
    sourceFilename: args.sourceFilename,
    sourceKind: args.sourceKind,
    labels,
    manualUrl: args.manual.manualUrl,
    thumbnailUrl: args.manual.thumbnailUrl,
    metadataConfidence,
    textQuality,
    overallQuality,
    embeddingTier,
    profiles: buildProfiles({
      labels,
      embeddingTier,
      overallQuality,
      isRisky,
    }),
    isRisky,
    flags: buildChunkFlags(cleanedText, labels, overallQuality),
  }
}
/**
 * Scores a chunk against a tokenized query (higher is better).
 *
 * The score starts from the chunk's overall quality, adds token/substring
 * overlap with a haystack built from the chunk's metadata and text, adds
 * intent bonuses (troubleshooting/parts/service/wiring phrasing in the
 * query matching the chunk's labels), and penalizes brochure, table-of-
 * contents, and flowchart chunks.
 */
function scoreChunkForQuery(
  chunk: ManualsQdrantChunk,
  queryTokens: string[],
  queryLower: string
) {
  const haystackFields = [
    chunk.title,
    chunk.manufacturer,
    chunk.model,
    chunk.sectionTitle,
    chunk.text,
    ...chunk.labels,
  ].filter(Boolean)
  const haystack = normalizeText(haystackFields.join(" "))
  const haystackTokens = new Set(tokenize(haystack))
  // Baseline: prefer higher-quality chunks before any term matching.
  let score = chunk.overallQuality * 10
  for (const token of queryTokens) {
    if (haystackTokens.has(token)) {
      score += 3.5
    } else if (token.length >= 4 && haystack.includes(token)) {
      // Partial credit for substring hits on longer tokens.
      score += 1
    }
  }
  const hasLabel = (label: string) => chunk.labels.includes(label)
  const queryMentionsAny = (needles: string[]) =>
    needles.some((needle) => queryLower.includes(needle))
  // Intent bonuses: query phrasing aligned with the chunk's labels.
  if (
    hasLabel("troubleshooting") &&
    queryMentionsAny(["error", "not ", "wont", "won t"])
  ) {
    score += 10
  }
  if (hasLabel("parts") && queryMentionsAny(["parts", "part", "coin", "bill"])) {
    score += 7
  }
  if (hasLabel("service") && queryMentionsAny(["manual", "service"])) {
    score += 5
  }
  if (hasLabel("wiring") && queryLower.includes("wiring")) {
    score += 6
  }
  // Penalize low-value content regardless of the query.
  if (hasLabel("brochure")) {
    score -= 5
  }
  if (hasLabel("toc") || hasLabel("flowchart")) {
    score -= 8
  }
  return score
}
/**
 * Assigns content labels to a chunk based on its source kind, section
 * title, body text, and the manual's declared type. A chunk can carry
 * several labels; "general" is the fallback when nothing matches.
 * Returned sorted for deterministic output.
 */
function deriveChunkLabels(args: {
  text: string
  sectionTitle: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
  manualType: string
}): ManualsQdrantChunkLabel[] {
  const haystack = normalizeText(
    [args.sectionTitle, args.text, args.manualType].filter(Boolean).join(" ")
  )
  // True when any keyword (normalized) appears in the haystack.
  const matchesAny = (keywords: string[]) =>
    keywords.some((keyword) => haystack.includes(normalizeText(keyword)))
  const labels = new Set<ManualsQdrantChunkLabel>()
  if (args.sourceKind === "troubleshooting" || matchesAny(TROUBLESHOOTING_KEYWORDS)) {
    labels.add("troubleshooting")
    labels.add("service")
  }
  if (args.sourceKind === "parts_database" || matchesAny(PARTS_KEYWORDS)) {
    labels.add("parts")
  }
  if (args.manualType === "operator" || matchesAny(OPERATOR_KEYWORDS)) {
    labels.add("operator")
  }
  if (
    args.manualType === "service" ||
    haystack.includes("technical manual") ||
    haystack.includes("repair")
  ) {
    labels.add("service")
  }
  if (matchesAny(SPECS_KEYWORDS)) {
    labels.add("specs")
  }
  if (
    haystack.includes("table of contents") ||
    haystack.includes("list of figures") ||
    haystack.startsWith("contents")
  ) {
    labels.add("toc")
  }
  if (
    haystack.includes("flow chart") ||
    haystack.includes("flowchart") ||
    looksLikeFlowchart(args.text)
  ) {
    labels.add("flowchart")
  }
  const wiringTerms = ["wiring", "electrical", "schematic", "voltage"]
  if (wiringTerms.some((term) => haystack.includes(term))) {
    labels.add("wiring")
  }
  if (args.manualType === "brochure" || matchesAny(MARKETING_KEYWORDS)) {
    labels.add("brochure")
  }
  if (labels.size === 0) {
    labels.add("general")
  }
  return Array.from(labels).sort()
}
/**
 * Decide how a chunk participates in embedding:
 *  - "exclude": structural noise (toc/flowchart), very low quality, or
 *    unreadable brochure copy;
 *  - "fallback": usable but weak (noisy OCR, risky technical content,
 *    or merely average quality);
 *  - "high_confidence": clean, trustworthy content.
 */
function deriveEmbeddingTier(args: {
  labels: ManualsQdrantChunkLabel[]
  overallQuality: number
  sourceKind: ManualsQdrantChunk["sourceKind"]
  isRisky: boolean
}): ManualsEmbeddingTier {
  const { labels, overallQuality, sourceKind, isRisky } = args
  const structuralNoise = labels.includes("toc") || labels.includes("flowchart")
  if (overallQuality < 0.34 || structuralNoise) {
    return "exclude"
  }
  if (labels.includes("brochure")) {
    // Marketing copy is only kept as a last resort, and only if readable.
    return overallQuality >= 0.62 ? "fallback" : "exclude"
  }
  const weakOcr =
    sourceKind === "ocr_page" &&
    overallQuality < 0.58 &&
    !labels.includes("troubleshooting")
  if (weakOcr) {
    return "fallback"
  }
  if (isRisky && overallQuality < 0.7) {
    return "fallback"
  }
  return overallQuality >= 0.64 ? "high_confidence" : "fallback"
}
/**
 * Choose which retrieval profiles a chunk may serve.
 * Excluded chunks get none; internal techs see everything except
 * brochures/TOCs; the public profile additionally requires non-risky,
 * reasonably high-quality content with no flowchart or wiring labels.
 */
function buildProfiles(args: {
  labels: ManualsQdrantChunkLabel[]
  embeddingTier: ManualsEmbeddingTier
  overallQuality: number
  isRisky: boolean
}): ManualsQdrantProfile[] {
  const { labels, embeddingTier, overallQuality, isRisky } = args
  if (embeddingTier === "exclude") {
    return []
  }
  const has = (label: ManualsQdrantChunkLabel) => labels.includes(label)
  const profiles: ManualsQdrantProfile[] = []
  if (!has("brochure") && !has("toc")) {
    profiles.push("internal_tech")
  }
  const publicSafe =
    !isRisky &&
    overallQuality >= 0.56 &&
    !has("brochure") &&
    !has("flowchart") &&
    !has("toc") &&
    !has("wiring")
  if (publicSafe) {
    profiles.push("public_safe")
  }
  return profiles.sort()
}
/**
 * Derive human-readable review flags for a chunk: low quality,
 * marketing-heavy, risky technical content, and noisy OCR text.
 * Returned sorted for stable serialization.
 */
function buildChunkFlags(
  text: string,
  labels: ManualsQdrantChunkLabel[],
  overallQuality: number
) {
  const flags: string[] = []
  if (overallQuality < 0.5) {
    flags.push("low-quality")
  }
  if (labels.includes("brochure")) {
    flags.push("marketing-heavy")
  }
  if (labels.includes("wiring")) {
    flags.push("risky-technical")
  }
  if (looksLikeOcrGarbage(text)) {
    flags.push("ocr-noisy")
  }
  return flags.sort()
}
/**
 * Heuristic 0..1 quality score for extracted chunk text: a weighted
 * blend of the letter ratio, token volume, sentence punctuation, and an
 * OCR-noise bonus, nudged by label-specific adjustments and clamped.
 */
function scoreTextQuality(
  text: string,
  labels: ManualsQdrantChunkLabel[]
) {
  const letterCount = text.replace(/[^a-z]/gi, "").length
  const nonSpaceCount = text.replace(/\s+/g, "").length || 1
  const letterRatio = letterCount / nonSpaceCount
  const tokenCount = tokenize(text).length
  const capsRuns = (text.match(/\b[A-Z]{4,}\b/g) || []).length
  const sentenceMarks = (text.match(/[.!?]/g) || []).length

  // Weights: letters 35%, token volume 30%, sentences 15%, clean OCR 20%.
  let score =
    clamp(letterRatio) * 0.35 +
    clamp(tokenCount / 120) * 0.3 +
    clamp(sentenceMarks / 8) * 0.15 +
    (looksLikeOcrGarbage(text) ? 0 : 0.2)

  if (labels.includes("troubleshooting")) {
    score += 0.12 // troubleshooting text is disproportionately useful
  }
  if (labels.includes("brochure")) {
    score -= 0.1
  }
  if (capsRuns > 18) {
    score -= 0.12 // heavy ALL-CAPS usually means headings/OCR artifacts
  }
  return clamp(score)
}
/**
 * Bucket a free-form manual-type string into one of:
 * "brochure", "parts", "operator", "service", or the default "manual".
 * Checks run in priority order; the first match wins.
 */
function detectManualType(value: string) {
  const normalized = normalizeText(value)
  const mentionsAny = (needles: string[]) =>
    needles.some((needle) => normalized.includes(needle))
  if (mentionsAny(["brochure", "product notice", "warranty"])) {
    return "brochure"
  }
  if (normalized.includes("parts")) {
    return "parts"
  }
  if (mentionsAny(["operator", "user guide"])) {
    return "operator"
  }
  if (mentionsAny(["service", "repair", "technical"])) {
    return "service"
  }
  return "manual"
}
/**
 * True when the extracted record's OCR pages contain at least one word
 * in total; records with no text payload or empty pages count as zero.
 */
function hasUsefulOcrText(record: ExtractedManualRecord) {
  let totalWords = 0
  for (const page of record.text?.pages || []) {
    totalWords += page.wordCount || 0
  }
  return totalWords > 0
}
/**
 * Heuristic for OCR'd flowchart pages: key-sequence markers ("* # #"),
 * canned flowchart phrasing, or many lines made up only of caps,
 * digits, and box-drawing-ish symbols.
 *
 * Bug fix: the previous implementation collapsed ALL whitespace —
 * including newlines — into single spaces and then split the collapsed
 * string on "\n", so the "many symbol-only lines" heuristic could never
 * fire (the split always produced exactly one element). Lines are now
 * taken from the original, un-flattened text.
 */
function looksLikeFlowchart(text: string) {
  const flattened = text.replace(/\s+/g, " ").trim()
  if (
    flattened.includes("* # #") ||
    flattened.includes("press selection number")
  ) {
    return true
  }
  // Count lines consisting solely of flowchart-ish symbols/caps/digits.
  const symbolLineCount = text
    .split("\n")
    .filter((line) => /^[*#A-Z0-9 ()/-]+$/.test(line.trim())).length
  return symbolLineCount > 8
}
/**
 * Heuristic OCR-noise detector: flags text with more than six
 * non-printable/non-ASCII characters, or any run of five-plus isolated
 * capital letters (a classic OCR artifact from column-split scans).
 */
function looksLikeOcrGarbage(text: string) {
  const flattened = text.replace(/\s+/g, " ").trim()
  const nonPrintableCount = (flattened.match(/[^\x20-\x7E\n\r\t]/g) || []).length
  if (nonPrintableCount > 6) {
    return true
  }
  return /\b[A-Z](?:\s+[A-Z]){4,}\b/.test(flattened)
}
/**
 * Pull the first model-number-like token (letters + 2+ digits + suffix)
 * out of a free-form string, sanitized; null when nothing matches.
 */
function extractModel(value: string) {
  const candidates = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9-]*\b/g)
  const first = candidates?.[0]
  return first ? sanitizeModel(first) : null
}
/**
 * Normalize a raw model string into an identifier, dropping a leading
 * "unknown"/"unknown-" marker; returns null when nothing remains.
 */
function sanitizeModel(value: string) {
  const cleaned = normalizeIdentifier(value).replace(/^unknown-?/, "")
  return cleaned.length > 0 ? cleaned : null
}
/**
 * Map a free-form manufacturer string onto a canonical name from the
 * alias table (exact canonical match or alias substring hit).
 * Unmatched placeholder-ish values (empty, "unknown", leading digit)
 * collapse to "Other"; anything else is title-cased as-is.
 */
function normalizeManufacturer(value: string | null | undefined): string {
  const normalized = normalizeText(value || "")
  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    if (canonical.toLowerCase() === normalized) {
      return canonical
    }
    if (aliases.some((alias) => normalized.includes(normalizeText(alias)))) {
      return canonical
    }
  }
  const unusable =
    !normalized || isPlaceholderValue(normalized) || /^\d/.test(normalized)
  return unusable ? "Other" : toTitleCase(normalized)
}
/**
 * Readability alias for call sites that present a manufacturer name to
 * humans; delegates entirely to normalizeManufacturer.
 */
function humanizeManufacturer(value: string): string {
  const canonical = normalizeManufacturer(value)
  return canonical
}
/**
 * Turn a filename-ish title into readable text: dashes/underscores
 * become spaces, whitespace runs collapse, and edges are trimmed.
 */
function humanizeTitle(value: string) {
  const spaced = value.replace(/[-_]+/g, " ")
  return spaced.replace(/\s+/g, " ").trim()
}
/**
 * True for values that carry no real identity after normalization:
 * empty strings, the literals "unknown"/"manual", or pure digit runs.
 */
function isPlaceholderValue(value: string) {
  const normalized = normalizeText(value)
  if (!normalized) {
    return true
  }
  if (normalized === "unknown" || normalized === "manual") {
    return true
  }
  return /^\d+$/.test(normalized)
}
/**
 * Strip soft hyphens (U+00AD) and collapse all whitespace runs to
 * single spaces, trimming the result.
 */
function cleanText(value: string) {
  const withoutSoftHyphens = value.replace(/\u00ad/g, "")
  return withoutSoftHyphens.replace(/\s+/g, " ").trim()
}
/**
 * Canonical text form for matching: NFKD-fold, replace every run of
 * non-alphanumeric characters with a single space, lowercase, and trim.
 * Note: combining marks produced by NFKD are non-alphanumeric, so
 * accented letters split into two tokens (existing behavior).
 */
function normalizeText(value: string) {
  const folded = value.normalize("NFKD")
  return folded.replace(/[^a-zA-Z0-9]+/g, " ").trim().toLowerCase()
}
/**
 * Split normalized text into search tokens, dropping single characters,
 * stopwords, and pure-digit tokens.
 */
function tokenize(value: string) {
  const tokens: string[] = []
  for (const raw of normalizeText(value).split(" ")) {
    const token = raw.trim()
    if (token.length > 1 && !STOPWORDS.has(token) && !/^\d+$/.test(token)) {
      tokens.push(token)
    }
  }
  return tokens
}
function normalizeIdentifier(value: string) {
return normalizeText(stripExtension(value)).replace(/\s+/g, "-")
}
/**
 * Remove a trailing ".pdf" and then a trailing ".json" (case-insensitive,
 * in that order — so "x.json.pdf" strips both); other extensions stay.
 */
function stripExtension(value: string) {
  const withoutPdf = value.replace(/\.pdf$/i, "")
  return withoutPdf.replace(/\.json$/i, "")
}
/**
 * Composite "::"-delimited key used to match a structured record against
 * catalog manuals: id, canonical manufacturer, sanitized model (or
 * "unknown"), and detected manual type.
 */
function getStructuredRecordMatchKey(record: StructuredManualRecord) {
  const idPart = normalizeIdentifier(record.manualId || "")
  const manufacturerPart = normalizeManufacturer(record.manufacturer)
  const modelPart = sanitizeModel(record.model || "") || "unknown"
  const typePart = detectManualType(record.manualType || "")
  return `${idPart}::${manufacturerPart}::${modelPart}::${typePart}`
}
function buildCanonicalManualId(args: {
catalogManual: Manual | null
manufacturer: string
model: string | null
manualType: string
filename: string
}) {
if (args.catalogManual) {
return normalizeIdentifier(args.catalogManual.path || args.catalogManual.filename)
}
const normalizedManufacturer = normalizeManufacturer(args.manufacturer)
const hasReliableIdentity =
normalizedManufacturer !== "Other" || Boolean(args.model)
if (hasReliableIdentity) {
return normalizeIdentifier(
`${normalizedManufacturer} ${args.model || "unknown"} ${args.manualType}`
)
}
return normalizeIdentifier(`${args.filename} ${args.manualType}`)
}
/**
 * Uppercase the first letter of each space-separated word; empty
 * segments from repeated spaces are dropped.
 */
function toTitleCase(value: string) {
  const words: string[] = []
  for (const part of value.split(" ")) {
    if (part) {
      words.push(part.charAt(0).toUpperCase() + part.slice(1))
    }
  }
  return words.join(" ")
}
/** Clamp a number into the closed interval [0, 1]; NaN passes through. */
function clamp(value: number) {
  if (value < 0) {
    return 0
  }
  if (value > 1) {
    return 1
  }
  return value
}
/**
 * Read a UTF-8 file and parse it as JSON.
 *
 * Improvement: explicit `Promise<T>` return annotation (previously
 * inferred) so the body is checked against the declared contract.
 *
 * NOTE(review): the result is cast to T without runtime validation —
 * callers must trust the file's shape matches T (no schema check).
 *
 * @param path filesystem path to the JSON file
 * @returns the parsed JSON value, asserted (not validated) to be T
 * @throws any fs error from readFile, or SyntaxError on malformed JSON
 */
async function readJsonFile<T>(path: string): Promise<T> {
  const raw = await readFile(path, "utf8")
  return JSON.parse(raw) as T
}