// TypeScript source · 1734 lines · 44 KiB
import { mkdir, readFile, readdir, writeFile } from "node:fs/promises"
|
|
import { basename, join } from "node:path"
|
|
import { scanManuals } from "@/lib/manuals"
|
|
import { getManualsDataRoot } from "@/lib/manuals-paths"
|
|
import type { Manual } from "@/lib/manuals-types"
|
|
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"
|
|
|
|
// Path segments below are joined onto getManualsDataRoot() by the loaders in
// this module; they are kept as segment arrays so `join(...)` can spread them.

/** Root folder of the optimized manuals data set. */
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]

/** Directory scanned for per-manual structured `.json` records. */
const STRUCTURED_MANUALS_DIR = [...MANUALS_OPTIMIZED_ROOT, "training_data", "structured"]

/** Aggregated index file containing an array of structured records. */
const STRUCTURED_MANUALS_INDEX_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "all_structured_data.json",
]

/** File containing the array of extracted (OCR) manual records. */
const EXTRACTED_CONTENT_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "extracted_content",
  "manuals_extracted_content.json",
]
|
|
|
|
/**
 * Low-signal words: common English function words plus domain-generic terms
 * ("manual", "machine", "service").
 * NOTE(review): presumably consumed by the tokenizer defined elsewhere in this
 * file — confirm against `tokenize`.
 */
const STOPWORDS = new Set([
  // English function words
  "a", "an", "and", "are", "at", "be", "by", "for", "from", "in", "is", "it",
  // domain-generic terms
  "manual", "machine",
  "of", "on", "or",
  "service",
  "the", "to", "with",
])
|
|
|
|
/**
 * Phrases that mark a chunk as technically risky. `createChunk` checks these
 * (together with the "wiring" label) when computing a chunk's `isRisky` flag.
 */
const TECH_RISK_KEYWORDS = [
  // electrical work
  "wiring", "diagram", "schematic", "electrical", "voltage", "jumper",
  // refrigeration work
  "compressor", "refrigerant",
  // invasive repairs
  "bypass", "board level",
]
|
|
|
|
/**
 * Sales/brochure phrasing.
 * NOTE(review): presumably used by chunk labelling to detect "brochure"
 * content — the consumer is outside this view; confirm.
 */
const MARKETING_KEYWORDS = [
  "increase sales", "more profits", "contact us", "operator can double up",
  "your employees", "productivity", "variety", "brochure",
]
|
|
|
|
/**
 * Phrases typical of specification sheets (note the trailing colons on the
 * field-style entries).
 * NOTE(review): presumably used by chunk labelling for "specs" — confirm.
 */
const SPECS_KEYWORDS = [
  "dimensions", "height:", "width:", "depth:", "shipping weight",
  "electrical:", "listings:", "capacity", "voltage",
]
|
|
|
|
/**
 * Phrases typical of troubleshooting tables and fault descriptions.
 * NOTE(review): presumably used by chunk labelling for "troubleshooting" — confirm.
 */
const TROUBLESHOOTING_KEYWORDS = [
  "probable cause", "solution", "troubleshooting", "not accepting",
  "will not vend", "check fuse", "error code",
]
|
|
|
|
/**
 * Phrases typical of operator-facing guides.
 * NOTE(review): presumably used by chunk labelling for "operator" — confirm.
 */
const OPERATOR_KEYWORDS = [
  "user guide", "operators guide", "operation", "programming", "setup guide", "how to",
]
|
|
|
|
/**
 * Phrases typical of parts catalogues.
 * NOTE(review): presumably used by chunk labelling for "parts" — confirm.
 */
const PARTS_KEYWORDS = [
  "parts manual", "parts reference", "part number", "parts list", "exploded view",
]
|
|
|
|
/**
 * Display name of each known manufacturer mapped to the lower-case aliases
 * that identify it. Key order is preserved from the original declaration.
 * NOTE(review): the lookup that consumes this table is outside this view; if it
 * is first-match, entry order is significant — confirm before reordering.
 */
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  AP: ["ap", "automatic products", "automatic-products", "snackshop"],
  Other: ["other", "unknown", "bill mechs", "coin mechs"],
  Coinco: ["coinco"],
  Crane: ["crane", "national vendors", "national", "merchant", "merchant series"],
  "Dixie-Narco": ["dixie", "narco", "dixie narco", "dixie-narco", "bevmax"],
  GPL: ["gpl", "general products"],
  "MEI Mars": ["mei", "mars", "bill validator"],
  "Royal Vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  Rowe: ["rowe"],
  Seaga: ["seaga"],
  USI: ["usi", "u select it", "u-select-it", "uselectit"],
  Vendo: ["vendo", "sanden"],
}
|
|
|
|
/** Audience profile a manual or chunk may be served to. */
export type ManualsQdrantProfile = "public_safe" | "internal_tech"

/** Content label attached to a chunk; a chunk can carry several. */
export type ManualsQdrantChunkLabel =
  | "brochure" | "flowchart" | "operator" | "parts" | "service"
  | "specs" | "toc" | "troubleshooting" | "wiring" | "general"

/**
 * Embedding tier of a manual or chunk. Chunks in the "exclude" tier are
 * filtered out by `searchManualsQdrantCorpus`.
 */
export type ManualsEmbeddingTier = "high_confidence" | "fallback" | "exclude"
|
|
|
|
/** One normalized manual in the corpus, as produced by `finalizeManual`. */
export type ManualsQdrantManual = {
  // --- identity ---
  manualId: string
  title: string
  manufacturer: string
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null

  // --- links (null when the manual has no catalog match) ---
  manualUrl: string | null
  thumbnailUrl: string | null

  // --- provenance ---
  /** De-duplicated, sorted filenames of every record merged into this manual. */
  sourceFilenames: string[]
  /** Number of source records merged into this manual. */
  sourceRecordCount: number

  // --- quality scores (each passed through `clamp`) ---
  /** Mean of the metadata confidences of the merged source records. */
  metadataConfidence: number
  /** Composite score combining confidence, available sources and duplicate risk. */
  parseQuality: number
  /** Derived from `sourceRecordCount`: (count - 1) / 4. */
  duplicateRisk: number

  // --- chunk summary ---
  chunkCount: number
  highConfidenceChunkCount: number

  // --- gating ---
  profiles: ManualsQdrantProfile[]
  embeddingTier: ManualsEmbeddingTier
  /** Sorted diagnostic flags collected while merging (e.g. "catalog-match-missing"). */
  flags: string[]
}
|
|
|
|
/** One retrievable text chunk belonging to a manual. */
export type ManualsQdrantChunk = {
  // --- identity ---
  chunkId: string
  manualId: string

  // --- manual metadata (overwritten from the owning manual in `finalizeManual`) ---
  title: string
  manufacturer: string
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null

  // --- location and content ---
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceFilename: string | null
  /** Which kind of source record produced this chunk. */
  sourceKind: "ocr_page" | "parts_database" | "structured_section" | "troubleshooting"
  labels: ManualsQdrantChunkLabel[]

  // --- links ---
  manualUrl: string | null
  thumbnailUrl: string | null

  // --- quality scores ---
  metadataConfidence: number
  textQuality: number
  /** Blend of `textQuality` (65%) and `metadataConfidence` (35%), see `createChunk`. */
  overallQuality: number

  // --- gating ---
  embeddingTier: ManualsEmbeddingTier
  profiles: ManualsQdrantProfile[]
  /** True for wiring-labelled chunks or chunks matching a technical-risk keyword. */
  isRisky: boolean
  flags: string[]
}
|
|
|
|
/** Aggregate counters describing a built corpus (see `buildCorpusStats`). */
export type ManualsQdrantCorpusStats = {
  // --- input sizes ---
  catalogManuals: number
  structuredRecords: number
  extractedRecords: number

  // --- output sizes ---
  normalizedManuals: number
  chunkCount: number

  // --- chunks per embedding tier ---
  highConfidenceChunks: number
  fallbackChunks: number
  excludedChunks: number

  // --- breakdowns ---
  /** Manual count keyed by manufacturer display name. */
  manualsByManufacturer: Record<string, number>
  /** Chunk count keyed by label; a chunk with several labels counts once per label. */
  chunksByLabel: Record<string, number>
  /** Chunk count per profile; a chunk with several profiles counts once per profile. */
  profileCounts: Record<ManualsQdrantProfile, number>
}

/** The complete corpus: manuals, their chunks, and summary statistics. */
export type ManualsQdrantCorpus = {
  /** ISO-8601 timestamp taken when the corpus was assembled. */
  generatedAt: string
  stats: ManualsQdrantCorpusStats
  manuals: ManualsQdrantManual[]
  chunks: ManualsQdrantChunk[]
}
|
|
|
|
/** A single retrieval smoke-test: a query plus optional expectations. */
export type ManualsQdrantEvaluationCase = {
  id: string
  query: string
  profile: ManualsQdrantProfile
  /** When set, this manufacturer is expected among the top results. */
  expectedManufacturer?: string
  /** When set, at least one of these labels is expected among the top results. */
  expectedChunkLabels?: ManualsQdrantChunkLabel[]
  /** When set, none of these labels may appear among the top results. */
  disallowedChunkLabels?: ManualsQdrantChunkLabel[]
}

/** A chunk paired with its relevance score for a query. */
export type ManualsQdrantSearchResult = {
  chunk: ManualsQdrantChunk
  score: number
}

/** Per-case outcomes and summary counters from `evaluateManualsQdrantCorpus`. */
export type ManualsQdrantEvaluationResult = {
  cases: {
    id: string
    query: string
    profile: ManualsQdrantProfile
    /** `null` when the case declares no expected manufacturer. */
    passedTop3Manufacturer: boolean | null
    passedTop5Label: boolean
    passedDisallowedCheck: boolean
    topManufacturers: string[]
    topLabels: ManualsQdrantChunkLabel[]
  }[]
  summary: {
    totalCases: number
    top3ManufacturerPasses: number
    labelPasses: number
    disallowedPasses: number
  }
}
|
|
|
|
// Shapes of the structured JSON records read from disk. Every field is
// optional because the files are parsed without validation.

/** A titled text section of a manual. */
type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}

/** A problem/solution pair from a troubleshooting table. */
type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}

/** A single parts-list entry. */
type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}

/** A structured record describing one manual. */
type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  sourceFilename?: string
  metadata?: { pageCount?: number }
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
    /** Free-form key/value specifications; values are not type-checked. */
    specifications?: Record<string, unknown>
  }
}
|
|
|
|
// Shapes of the extracted-content (OCR) records read from disk. Every field is
// optional because the file is parsed without validation.

/** Text of one page. */
type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}

/** Parts found on one page. */
type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}

/** Extracted content for one manual file. */
type ExtractedManualRecord = {
  filename?: string
  filepath?: string
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
  partsLists?: ExtractedPartList[]
  sections?: StructuredSection[]
}
|
|
|
|
/** A catalog manual together with the derived fields used for matching. */
type CatalogManual = {
  manual: Manual
  manufacturerCanonical: string
  /** Model extracted from the manual's combined search text, if any. */
  modelGuess: string | null
  /** Normalized concatenation of filename, manufacturer, category and search terms. */
  searchText: string
  /** Tokens of the same concatenation, for whole-token matching. */
  tokenSet: Set<string>
}

/** Fields of `ManualsQdrantManual` that are only known once merging is done. */
type ManualDerivedField =
  | "metadataConfidence"
  | "parseQuality"
  | "duplicateRisk"
  | "chunkCount"
  | "highConfidenceChunkCount"
  | "profiles"
  | "embeddingTier"
  | "flags"

/**
 * Mutable working state for a manual while source records are merged into it;
 * `finalizeManual` turns it into a `ManualsQdrantManual`.
 */
type ManualAccumulator = Omit<ManualsQdrantManual, ManualDerivedField> & {
  /** Running sum and count used to average the metadata confidence. */
  metadataConfidenceTotal: number
  metadataConfidenceSamples: number
  hasStructured: boolean
  hasTroubleshooting: boolean
  hasOcrText: boolean
  chunks: ManualsQdrantChunk[]
  flagsSet: Set<string>
}
|
|
|
|
/**
 * Built-in retrieval smoke tests. Used as the default case list of
 * `evaluateManualsQdrantCorpus` and written out as `evaluation-cases.json`.
 */
const DEFAULT_EVAL_CASES: ManualsQdrantEvaluationCase[] = [
  // Manufacturer alias ("RVV") should resolve to Royal Vendors.
  {
    id: "rvv-660-service",
    query: "RVV 660 service manual",
    profile: "internal_tech",
    expectedManufacturer: "Royal Vendors",
  },
  // Partial brand name plus product line.
  {
    id: "narco-bevmax-cooling",
    query: "Narco bevmax not cooling",
    profile: "public_safe",
    expectedManufacturer: "Dixie-Narco",
    expectedChunkLabels: ["service", "troubleshooting"],
  },
  // Symptom query with no manufacturer; brochures must not surface.
  {
    id: "coin-mech-dollars",
    query: "coin mech not accepting dollars",
    profile: "public_safe",
    expectedChunkLabels: ["troubleshooting", "parts"],
    disallowedChunkLabels: ["brochure"],
  },
  {
    id: "royal-coins",
    query: "Royal machine not accepting coins",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    expectedChunkLabels: ["troubleshooting"],
  },
  // Wiring content must not be returned for the public profile.
  {
    id: "wiring-risky",
    query: "Royal wiring diagram voltage issue",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    disallowedChunkLabels: ["wiring"],
  },
  // Vague query: no positive expectation, only the brochure exclusion.
  {
    id: "ambiguous-bad-query",
    query: "manual for strange mystery vendor",
    profile: "public_safe",
    disallowedChunkLabels: ["brochure"],
  },
]
|
|
|
|
/**
 * Returns the built-in evaluation cases.
 *
 * The result is a fresh copy (including the label arrays) so callers can
 * filter, extend or edit it without altering the module-level defaults that
 * `evaluateManualsQdrantCorpus` falls back to.
 *
 * @returns A new array of evaluation cases equal to the defaults.
 */
export function getDefaultManualsQdrantEvaluationCases(): ManualsQdrantEvaluationCase[] {
  return DEFAULT_EVAL_CASES.map((evaluationCase) => {
    const copy = { ...evaluationCase }
    // Only copy label arrays that exist so no `undefined` keys are introduced.
    if (copy.expectedChunkLabels) {
      copy.expectedChunkLabels = [...copy.expectedChunkLabels]
    }
    if (copy.disallowedChunkLabels) {
      copy.disallowedChunkLabels = [...copy.disallowedChunkLabels]
    }
    return copy
  })
}
|
|
|
|
/** In-flight or completed corpus build shared by all callers; `null` when nothing is cached. */
let manualsQdrantCorpusPromise: Promise<ManualsQdrantCorpus> | null = null

/**
 * Returns the memoized corpus, starting a build on first use.
 *
 * Concurrent callers share the same in-flight promise. If the build rejects,
 * the cache entry is cleared so that the next call retries instead of
 * returning the same rejected promise indefinitely; the caller that triggered
 * the failed build still receives the rejection.
 *
 * @returns A promise for the corpus.
 */
export function getManualsQdrantCorpus(): Promise<ManualsQdrantCorpus> {
  if (manualsQdrantCorpusPromise) {
    return manualsQdrantCorpusPromise
  }

  const pending = buildManualsQdrantCorpus()
  manualsQdrantCorpusPromise = pending

  // Registered before the promise is handed out, so the cache is already
  // cleared by the time a caller's own rejection handler runs.
  pending.catch(() => {
    // Only clear our own entry; a reset + rebuild may have replaced it.
    if (manualsQdrantCorpusPromise === pending) {
      manualsQdrantCorpusPromise = null
    }
  })

  return pending
}

/** Drops the memoized corpus so the next `getManualsQdrantCorpus` call rebuilds it. */
export function resetManualsQdrantCorpusCache() {
  manualsQdrantCorpusPromise = null
}
|
|
|
|
/**
 * Builds the manuals corpus from the catalog, the structured records and the
 * extracted (OCR) records.
 *
 * Steps:
 * 1. Load the three sources (concurrently — they are independent reads).
 * 2. Merge every structured record into a per-manual accumulator and add its
 *    structured chunks.
 * 3. Merge every extracted record; its chunks are only added when the manual
 *    has no structured data.
 * 4. Finalize manuals, collect and sort chunks, and compute statistics.
 *
 * The returned `manuals` contain exactly the `ManualsQdrantManual` fields; the
 * per-manual chunk arrays produced by `finalizeManual` are exposed only through
 * the flat `chunks` list, so chunks are not duplicated inside each manual.
 *
 * @returns The assembled corpus with manuals sorted by `manualId` and chunks
 *   sorted by `chunkId`.
 */
export async function buildManualsQdrantCorpus(): Promise<ManualsQdrantCorpus> {
  const [catalogManuals, structuredRecords, extractedRecords] = await Promise.all([
    loadCatalogManuals(),
    loadStructuredRecords(),
    loadExtractedRecords(),
  ])

  const manuals = new Map<string, ManualAccumulator>()
  // Shared across all manuals so identical chunks are collapsed corpus-wide.
  const chunkDedup = new Set<string>()

  for (const record of structuredRecords) {
    const catalogMatch = matchCatalogManual(
      [record.manualId, record.manufacturer, record.model]
        .filter(Boolean)
        .join(" "),
      catalogManuals,
      {
        manufacturerHint: record.manufacturer || null,
        modelHint: record.model || null,
      }
    )
    const filenameHint =
      catalogMatch?.manual.filename || record.sourceFilename || `${record.manualId || "manual"}.pdf`
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename: filenameHint,
      recordManufacturer: record.manufacturer || null,
      recordModel: record.model || null,
      manualTypeHint: detectManualType(
        `${record.manualType || ""} ${record.manualId || ""}`
      ),
      categoryHint: catalogMatch?.manual.category || null,
      // Structured records are trusted more when they map onto a catalog entry.
      metadataConfidence: catalogMatch ? 0.86 : 0.32,
      sourceRecordId: record.sourceFilename || record.manualId || filenameHint,
    })

    manual.hasStructured = true
    manual.hasTroubleshooting ||= Boolean(record.content?.troubleshooting?.length)

    for (const chunk of buildStructuredChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }

  for (const record of extractedRecords) {
    const filename = record.filename || basename(record.filepath || "manual.pdf")
    const catalogMatch = matchCatalogManual(filename, catalogManuals)
    const isExactFilenameMatch =
      catalogMatch !== null &&
      normalizeIdentifier(catalogMatch.manual.filename) === normalizeIdentifier(filename)
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename,
      recordManufacturer: null,
      recordModel: null,
      manualTypeHint: detectManualType(filename),
      categoryHint: catalogMatch?.manual.category || null,
      // Exact filename match > fuzzy catalog match > no catalog match.
      metadataConfidence: isExactFilenameMatch ? 0.96 : catalogMatch ? 0.78 : 0.36,
      sourceRecordId: record.filename || record.filepath || "unknown-extracted",
    })

    manual.hasOcrText ||= hasUsefulOcrText(record)

    // Prefer structured/manual chunks where they exist and use OCR pages only
    // as a fallback corpus for manuals we could not parse structurally.
    if (manual.hasStructured) {
      continue
    }

    for (const chunk of buildExtractedChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }

  const finalizedManuals = Array.from(manuals.values())
    .map(finalizeManual)
    .sort((left, right) => left.manualId.localeCompare(right.manualId))
  const finalizedChunks = finalizedManuals
    .flatMap((manual) => manual.chunks)
    .sort((left, right) => left.chunkId.localeCompare(right.chunkId))

  const stats = buildCorpusStats({
    catalogManuals,
    structuredRecords,
    extractedRecords,
    manuals: finalizedManuals,
    chunks: finalizedChunks,
  })

  // Strip the per-manual chunk arrays: they are not part of ManualsQdrantManual
  // and would otherwise duplicate every chunk in memory and in manuals.json.
  const corpusManuals: ManualsQdrantManual[] = finalizedManuals.map(
    ({ chunks: _chunks, ...manual }) => manual
  )

  return {
    generatedAt: new Date().toISOString(),
    stats,
    manuals: corpusManuals,
    chunks: finalizedChunks,
  }
}
|
|
|
|
/**
 * Ranks the corpus chunks against a free-text query.
 *
 * A chunk is eligible when its tier is not "exclude", it is visible to the
 * requested profile and its text is not blank. Eligible chunks are scored with
 * `scoreChunkForQuery`; only positive scores are kept.
 *
 * @param corpus - Corpus to search.
 * @param query - Free-text query.
 * @param options - `limit` (default 5) caps the number of results; `profile`
 *   (default "internal_tech") selects the audience.
 * @returns Matches ordered by descending score, at most `limit` of them.
 */
export function searchManualsQdrantCorpus(
  corpus: ManualsQdrantCorpus,
  query: string,
  options?: {
    limit?: number
    profile?: ManualsQdrantProfile
  }
): ManualsQdrantSearchResult[] {
  const maxResults = options?.limit ?? 5
  const audience = options?.profile ?? "internal_tech"

  const normalizedQuery = normalizeText(query)
  const queryTokens = tokenize(normalizedQuery)
  const queryLower = normalizedQuery.toLowerCase()

  const matches: ManualsQdrantSearchResult[] = []
  for (const chunk of corpus.chunks) {
    if (chunk.embeddingTier === "exclude") {
      continue
    }
    if (!chunk.profiles.includes(audience)) {
      continue
    }
    if (chunk.text.trim().length === 0) {
      continue
    }

    const score = scoreChunkForQuery(chunk, queryTokens, queryLower)
    if (score > 0) {
      matches.push({ chunk, score })
    }
  }

  // Array.prototype.sort is stable, so ties keep corpus order.
  matches.sort((left, right) => right.score - left.score)
  return matches.slice(0, maxResults)
}
|
|
|
|
/**
 * Runs retrieval smoke tests against a corpus.
 *
 * For each case the top five results for its query and profile are fetched and
 * three checks are evaluated:
 * - `passedTop3Manufacturer`: the expected manufacturer owns at least one of
 *   the three highest-ranked results (`null` when the case expects none).
 * - `passedTop5Label`: at least one expected label occurs in the top five
 *   (`true` when the case expects none).
 * - `passedDisallowedCheck`: no disallowed label occurs in the top five
 *   (`true` when the case disallows none).
 *
 * @param corpus - Corpus to evaluate.
 * @param cases - Cases to run; defaults to the built-in list.
 * @returns Per-case outcomes plus summary counts. Cases without a manufacturer
 *   expectation count towards `top3ManufacturerPasses`.
 */
export function evaluateManualsQdrantCorpus(
  corpus: ManualsQdrantCorpus,
  cases: ManualsQdrantEvaluationCase[] = DEFAULT_EVAL_CASES
): ManualsQdrantEvaluationResult {
  const results = cases.map((evaluationCase) => {
    const { expectedManufacturer, expectedChunkLabels, disallowedChunkLabels } =
      evaluationCase
    const topResults = searchManualsQdrantCorpus(corpus, evaluationCase.query, {
      limit: 5,
      profile: evaluationCase.profile,
    })

    // Distinct manufacturers / labels over all five results, in rank order.
    const topManufacturers = Array.from(
      new Set(topResults.map((result) => result.chunk.manufacturer))
    )
    const topLabels = Array.from(
      new Set(topResults.flatMap((result) => result.chunk.labels))
    )

    // Slice the ranked results, not the de-duplicated manufacturer list:
    // otherwise a manufacturer first seen at rank 4 or 5 would count as
    // "top 3" whenever fewer than three other manufacturers precede it.
    const top3Manufacturers = new Set(
      topResults.slice(0, 3).map((result) => result.chunk.manufacturer)
    )

    return {
      id: evaluationCase.id,
      query: evaluationCase.query,
      profile: evaluationCase.profile,
      passedTop3Manufacturer: expectedManufacturer
        ? top3Manufacturers.has(expectedManufacturer)
        : null,
      passedTop5Label: expectedChunkLabels
        ? expectedChunkLabels.some((label) => topLabels.includes(label))
        : true,
      passedDisallowedCheck: disallowedChunkLabels
        ? !topLabels.some((label) => disallowedChunkLabels.includes(label))
        : true,
      topManufacturers,
      topLabels,
    }
  })

  return {
    cases: results,
    summary: {
      totalCases: results.length,
      // `null` (no expectation) is deliberately counted as a pass.
      top3ManufacturerPasses: results.filter(
        (result) => result.passedTop3Manufacturer !== false
      ).length,
      labelPasses: results.filter((result) => result.passedTop5Label).length,
      disallowedPasses: results.filter(
        (result) => result.passedDisallowedCheck
      ).length,
    },
  }
}
|
|
|
|
/**
 * Builds a fresh corpus, evaluates it with the default cases and writes the
 * results as pretty-printed JSON files.
 *
 * Files written (in this order): `summary.json`, `manuals.json`,
 * `chunks.json`, `chunks-internal-tech.json`, `chunks-public-safe.json`,
 * `chunks-high-confidence.json`, `evaluation-cases.json`,
 * `evaluation-report.json`.
 *
 * @param args - `outputDir` overrides the default `<cwd>/output/manuals-qdrant`.
 * @returns The output directory together with the corpus and its evaluation.
 */
export async function writeManualsQdrantArtifacts(args?: {
  outputDir?: string
}) {
  const outputDir = args?.outputDir || join(process.cwd(), "output", "manuals-qdrant")
  const corpus = await buildManualsQdrantCorpus()
  const evaluation = evaluateManualsQdrantCorpus(corpus)

  const chunksForProfile = (profile: ManualsQdrantProfile) =>
    corpus.chunks.filter((chunk) => chunk.profiles.includes(profile))

  // [filename, payload] pairs, listed in write order.
  const artifacts: Array<[string, unknown]> = [
    [
      "summary.json",
      {
        generatedAt: corpus.generatedAt,
        stats: corpus.stats,
        evaluation: evaluation.summary,
      },
    ],
    ["manuals.json", corpus.manuals],
    ["chunks.json", corpus.chunks],
    ["chunks-internal-tech.json", chunksForProfile("internal_tech")],
    ["chunks-public-safe.json", chunksForProfile("public_safe")],
    [
      "chunks-high-confidence.json",
      corpus.chunks.filter((chunk) => chunk.embeddingTier === "high_confidence"),
    ],
    ["evaluation-cases.json", DEFAULT_EVAL_CASES],
    ["evaluation-report.json", evaluation],
  ]

  await mkdir(outputDir, { recursive: true })

  // Written one at a time so only one serialized payload is alive at once.
  for (const [filename, payload] of artifacts) {
    await writeFile(join(outputDir, filename), JSON.stringify(payload, null, 2))
  }

  return { outputDir, corpus, evaluation }
}
|
|
|
|
/**
 * Scans the manuals catalog and decorates each manual with the derived fields
 * used for matching.
 *
 * The matching text of a manual is its filename, manufacturer, category,
 * search terms and common names joined by spaces (empty values skipped).
 *
 * @returns One `CatalogManual` per scanned manual, in scan order.
 */
async function loadCatalogManuals() {
  const scannedManuals = await scanManuals()

  return scannedManuals.map((manual) => {
    const descriptorParts = [
      manual.filename,
      manual.manufacturer,
      manual.category,
      ...(manual.searchTerms || []),
      ...(manual.commonNames || []),
    ]
    const descriptor = descriptorParts.filter(Boolean).join(" ")

    return {
      manual,
      manufacturerCanonical: normalizeManufacturer(manual.manufacturer),
      modelGuess: extractModel(descriptor),
      searchText: normalizeText(descriptor),
      tokenSet: new Set(tokenize(descriptor)),
    } satisfies CatalogManual
  })
}
|
|
|
|
/**
 * Loads the structured manual records.
 *
 * Every `.json` file in the structured directory is read and tagged with its
 * filename (and a `manualId` derived from the filename when the record has
 * none). The aggregated index file is then read:
 * - if the index is empty, the per-file records are returned;
 * - otherwise the index records are returned, each paired with the filename of
 *   a per-file record that has the same match key. Filenames are handed out
 *   once each, in directory order; an index record left without one gets a
 *   synthetic `<normalized id>.json` name.
 *
 * @returns Structured records that all carry a `sourceFilename`.
 */
async function loadStructuredRecords() {
  const structuredDir = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  const dirEntries = await readdir(structuredDir, { withFileTypes: true })

  const jsonFilenames: string[] = []
  for (const entry of dirEntries) {
    if (entry.isFile() && entry.name.toLowerCase().endsWith(".json")) {
      jsonFilenames.push(entry.name)
    }
  }

  const fileRecords = await Promise.all(
    jsonFilenames.map(async (filename) => {
      const parsed = await readJsonFile<StructuredManualRecord>(
        join(structuredDir, filename)
      )
      return {
        ...parsed,
        sourceFilename: filename,
        manualId: parsed.manualId || stripExtension(filename),
      }
    })
  )

  const indexRecords = await readJsonFile<StructuredManualRecord[]>(
    join(getManualsDataRoot(), ...STRUCTURED_MANUALS_INDEX_FILE)
  )
  if (indexRecords.length === 0) {
    return fileRecords
  }

  // Match key -> filenames not yet claimed by an index record.
  const unclaimedFilenames = new Map<string, string[]>()
  for (const fileRecord of fileRecords) {
    const matchKey = getStructuredRecordMatchKey(fileRecord)
    const filename =
      fileRecord.sourceFilename || `${fileRecord.manualId || "structured"}.json`
    const bucket = unclaimedFilenames.get(matchKey)
    if (bucket) {
      bucket.push(filename)
    } else {
      unclaimedFilenames.set(matchKey, [filename])
    }
  }

  return indexRecords.map((record, index) => {
    const claimed = unclaimedFilenames
      .get(getStructuredRecordMatchKey(record))
      ?.shift()

    return {
      ...record,
      sourceFilename:
        claimed ||
        `${normalizeIdentifier(record.manualId || `structured-record-${index + 1}`)}.json`,
    }
  })
}
|
|
|
|
/**
 * Reads the extracted-content file under the manuals data root.
 *
 * @returns The parsed array of extracted manual records.
 */
async function loadExtractedRecords() {
  const extractedContentPath = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  const extractedRecords = await readJsonFile<ExtractedManualRecord[]>(extractedContentPath)
  return extractedRecords
}
|
|
|
|
/**
 * Finds the catalog manual that best matches a free-text query.
 *
 * An exact match on the normalized filename stem wins outright. Otherwise every
 * catalog manual is scored:
 * - manufacturer hint: +16 when it equals the canonical manufacturer, -4 otherwise;
 * - model hint: +14 when it equals the guessed model, else +8 when the search
 *   text contains it (hyphens read as spaces);
 * - +20 when the search text contains the whole normalized query;
 * - per query token: +4 for a whole-token hit, else +1.5 for a substring hit
 *   (tokens of four or more characters only).
 *
 * @param rawQuery - Filename or descriptive text to match.
 * @param catalogManuals - Candidates to search.
 * @param hints - Optional manufacturer and model hints.
 * @returns The best candidate if it scores at least 10 (the earliest candidate
 *   wins ties), otherwise `null`.
 */
function matchCatalogManual(
  rawQuery: string,
  catalogManuals: CatalogManual[],
  hints?: {
    manufacturerHint?: string | null
    modelHint?: string | null
  }
) {
  const normalizedQuery = normalizeText(rawQuery)
  const tokens = tokenize(normalizedQuery)
  const manufacturerHint = hints?.manufacturerHint
    ? normalizeManufacturer(hints.manufacturerHint)
    : null
  const modelHint = hints?.modelHint ? normalizeIdentifier(hints.modelHint) : null

  // Query-side values are loop-invariant; compute them once rather than once
  // per catalog entry.
  const queryStem = normalizeIdentifier(stripExtension(rawQuery))
  const exactStemMatch = catalogManuals.find(
    (catalogManual) =>
      normalizeIdentifier(stripExtension(catalogManual.manual.filename)) === queryStem
  )
  if (exactStemMatch) {
    return exactStemMatch
  }

  const queryLower = normalizedQuery.toLowerCase()
  const modelHintWords = modelHint ? modelHint.replace(/-/g, " ") : null

  // Single pass keeping the first maximum — the same winner a stable
  // descending sort would put first.
  let bestMatch: CatalogManual | null = null
  let bestScore = Number.NEGATIVE_INFINITY

  for (const catalogManual of catalogManuals) {
    let score = 0

    if (manufacturerHint) {
      score += catalogManual.manufacturerCanonical === manufacturerHint ? 16 : -4
    }

    if (modelHint && modelHintWords !== null) {
      if (catalogManual.modelGuess === modelHint) {
        score += 14
      } else if (catalogManual.searchText.includes(modelHintWords)) {
        score += 8
      }
    }

    if (normalizedQuery && catalogManual.searchText.includes(queryLower)) {
      score += 20
    }

    for (const token of tokens) {
      if (catalogManual.tokenSet.has(token)) {
        score += 4
      } else if (token.length >= 4 && catalogManual.searchText.includes(token)) {
        score += 1.5
      }
    }

    if (score > bestScore) {
      bestMatch = catalogManual
      bestScore = score
    }
  }

  return bestMatch && bestScore >= 10 ? bestMatch : null
}
|
|
|
|
/**
 * Returns the accumulator for the manual a source record belongs to, creating
 * it on first sight and merging the record's metadata into it otherwise.
 *
 * The manual is identified by the canonical id built from the catalog match,
 * manufacturer, model, manual type and filename.
 *
 * On merge: the filename (and the record id, when it names a `.json` file) is
 * added to `sourceFilenames`, the record count and confidence samples grow,
 * missing category/model are filled in, an "Other" manufacturer is replaced by
 * the record's manufacturer, and a merge flag is added.
 *
 * @param args - Registry, catalog match and metadata hints for the record.
 * @returns The new or updated accumulator, which is stored in `args.manuals`.
 */
function getOrCreateManualAccumulator(args: {
  manuals: Map<string, ManualAccumulator>
  catalogMatch: CatalogManual | null
  filename: string
  recordManufacturer: string | null
  recordModel: string | null
  manualTypeHint: string
  categoryHint: string | null
  metadataConfidence: number
  sourceRecordId: string
}) {
  const catalogEntry = args.catalogMatch?.manual
  const manufacturer = humanizeManufacturer(
    catalogEntry?.manufacturer || args.recordManufacturer || "Other"
  )

  // A record model is only trusted when it is present and not a placeholder.
  const trustedRecordModel =
    args.recordModel && !isPlaceholderValue(args.recordModel) ? args.recordModel : null
  const model = trustedRecordModel
    ? sanitizeModel(trustedRecordModel)
    : args.catalogMatch?.modelGuess || extractModel(args.filename)

  const manualType = args.manualTypeHint || detectManualType(args.filename)
  const manualId = buildCanonicalManualId({
    catalogManual: catalogEntry || null,
    manufacturer,
    model,
    manualType,
    filename: args.filename,
  })

  // The record id is tracked as a source filename only when it names a JSON file.
  const jsonRecordIds = args.sourceRecordId.toLowerCase().endsWith(".json")
    ? [args.sourceRecordId]
    : []
  const withThisSource = (knownFilenames: string[]) =>
    Array.from(new Set([...knownFilenames, args.filename, ...jsonRecordIds]))

  const known = args.manuals.get(manualId)

  if (!known) {
    const created: ManualAccumulator = {
      manualId,
      title: humanizeTitle(stripExtension(catalogEntry?.filename || args.filename)),
      manufacturer,
      manufacturerCanonical: normalizeManufacturer(manufacturer),
      model: model || null,
      manualType,
      category: args.categoryHint,
      manualUrl: catalogEntry ? getManualUrl(catalogEntry) : null,
      thumbnailUrl: catalogEntry ? getThumbnailUrl(catalogEntry) : null,
      sourceFilenames: withThisSource([]),
      sourceRecordCount: 1,
      metadataConfidenceTotal: args.metadataConfidence,
      metadataConfidenceSamples: 1,
      flagsSet: new Set(args.catalogMatch ? [] : ["catalog-match-missing"]),
      hasStructured: false,
      hasTroubleshooting: false,
      hasOcrText: false,
      chunks: [],
    }

    if (!args.catalogMatch && isPlaceholderValue(args.recordManufacturer || "")) {
      created.flagsSet.add("metadata-manufacturer-placeholder")
    }

    args.manuals.set(manualId, created)
    return created
  }

  // Merge this record into the manual we already know about.
  known.sourceFilenames = withThisSource(known.sourceFilenames)
  known.sourceRecordCount += 1
  known.metadataConfidenceTotal += args.metadataConfidence
  known.metadataConfidenceSamples += 1

  if (args.categoryHint && !known.category) {
    known.category = args.categoryHint
  }
  if (args.recordModel && !known.model && !isPlaceholderValue(args.recordModel)) {
    known.model = sanitizeModel(args.recordModel)
  }
  if (args.recordManufacturer && known.manufacturer === "Other") {
    known.manufacturer = humanizeManufacturer(args.recordManufacturer)
    known.manufacturerCanonical = normalizeManufacturer(known.manufacturer)
  }

  const mergeFlag =
    args.sourceRecordId === manualId ? "merged-duplicate-source" : "merged-source"
  known.flagsSet.add(mergeFlag)

  return known
}
|
|
|
|
/**
 * Converts one structured record into chunks for its manual:
 * - one chunk per non-empty section;
 * - one chunk per troubleshooting item with a problem and/or solution;
 * - one "Parts reference" chunk per page holding up to 12 parts of that page
 *   (parts without a page number are grouped together with a null page);
 * - one "Specifications" chunk listing the specification key/value pairs.
 *
 * Parts with a description but no part number are rendered as
 * `Part: <description>`. Specification values that are objects are serialized
 * as JSON, arrays are comma-joined, and null/undefined values are skipped, so
 * no "[object Object]" or "undefined" text reaches the chunk.
 *
 * @param record - Structured record to convert.
 * @param manual - Accumulator of the manual the chunks belong to.
 * @returns The chunks, in the order listed above.
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []
  const sourceFilename = record.sourceFilename || manual.sourceFilenames[0] || null

  // Renders a specification value as readable text; "" means "skip this entry".
  const formatSpecValue = (value: unknown): string => {
    if (value === null || value === undefined) {
      return ""
    }
    if (Array.isArray(value)) {
      return value.map(formatSpecValue).filter(Boolean).join(", ")
    }
    if (typeof value === "object") {
      return JSON.stringify(value)
    }
    return String(value)
  }

  for (const section of record.content?.sections || []) {
    const text = cleanText(section.text || "")
    if (!text) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: section.pageNumber ?? null,
        sectionTitle: cleanText(section.title || "") || null,
        sourceFilename,
        sourceKind: "structured_section",
      })
    )
  }

  for (const item of record.content?.troubleshooting || []) {
    const problem = cleanText(item.problem || "")
    const solution = cleanText(item.solution || "")
    const text = cleanText(
      [
        problem ? `Problem: ${problem}` : "",
        solution ? `Likely cause or solution: ${solution}` : "",
      ]
        .filter(Boolean)
        .join("\n")
    )
    if (!text) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: item.pageNumber ?? null,
        sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting",
        sourceFilename,
        sourceKind: "troubleshooting",
      })
    )
  }

  // Page 0 is the bucket for parts without a page number.
  const partsByPage = new Map<number, string[]>()
  for (const item of record.content?.partsDatabase || []) {
    const partNumber = cleanText(item.partNumber || "")
    const description = cleanText(item.description || "")
    // Skip entries with neither a usable part number nor a usable description.
    if (partNumber.length < 2 && description.length < 4) {
      continue
    }
    const label = partNumber ? `Part ${partNumber}` : "Part"
    const pageNumber = item.pageNumber ?? 0
    const parts = partsByPage.get(pageNumber) || []
    parts.push(description ? `${label}: ${description}` : label)
    partsByPage.set(pageNumber, parts)
  }

  for (const [pageNumber, parts] of partsByPage.entries()) {
    chunks.push(
      createChunk({
        manual,
        // Only the first 12 parts of a page go into its chunk.
        text: parts.slice(0, 12).join("\n"),
        pageNumber: pageNumber || null,
        sectionTitle: "Parts reference",
        sourceFilename,
        sourceKind: "parts_database",
      })
    )
  }

  if (record.content?.specifications) {
    const specLines: string[] = []
    for (const [key, value] of Object.entries(record.content.specifications)) {
      const formatted = formatSpecValue(value)
      if (formatted) {
        specLines.push(`${humanizeTitle(key)}: ${formatted}`)
      }
    }
    const specsText = cleanText(specLines.join("\n"))
    if (specsText) {
      chunks.push(
        createChunk({
          manual,
          text: specsText,
          pageNumber: null,
          sectionTitle: "Specifications",
          sourceFilename,
          sourceKind: "structured_section",
        })
      )
    }
  }

  return chunks
}
|
|
|
|
/**
 * Converts one extracted (OCR) record into chunks for its manual:
 * - one "ocr_page" chunk per page that has text;
 * - one "Parts reference" chunk per parts list, holding up to 12 parts.
 *
 * A page is skipped when its cleaned text is empty or when the extractor
 * explicitly reported a word count of zero. Pages that carry no `wordCount`
 * at all are kept: the field is optional, and a missing count says nothing
 * about the page being empty.
 *
 * Parts with a description but no part number are rendered as
 * `Part: <description>`.
 *
 * @param record - Extracted record to convert.
 * @param manual - Accumulator of the manual the chunks belong to.
 * @returns Page chunks followed by parts chunks.
 */
function buildExtractedChunks(
  record: ExtractedManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []
  const sourceFilename = record.filename || null

  for (const page of record.text?.pages || []) {
    const text = cleanText(page.text || "")
    if (!text || page.wordCount === 0) {
      continue
    }

    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: page.pageNumber ?? null,
        sectionTitle: page.pageNumber ? `Page ${page.pageNumber}` : "OCR page",
        sourceFilename,
        sourceKind: "ocr_page",
      })
    )
  }

  for (const list of record.partsLists || []) {
    const parts = (list.parts || [])
      .map((part) => {
        const partNumber = cleanText(part.partNumber || "")
        const description = cleanText(part.description || "")
        if (!partNumber && !description) {
          return ""
        }
        const label = partNumber ? `Part ${partNumber}` : "Part"
        return description ? `${label}: ${description}` : label
      })
      .filter(Boolean)
      .slice(0, 12)

    if (parts.length === 0) {
      continue
    }

    chunks.push(
      createChunk({
        manual,
        text: parts.join("\n"),
        pageNumber: list.pageNumber ?? null,
        sectionTitle: "Parts reference",
        sourceFilename,
        sourceKind: "parts_database",
      })
    )
  }

  return chunks
}
|
|
|
|
/**
 * Appends a chunk to its manual unless an equivalent chunk was already added.
 *
 * Two chunks are equivalent when they share the manual id, the page number
 * ("na" when absent), the normalized section title and the normalized first
 * 180 characters of text. A rejected duplicate leaves the
 * "duplicate-chunk-collapsed" flag on the manual.
 *
 * @param manual - Accumulator that receives the chunk.
 * @param chunk - Candidate chunk.
 * @param chunkDedup - Keys of the chunks seen so far; updated in place.
 */
function addChunkToManual(
  manual: ManualAccumulator,
  chunk: ManualsQdrantChunk,
  chunkDedup: Set<string>
) {
  const pageKey = chunk.pageNumber ?? "na"
  const titleKey = normalizeIdentifier(chunk.sectionTitle || "")
  const textKey = normalizeIdentifier(chunk.text.slice(0, 180))
  const dedupKey = `${chunk.manualId}::${pageKey}::${titleKey}::${textKey}`

  if (!chunkDedup.has(dedupKey)) {
    chunkDedup.add(dedupKey)
    manual.chunks.push(chunk)
    return
  }

  manual.flagsSet.add("duplicate-chunk-collapsed")
}
|
|
|
|
/**
 * Turns a manual accumulator into its final record and stamps the manual's
 * metadata onto its chunks.
 *
 * Scores (each clamped):
 * - `metadataConfidence`: mean confidence of the merged source records;
 * - `duplicateRisk`: (sourceRecordCount - 1) / 4;
 * - `parseQuality`: 0.4 x confidence, +0.2 for structured data, +0.15 for
 *   troubleshooting data, +0.1 for OCR text, up to +0.25 for high-confidence
 *   chunks (saturating at 8), minus 0.15 x duplicate risk.
 *
 * Tier: "high_confidence" needs parseQuality >= 0.72 and a high-confidence
 * chunk; "fallback" needs parseQuality >= 0.46 and any chunk; else "exclude".
 *
 * @param manual - Accumulator to finalize.
 * @returns The manual record together with its finalized chunks.
 */
function finalizeManual(manual: ManualAccumulator): ManualsQdrantManual & {
  chunks: ManualsQdrantChunk[]
} {
  const metadataConfidence = clamp(
    manual.metadataConfidenceTotal / manual.metadataConfidenceSamples
  )
  const duplicateRisk = clamp((manual.sourceRecordCount - 1) / 4)

  let highConfidenceChunkCount = 0
  for (const chunk of manual.chunks) {
    if (chunk.embeddingTier === "high_confidence") {
      highConfidenceChunkCount += 1
    }
  }

  // Terms are summed in this exact order to keep the floating-point result stable.
  const structuredBonus = manual.hasStructured ? 0.2 : 0
  const troubleshootingBonus = manual.hasTroubleshooting ? 0.15 : 0
  const ocrBonus = manual.hasOcrText ? 0.1 : 0
  const chunkBonus = clamp(highConfidenceChunkCount / 8) * 0.25
  const duplicatePenalty = duplicateRisk * 0.15
  const parseQuality = clamp(
    metadataConfidence * 0.4 +
      structuredBonus +
      troubleshootingBonus +
      ocrBonus +
      chunkBonus -
      duplicatePenalty
  )

  let embeddingTier: ManualsEmbeddingTier = "exclude"
  if (parseQuality >= 0.72 && highConfidenceChunkCount > 0) {
    embeddingTier = "high_confidence"
  } else if (parseQuality >= 0.46 && manual.chunks.length > 0) {
    embeddingTier = "fallback"
  }

  // Manual-level profiles consider the union of chunk labels and any risky chunk.
  const profiles = buildProfiles({
    labels: Array.from(new Set(manual.chunks.flatMap((chunk) => chunk.labels))),
    embeddingTier,
    overallQuality: parseQuality,
    isRisky: manual.chunks.some((chunk) => chunk.isRisky),
  })

  // Metadata may have changed through later merges, so chunks get the final values.
  const sharedMetadata = {
    manufacturer: manual.manufacturer,
    manufacturerCanonical: manual.manufacturerCanonical,
    model: manual.model,
    manualType: manual.manualType,
    category: manual.category,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
  }

  const finalizedChunks = manual.chunks.map((chunk) => ({
    ...chunk,
    ...sharedMetadata,
    // Chunk-level profiles are derived from the chunk's own tier and quality.
    profiles: buildProfiles({
      labels: chunk.labels,
      embeddingTier: chunk.embeddingTier,
      overallQuality: chunk.overallQuality,
      isRisky: chunk.isRisky,
    }),
  }))

  return {
    manualId: manual.manualId,
    title: manual.title,
    ...sharedMetadata,
    sourceFilenames: Array.from(new Set(manual.sourceFilenames)).sort(),
    sourceRecordCount: manual.sourceRecordCount,
    metadataConfidence,
    parseQuality,
    duplicateRisk,
    chunkCount: finalizedChunks.length,
    highConfidenceChunkCount,
    profiles,
    embeddingTier,
    flags: Array.from(manual.flagsSet).sort(),
    chunks: finalizedChunks,
  }
}
|
|
|
|
/**
 * Computes summary statistics for a corpus.
 *
 * @param args - Source collections (only their lengths are used) plus the
 *   finalized manuals and chunks.
 * @returns Input/output sizes, chunk counts per embedding tier, manuals per
 *   manufacturer, chunks per label and chunks per profile. A chunk is counted
 *   once for each label and each profile it carries.
 */
function buildCorpusStats(args: {
  catalogManuals: CatalogManual[]
  structuredRecords: StructuredManualRecord[]
  extractedRecords: ExtractedManualRecord[]
  manuals: Array<ManualsQdrantManual & { chunks: ManualsQdrantChunk[] }>
  chunks: ManualsQdrantChunk[]
}): ManualsQdrantCorpusStats {
  const manualsByManufacturer: Record<string, number> = {}
  for (const { manufacturer } of args.manuals) {
    manualsByManufacturer[manufacturer] = (manualsByManufacturer[manufacturer] || 0) + 1
  }

  const chunksByLabel: Record<string, number> = {}
  const profileCounts: Record<ManualsQdrantProfile, number> = {
    public_safe: 0,
    internal_tech: 0,
  }
  let highConfidenceChunks = 0
  let fallbackChunks = 0
  let excludedChunks = 0

  // One pass over the chunks collects every per-chunk counter.
  for (const chunk of args.chunks) {
    chunk.labels.forEach((label) => {
      chunksByLabel[label] = (chunksByLabel[label] || 0) + 1
    })
    chunk.profiles.forEach((profile) => {
      profileCounts[profile] += 1
    })

    switch (chunk.embeddingTier) {
      case "high_confidence":
        highConfidenceChunks += 1
        break
      case "fallback":
        fallbackChunks += 1
        break
      case "exclude":
        excludedChunks += 1
        break
    }
  }

  return {
    catalogManuals: args.catalogManuals.length,
    structuredRecords: args.structuredRecords.length,
    extractedRecords: args.extractedRecords.length,
    normalizedManuals: args.manuals.length,
    chunkCount: args.chunks.length,
    highConfidenceChunks,
    fallbackChunks,
    excludedChunks,
    manualsByManufacturer,
    chunksByLabel,
    profileCounts,
  }
}
|
|
|
|
/**
 * Builds a scored, labeled chunk record from one text fragment of a manual.
 *
 * The text is cleaned, labeled, and scored; the resulting quality, risk flag
 * and labels then drive the embedding tier, the profiles and the flags.
 *
 * @param args.manual - Accumulator supplying the manual-level metadata copied
 *   onto the chunk and the running metadata-confidence totals.
 * @param args.text - Raw fragment text; it is cleaned before use.
 * @param args.pageNumber - Source page, or `null`; part of the chunk id.
 * @param args.sectionTitle - Section heading, or `null`; part of the chunk id.
 * @param args.sourceFilename - File the fragment came from, or `null`.
 * @param args.sourceKind - Origin of the fragment; affects labels and tier.
 * @returns The populated chunk.
 */
function createChunk(args: {
  manual: ManualAccumulator
  text: string
  pageNumber: number | null
  sectionTitle: string | null
  sourceFilename: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
}): ManualsQdrantChunk {
  const { manual } = args
  const cleanedText = cleanText(args.text)
  const labels = deriveChunkLabels({
    text: cleanedText,
    sectionTitle: args.sectionTitle,
    sourceKind: args.sourceKind,
    manualType: manual.manualType,
  })

  // With zero samples the average would be NaN (0/0) or Infinity, and NaN
  // fails every numeric threshold comparison downstream. Treat "no samples"
  // as zero confidence instead.
  const metadataConfidence =
    manual.metadataConfidenceSamples > 0
      ? clamp(manual.metadataConfidenceTotal / manual.metadataConfidenceSamples)
      : 0
  const textQuality = scoreTextQuality(cleanedText, labels)
  // Text quality weighs 65%, metadata confidence 35%.
  const overallQuality = clamp(textQuality * 0.65 + metadataConfidence * 0.35)

  // Normalize the chunk text once rather than once per keyword.
  const normalizedChunkText = normalizeText(cleanedText)
  const isRisky =
    labels.includes("wiring") ||
    TECH_RISK_KEYWORDS.some((keyword) =>
      normalizedChunkText.includes(normalizeText(keyword))
    )

  const embeddingTier = deriveEmbeddingTier({
    labels,
    overallQuality,
    sourceKind: args.sourceKind,
    isRisky,
  })

  return {
    // The id combines manual id, page, section title and the first 80
    // characters of the cleaned text.
    chunkId: normalizeIdentifier(
      `${manual.manualId} ${args.pageNumber ?? "na"} ${args.sectionTitle || ""} ${cleanedText.slice(0, 80)}`
    ),
    manualId: manual.manualId,
    title: manual.title,
    manufacturer: manual.manufacturer,
    manufacturerCanonical: manual.manufacturerCanonical,
    model: manual.model,
    manualType: manual.manualType,
    category: manual.category,
    pageNumber: args.pageNumber,
    sectionTitle: args.sectionTitle,
    text: cleanedText,
    sourceFilename: args.sourceFilename,
    sourceKind: args.sourceKind,
    labels,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    metadataConfidence,
    textQuality,
    overallQuality,
    embeddingTier,
    profiles: buildProfiles({
      labels,
      embeddingTier,
      overallQuality,
      isRisky,
    }),
    isRisky,
    flags: buildChunkFlags(cleanedText, labels, overallQuality),
  }
}
|
|
|
|
/**
 * Computes a relevance score for a chunk against a search query.
 *
 * The score starts from the chunk's overall quality (scaled by 10), gains
 * points for query tokens found in the chunk, gains intent bonuses when the
 * query wording lines up with a chunk label, and loses points for brochure,
 * table-of-contents and flowchart chunks.
 *
 * @param chunk - Chunk to score.
 * @param queryTokens - Query tokens to look up in the chunk.
 * @param queryLower - Query string used for the substring intent checks.
 * @returns The numeric score; higher means more relevant.
 */
function scoreChunkForQuery(
  chunk: ManualsQdrantChunk,
  queryTokens: string[],
  queryLower: string
) {
  const searchableParts = [
    chunk.title,
    chunk.manufacturer,
    chunk.model,
    chunk.sectionTitle,
    chunk.text,
    ...chunk.labels,
  ]
  const chunkText = normalizeText(searchableParts.filter(Boolean).join(" "))
  const chunkTokens = new Set(tokenize(chunkText))

  const queryMentions = (...needles: string[]) =>
    needles.some((needle) => queryLower.includes(needle))
  const hasLabel = (label: ManualsQdrantChunk["labels"][number]) =>
    chunk.labels.includes(label)

  let score = chunk.overallQuality * 10

  for (const token of queryTokens) {
    if (chunkTokens.has(token)) {
      // Whole-token hit.
      score += 3.5
      continue
    }

    // Partial credit for a longer token that appears as a substring.
    if (token.length >= 4 && chunkText.includes(token)) {
      score += 1
    }
  }

  if (queryMentions("error", "not ", "wont", "won t") && hasLabel("troubleshooting")) {
    score += 10
  }

  if (queryMentions("parts", "part", "coin", "bill") && hasLabel("parts")) {
    score += 7
  }

  if (queryMentions("manual", "service") && hasLabel("service")) {
    score += 5
  }

  if (queryMentions("wiring") && hasLabel("wiring")) {
    score += 6
  }

  if (hasLabel("brochure")) {
    score -= 5
  }

  if (hasLabel("toc") || hasLabel("flowchart")) {
    score -= 8
  }

  return score
}
|
|
|
|
/**
 * Assigns content labels to a chunk from its text, section title, source kind
 * and the type of the manual it belongs to.
 *
 * Matching is done on a normalized haystack built from the section title, the
 * text and the manual type. A chunk that matches nothing is labeled
 * `"general"`.
 *
 * @param args.text - Chunk text.
 * @param args.sectionTitle - Section heading, or `null`.
 * @param args.sourceKind - Origin of the chunk; some kinds imply a label.
 * @param args.manualType - Manual type; some types imply a label.
 * @returns The matching labels, sorted.
 */
function deriveChunkLabels(args: {
  text: string
  sectionTitle: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
  manualType: string
}): ManualsQdrantChunkLabel[] {
  const { text, sectionTitle, sourceKind, manualType } = args
  const haystack = normalizeText(
    [sectionTitle, text, manualType].filter(Boolean).join(" ")
  )

  const mentions = (phrase: string) => haystack.includes(phrase)
  const mentionsAny = (keywords: readonly string[]) =>
    keywords.some((keyword) => haystack.includes(normalizeText(keyword)))

  const isTroubleshooting =
    sourceKind === "troubleshooting" || mentionsAny(TROUBLESHOOTING_KEYWORDS)
  const isServiceMaterial =
    manualType === "service" || mentions("technical manual") || mentions("repair")

  const candidates: Array<[ManualsQdrantChunkLabel, boolean]> = [
    ["troubleshooting", isTroubleshooting],
    // Troubleshooting content always counts as service content too.
    ["service", isTroubleshooting || isServiceMaterial],
    ["parts", sourceKind === "parts_database" || mentionsAny(PARTS_KEYWORDS)],
    ["operator", manualType === "operator" || mentionsAny(OPERATOR_KEYWORDS)],
    ["specs", mentionsAny(SPECS_KEYWORDS)],
    [
      "toc",
      mentions("table of contents") ||
        mentions("list of figures") ||
        haystack.startsWith("contents"),
    ],
    [
      "flowchart",
      mentions("flow chart") || mentions("flowchart") || looksLikeFlowchart(text),
    ],
    [
      "wiring",
      mentions("wiring") ||
        mentions("electrical") ||
        mentions("schematic") ||
        mentions("voltage"),
    ],
    ["brochure", manualType === "brochure" || mentionsAny(MARKETING_KEYWORDS)],
  ]

  const labels = candidates
    .filter(([, applies]) => applies)
    .map(([label]) => label)

  if (labels.length === 0) {
    labels.push("general")
  }

  return labels.sort()
}
|
|
|
|
/**
 * Chooses the embedding tier for a chunk.
 *
 * Rules are applied in order:
 * 1. Quality below 0.34, or a `toc` / `flowchart` label, gives `"exclude"`.
 * 2. Brochure chunks are `"fallback"` at quality 0.62 or above, else `"exclude"`.
 * 3. OCR pages below 0.58 that are not troubleshooting give `"fallback"`.
 * 4. Risky chunks below 0.7 give `"fallback"`.
 * 5. Otherwise `"high_confidence"` at 0.64 or above, else `"fallback"`.
 *
 * @param args.labels - Labels assigned to the chunk.
 * @param args.overallQuality - Combined quality score.
 * @param args.sourceKind - Origin of the chunk.
 * @param args.isRisky - Whether the chunk was marked risky.
 * @returns The selected tier.
 */
function deriveEmbeddingTier(args: {
  labels: ManualsQdrantChunkLabel[]
  overallQuality: number
  sourceKind: ManualsQdrantChunk["sourceKind"]
  isRisky: boolean
}): ManualsEmbeddingTier {
  const { labels, overallQuality, sourceKind, isRisky } = args

  const isNavigational = labels.includes("toc") || labels.includes("flowchart")
  if (overallQuality < 0.34 || isNavigational) {
    return "exclude"
  }

  if (labels.includes("brochure")) {
    if (overallQuality >= 0.62) {
      return "fallback"
    }
    return "exclude"
  }

  const isWeakOcrPage =
    sourceKind === "ocr_page" &&
    overallQuality < 0.58 &&
    !labels.includes("troubleshooting")
  const isWeakRiskyChunk = isRisky && overallQuality < 0.7

  if (isWeakOcrPage || isWeakRiskyChunk) {
    return "fallback"
  }

  return overallQuality >= 0.64 ? "high_confidence" : "fallback"
}
|
|
|
|
/**
 * Decides which profiles a chunk or manual belongs to.
 *
 * Excluded content gets no profile. `internal_tech` is granted unless the
 * content is labeled `brochure` or `toc`. `public_safe` requires non-risky
 * content with quality of at least 0.56 and none of the `brochure`,
 * `flowchart`, `toc` or `wiring` labels.
 *
 * @param args.labels - Labels assigned to the content.
 * @param args.embeddingTier - Tier; `"exclude"` yields an empty result.
 * @param args.overallQuality - Combined quality score.
 * @param args.isRisky - Whether the content was marked risky.
 * @returns The granted profiles in sorted order.
 */
function buildProfiles(args: {
  labels: ManualsQdrantChunkLabel[]
  embeddingTier: ManualsEmbeddingTier
  overallQuality: number
  isRisky: boolean
}): ManualsQdrantProfile[] {
  const { labels, embeddingTier, overallQuality, isRisky } = args

  if (embeddingTier === "exclude") {
    return []
  }

  const isBrochure = labels.includes("brochure")
  const isToc = labels.includes("toc")
  const granted: ManualsQdrantProfile[] = []

  if (!(isBrochure || isToc)) {
    granted.push("internal_tech")
  }

  const blockedFromPublic =
    isRisky ||
    isBrochure ||
    isToc ||
    labels.includes("flowchart") ||
    labels.includes("wiring")

  if (!blockedFromPublic && overallQuality >= 0.56) {
    granted.push("public_safe")
  }

  return granted.sort()
}
|
|
|
|
/**
 * Collects the diagnostic flags that apply to a chunk.
 *
 * @param text - Chunk text, checked for OCR noise.
 * @param labels - Labels assigned to the chunk.
 * @param overallQuality - Combined quality score; below 0.5 is flagged.
 * @returns The raised flags, sorted.
 */
function buildChunkFlags(
  text: string,
  labels: ManualsQdrantChunkLabel[],
  overallQuality: number
) {
  const checks: Array<[string, boolean]> = [
    ["low-quality", overallQuality < 0.5],
    ["marketing-heavy", labels.includes("brochure")],
    ["risky-technical", labels.includes("wiring")],
    ["ocr-noisy", looksLikeOcrGarbage(text)],
  ]

  return checks
    .filter(([, raised]) => raised)
    .map(([flag]) => flag)
    .sort()
}
|
|
|
|
/**
 * Scores how usable a piece of text is, on a 0..1 scale.
 *
 * The base score is a weighted sum of the share of alphabetic characters
 * (0.35), the token count relative to 120 (0.3), the sentence punctuation
 * count relative to 8 (0.15), and a 0.2 bonus when the text does not look like
 * OCR garbage. Troubleshooting chunks gain 0.12, brochure chunks lose 0.1, and
 * text with more than 18 all-caps words loses 0.12.
 *
 * @param text - Text to score.
 * @param labels - Labels assigned to the chunk.
 * @returns The clamped quality score.
 */
function scoreTextQuality(
  text: string,
  labels: ManualsQdrantChunkLabel[]
) {
  const countMatches = (pattern: RegExp) => (text.match(pattern) || []).length

  const letterCount = countMatches(/[a-z]/gi)
  // Fall back to 1 so all-whitespace text does not divide by zero.
  const visibleCharCount = countMatches(/\S/g) || 1
  const alphaRatio = letterCount / visibleCharCount
  const tokenCount = tokenize(text).length
  const uppercaseBursts = countMatches(/\b[A-Z]{4,}\b/g)
  const sentenceLike = countMatches(/[.!?]/g)

  const alphaPart = clamp(alphaRatio) * 0.35
  const lengthPart = clamp(tokenCount / 120) * 0.3
  const sentencePart = clamp(sentenceLike / 8) * 0.15
  const cleanlinessPart = looksLikeOcrGarbage(text) ? 0 : 0.2

  let score = alphaPart + lengthPart + sentencePart + cleanlinessPart

  if (labels.includes("troubleshooting")) {
    score += 0.12
  }

  if (labels.includes("brochure")) {
    score -= 0.1
  }

  if (uppercaseBursts > 18) {
    score -= 0.12
  }

  return clamp(score)
}
|
|
|
|
/**
 * Classifies a manual from a descriptive string by keyword.
 *
 * Checks run in a fixed order (brochure, parts, operator, service), so the
 * first matching category wins. Anything else is a generic `"manual"`.
 *
 * @param value - Text to classify; it is normalized before matching.
 * @returns The detected manual type.
 */
function detectManualType(value: string) {
  const normalized = normalizeText(value)
  const mentionsAny = (...needles: string[]) =>
    needles.some((needle) => normalized.includes(needle))

  if (mentionsAny("brochure", "product notice", "warranty")) {
    return "brochure"
  }

  if (mentionsAny("parts")) {
    return "parts"
  }

  if (mentionsAny("operator", "user guide")) {
    return "operator"
  }

  if (mentionsAny("service", "repair", "technical")) {
    return "service"
  }

  return "manual"
}
|
|
|
|
/**
 * Tells whether an extracted record carries any OCR words.
 *
 * @param record - Extracted record; missing text, pages or word counts are
 *   treated as zero.
 * @returns `true` when the summed page word counts are greater than zero.
 */
function hasUsefulOcrText(record: ExtractedManualRecord) {
  let totalWords = 0

  for (const page of record.text?.pages || []) {
    totalWords += page.wordCount || 0
  }

  return totalWords > 0
}
|
|
|
|
/**
 * Heuristically detects text that came from a flowchart or keypad diagram.
 *
 * The text is treated as a flowchart when it contains one of two marker
 * phrases, or when more than eight of its lines consist solely of capitals,
 * digits, spaces and the symbols `* # ( ) / -`.
 *
 * @param text - Text to inspect, with its original line breaks if available.
 * @returns `true` when the text looks like a flowchart.
 */
function looksLikeFlowchart(text: string) {
  // Marker phrases are matched on whitespace-collapsed text so that line
  // wrapping inside a phrase does not hide it.
  const collapsed = text.replace(/\s+/g, " ").trim()
  if (
    collapsed.includes("* # #") ||
    collapsed.includes("press selection number")
  ) {
    return true
  }

  // The line heuristic must read the original text: collapsing whitespace
  // removes every newline, which previously left a single "line" and made
  // this check impossible to satisfy.
  const symbolicLine = /^[*#A-Z0-9 ()/-]+$/
  const symbolicLineCount = text
    .split(/\r\n|\r|\n/)
    .filter((line) => symbolicLine.test(line.trim())).length

  return symbolicLineCount > 8
}
|
|
|
|
/**
 * Heuristically detects OCR noise.
 *
 * Text is considered noisy when, after whitespace is collapsed, it contains a
 * run of five or more whitespace-separated single capital letters, or more
 * than six characters outside the printable ASCII range.
 *
 * @param text - Text to inspect.
 * @returns `true` when the text looks like OCR garbage.
 */
function looksLikeOcrGarbage(text: string) {
  const collapsed = text.replace(/\s+/g, " ").trim()

  const spacedCapitals = collapsed.match(/\b[A-Z](?:\s+[A-Z]){4,}\b/g)
  if (spacedCapitals !== null && spacedCapitals.length > 0) {
    return true
  }

  const oddCharacters = collapsed.match(/[^\x20-\x7E\n\r\t]/g)
  return oddCharacters !== null && oddCharacters.length > 6
}
|
|
|
|
/**
 * Pulls the first model-number-like token out of a string.
 *
 * A candidate is a word made of optional letters, at least two digits, and any
 * trailing letters or digits, searched in the normalized form of the input.
 *
 * @param value - Text that may contain a model number.
 * @returns The sanitized model, or `null` when no candidate is found.
 */
function extractModel(value: string) {
  const [firstCandidate] =
    normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9-]*\b/g) || []

  if (!firstCandidate) {
    return null
  }

  return sanitizeModel(firstCandidate)
}
|
|
|
|
/**
 * Converts a model string to identifier form and drops a leading `unknown`
 * placeholder (with its optional trailing hyphen).
 *
 * @param value - Raw model text.
 * @returns The cleaned model identifier, or `null` when nothing is left.
 */
function sanitizeModel(value: string) {
  const withoutPlaceholder = normalizeIdentifier(value).replace(/^unknown-?/, "")
  return withoutPlaceholder === "" ? null : withoutPlaceholder
}
|
|
|
|
/**
 * Maps a free-form manufacturer string to a canonical display name.
 *
 * The input is normalized and compared against each entry of
 * `MANUFACTURER_ALIASES`: an entry matches when the input equals its canonical
 * name or contains one of its aliases. Unmatched input becomes `"Other"` when
 * it is empty, a placeholder, or starts with a digit; otherwise it is returned
 * in title case.
 *
 * @param value - Raw manufacturer text; `null` and `undefined` count as empty.
 * @returns The canonical name, `"Other"`, or the title-cased input.
 */
function normalizeManufacturer(value: string | null | undefined): string {
  const normalized = normalizeText(value || "")

  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    // Normalize the canonical name the same way as the input; a plain
    // lower-casing keeps punctuation (e.g. a hyphen) that the input has
    // already lost, so such names could never match themselves.
    const matchesCanonical = normalizeText(canonical) === normalized

    const matchesAlias = aliases.some((alias) => {
      const normalizedAlias = normalizeText(alias)
      // An alias that normalizes to "" would be "contained" in every input.
      return normalizedAlias !== "" && normalized.includes(normalizedAlias)
    })

    if (matchesCanonical || matchesAlias) {
      return canonical
    }
  }

  if (!normalized || isPlaceholderValue(normalized) || /^\d/.test(normalized)) {
    return "Other"
  }

  return toTitleCase(normalized)
}
|
|
|
|
/**
 * Returns the display form of a manufacturer name by delegating to
 * `normalizeManufacturer`.
 *
 * @param value - Raw manufacturer text.
 * @returns The name produced by `normalizeManufacturer`.
 */
function humanizeManufacturer(value: string): string {
  const displayName = normalizeManufacturer(value)
  return displayName
}
|
|
|
|
/**
 * Makes a slug-like title readable: runs of hyphens and underscores become a
 * space, whitespace runs collapse to one space, and the ends are trimmed.
 *
 * @param value - Raw title.
 * @returns The readable title.
 */
function humanizeTitle(value: string) {
  const separatorsAsSpaces = value.replace(/[-_]+/g, " ")
  const singleSpaced = separatorsAsSpaces.replace(/\s+/g, " ")
  return singleSpaced.trim()
}
|
|
|
|
/**
 * Tells whether a value carries no real information: after normalization it is
 * empty, `unknown`, `manual`, or made up only of digits.
 *
 * @param value - Value to test.
 * @returns `true` for placeholder values.
 */
function isPlaceholderValue(value: string) {
  const normalized = normalizeText(value)

  if (normalized === "") {
    return true
  }

  if (normalized === "unknown" || normalized === "manual") {
    return true
  }

  return /^\d+$/.test(normalized)
}
|
|
|
|
/**
 * Tidies raw text: removes soft hyphens (U+00AD), collapses every whitespace
 * run to a single space, and trims both ends.
 *
 * @param value - Raw text.
 * @returns The cleaned, single-line text.
 */
function cleanText(value: string) {
  const withoutSoftHyphens = value.replace(/\u00ad/g, "")
  const singleSpaced = withoutSoftHyphens.replace(/\s+/g, " ")
  return singleSpaced.trim()
}
|
|
|
|
/**
 * Reduces text to lowercase ASCII letters and digits separated by single
 * spaces, for case- and punctuation-insensitive matching.
 *
 * Accented letters are folded to their base letter: NFKD decomposition splits
 * them into a base letter plus combining marks, and the marks are removed
 * before other characters are turned into separators.
 *
 * @param value - Text to normalize.
 * @returns The normalized text with no leading or trailing space.
 */
function normalizeText(value: string) {
  return (
    value
      .normalize("NFKD")
      // Drop combining diacritical marks left behind by the decomposition.
      // Replacing them with a space, like other non-alphanumerics, would
      // split a word in the middle ("Résumé" would become "re sume").
      .replace(/[\u0300-\u036f]/g, "")
      .replace(/[^a-zA-Z0-9]+/g, " ")
      .toLowerCase()
      .trim()
  )
}
|
|
|
|
/**
 * Splits text into search tokens.
 *
 * The text is normalized and split on spaces; tokens of one character or less,
 * stopwords, and purely numeric tokens are dropped.
 *
 * @param value - Text to tokenize.
 * @returns The remaining tokens in their original order.
 */
function tokenize(value: string) {
  const tokens: string[] = []

  for (const piece of normalizeText(value).split(" ")) {
    const token = piece.trim()

    if (token.length <= 1) continue
    if (STOPWORDS.has(token)) continue
    if (/^\d+$/.test(token)) continue

    tokens.push(token)
  }

  return tokens
}
|
|
|
|
/**
 * Turns a name or filename into a hyphen-separated identifier: the extension
 * is stripped, the rest is normalized, and whitespace runs become hyphens.
 *
 * @param value - Name or filename.
 * @returns The identifier.
 */
function normalizeIdentifier(value: string) {
  const withoutExtension = stripExtension(value)
  const words = normalizeText(withoutExtension)
  return words.replace(/\s+/g, "-")
}
|
|
|
|
/**
 * Removes a trailing `.pdf` and then a trailing `.json` (case-insensitive).
 *
 * The two removals run in that order, so `x.json.pdf` loses both suffixes
 * while `x.pdf.json` loses only `.json`.
 *
 * @param value - Filename or path.
 * @returns The value without those extensions.
 */
function stripExtension(value: string) {
  const withoutPdf = value.replace(/\.pdf$/i, "")
  return withoutPdf.replace(/\.json$/i, "")
}
|
|
|
|
/**
 * Builds a key for a structured record from four parts joined by `::`: the
 * normalized manual id, the canonical manufacturer, the sanitized model (or
 * `unknown`), and the detected manual type.
 *
 * @param record - Structured record to key.
 * @returns The composite key.
 */
function getStructuredRecordMatchKey(record: StructuredManualRecord) {
  const idPart = normalizeIdentifier(record.manualId || "")
  const manufacturerPart = normalizeManufacturer(record.manufacturer)
  const modelPart = sanitizeModel(record.model || "") || "unknown"
  const typePart = detectManualType(record.manualType || "")

  return `${idPart}::${manufacturerPart}::${modelPart}::${typePart}`
}
|
|
|
|
/**
 * Derives the canonical id of a manual.
 *
 * A catalog match wins: its path (or filename) becomes the id. Otherwise the
 * id is built from manufacturer, model and type when the manufacturer is known
 * or a model is present. With neither, it is built from the filename and type.
 *
 * @param args.catalogManual - Matching catalog entry, or `null`.
 * @param args.manufacturer - Manufacturer text; canonicalized before use.
 * @param args.model - Model, or `null`.
 * @param args.manualType - Manual type appended to the id.
 * @param args.filename - Source filename used as the last resort.
 * @returns The canonical manual id.
 */
function buildCanonicalManualId(args: {
  catalogManual: Manual | null
  manufacturer: string
  model: string | null
  manualType: string
  filename: string
}) {
  const { catalogManual, model, manualType, filename } = args

  if (catalogManual) {
    return normalizeIdentifier(catalogManual.path || catalogManual.filename)
  }

  const manufacturerName = normalizeManufacturer(args.manufacturer)
  const lacksReliableIdentity = manufacturerName === "Other" && !model

  if (lacksReliableIdentity) {
    return normalizeIdentifier(`${filename} ${manualType}`)
  }

  return normalizeIdentifier(
    `${manufacturerName} ${model || "unknown"} ${manualType}`
  )
}
|
|
|
|
/**
 * Upper-cases the first character of every space-separated word and joins the
 * words with single spaces; empty segments from repeated spaces are dropped.
 *
 * @param value - Text to convert.
 * @returns The title-cased text.
 */
function toTitleCase(value: string) {
  const capitalized: string[] = []

  for (const word of value.split(" ")) {
    if (!word) continue
    capitalized.push(word.charAt(0).toUpperCase() + word.slice(1))
  }

  return capitalized.join(" ")
}
|
|
|
|
/**
 * Restricts a number to the inclusive range 0..1.
 *
 * @param value - Number to clamp.
 * @returns `value` limited to [0, 1]; `NaN` is mapped to 0.
 */
function clamp(value: number) {
  // Math.min/Math.max propagate NaN, which would escape the 0..1 range and
  // then fail every threshold comparison made on the result.
  if (Number.isNaN(value)) {
    return 0
  }

  return Math.max(0, Math.min(1, value))
}
|
|
|
|
/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * The parsed value is cast to `T` without any runtime validation.
 *
 * @param path - Path of the JSON file.
 * @returns The parsed content, typed as `T`.
 */
async function readJsonFile<T>(path: string) {
  const raw = await readFile(path, "utf8")
  return JSON.parse(raw) as T
}
|