// Source: Rocky_Mountain_Vending/lib/manuals-qdrant-corpus.ts
// (1734 lines, 44 KiB, TypeScript)

import { mkdir, readFile, readdir, writeFile } from "node:fs/promises"
import { basename, join } from "node:path"
import { scanManuals } from "@/lib/manuals"
import { getManualsDataRoot } from "@/lib/manuals-paths"
import type { Manual } from "@/lib/manuals-types"
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"
// Location of the optimized manuals dataset, expressed as path segments
// relative to the manuals data root (joined with node:path at load time).
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]
// Directory holding one structured JSON record per parsed manual.
const STRUCTURED_MANUALS_DIR = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "structured",
]
// Aggregated index of every structured record in a single JSON file.
const STRUCTURED_MANUALS_INDEX_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "all_structured_data.json",
]
// Raw OCR/extraction dump for all manuals (page text, parts lists, sections).
const EXTRACTED_CONTENT_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "extracted_content",
  "manuals_extracted_content.json",
]
// Tokens dropped during tokenization/matching. Besides common English
// function words this also removes domain words ("manual", "machine",
// "service") that appear in nearly every document and therefore carry no
// discriminating signal for retrieval or catalog matching.
const STOPWORDS = new Set([
  "a",
  "an",
  "and",
  "are",
  "at",
  "be",
  "by",
  "for",
  "from",
  "in",
  "is",
  "it",
  "manual",
  "machine",
  "of",
  "on",
  "or",
  "service",
  "the",
  "to",
  "with",
])
// Terms whose presence marks a chunk as technically risky (electrical /
// refrigeration detail); such chunks are flagged `isRisky` and kept out of
// the public-facing profile.
const TECH_RISK_KEYWORDS = [
  "wiring",
  "diagram",
  "schematic",
  "electrical",
  "voltage",
  "jumper",
  "compressor",
  "refrigerant",
  "bypass",
  "board level",
]
// Sales-brochure language; used to label (and down-rank) marketing content.
const MARKETING_KEYWORDS = [
  "increase sales",
  "more profits",
  "contact us",
  "operator can double up",
  "your employees",
  "productivity",
  "variety",
  "brochure",
]
// Phrases typical of specification tables (dimensions, ratings, capacity).
const SPECS_KEYWORDS = [
  "dimensions",
  "height:",
  "width:",
  "depth:",
  "shipping weight",
  "electrical:",
  "listings:",
  "capacity",
  "voltage",
]
// Phrases typical of troubleshooting / fault-diagnosis sections.
const TROUBLESHOOTING_KEYWORDS = [
  "probable cause",
  "solution",
  "troubleshooting",
  "not accepting",
  "will not vend",
  "check fuse",
  "error code",
]
// Phrases typical of operator / end-user documentation.
const OPERATOR_KEYWORDS = [
  "user guide",
  "operators guide",
  "operation",
  "programming",
  "setup guide",
  "how to",
]
// Phrases typical of parts catalogs and exploded-view references.
const PARTS_KEYWORDS = [
  "parts manual",
  "parts reference",
  "part number",
  "parts list",
  "exploded view",
]
// Canonical manufacturer name -> lowercase aliases/spellings observed in
// filenames and record metadata (brand lines like "bevmax" included).
// Presumably consumed by normalizeManufacturer (defined later in this
// file) to fold variants onto one canonical key — confirm against that
// helper's implementation.
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  "AP": [
    "ap",
    "automatic products",
    "automatic-products",
    "snackshop",
  ],
  "Other": ["other", "unknown", "bill mechs", "coin mechs"],
  "Coinco": ["coinco"],
  "Crane": [
    "crane",
    "national vendors",
    "national",
    "merchant",
    "merchant series",
  ],
  "Dixie-Narco": ["dixie", "narco", "dixie narco", "dixie-narco", "bevmax"],
  "GPL": ["gpl", "general products"],
  "MEI Mars": ["mei", "mars", "bill validator"],
  "Royal Vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  "Rowe": ["rowe"],
  "Seaga": ["seaga"],
  "USI": ["usi", "u select it", "u-select-it", "uselectit"],
  "Vendo": ["vendo", "sanden"],
}
/**
 * Visibility profile for retrieval: `public_safe` excludes risky technical
 * content; `internal_tech` is the unrestricted technician view.
 */
export type ManualsQdrantProfile = "public_safe" | "internal_tech"
/** Content label assigned to a chunk (a chunk may carry several). */
export type ManualsQdrantChunkLabel =
  | "brochure"
  | "flowchart"
  | "operator"
  | "parts"
  | "service"
  | "specs"
  | "toc"
  | "troubleshooting"
  | "wiring"
  | "general"
/** Embedding eligibility: embed, embed-as-fallback, or skip entirely. */
export type ManualsEmbeddingTier =
  | "high_confidence"
  | "fallback"
  | "exclude"
/** One normalized manual, aggregated from catalog + structured + OCR sources. */
export type ManualsQdrantManual = {
  manualId: string
  title: string
  manufacturer: string
  // Normalized manufacturer key used for matching (see MANUFACTURER_ALIASES).
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null
  manualUrl: string | null
  thumbnailUrl: string | null
  // All source files (catalog PDFs and/or JSON records) merged into this manual.
  sourceFilenames: string[]
  sourceRecordCount: number
  // 0..1 scores computed during finalization.
  metadataConfidence: number
  parseQuality: number
  duplicateRisk: number
  chunkCount: number
  highConfidenceChunkCount: number
  profiles: ManualsQdrantProfile[]
  embeddingTier: ManualsEmbeddingTier
  // Diagnostic flags accumulated during the build (sorted, deduplicated).
  flags: string[]
}
/** One retrievable text chunk, denormalized with its manual's metadata. */
export type ManualsQdrantChunk = {
  chunkId: string
  manualId: string
  title: string
  manufacturer: string
  manufacturerCanonical: string
  model: string | null
  manualType: string
  category: string | null
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceFilename: string | null
  // Where the text came from in the extraction pipeline.
  sourceKind: "ocr_page" | "parts_database" | "structured_section" | "troubleshooting"
  labels: ManualsQdrantChunkLabel[]
  manualUrl: string | null
  thumbnailUrl: string | null
  // 0..1 quality scores; overallQuality blends text and metadata quality.
  metadataConfidence: number
  textQuality: number
  overallQuality: number
  embeddingTier: ManualsEmbeddingTier
  profiles: ManualsQdrantProfile[]
  // True when the chunk contains electrically/technically risky content.
  isRisky: boolean
  flags: string[]
}
/** Aggregate counts describing a built corpus. */
export type ManualsQdrantCorpusStats = {
  catalogManuals: number
  structuredRecords: number
  extractedRecords: number
  normalizedManuals: number
  chunkCount: number
  highConfidenceChunks: number
  fallbackChunks: number
  excludedChunks: number
  manualsByManufacturer: Record<string, number>
  chunksByLabel: Record<string, number>
  profileCounts: Record<ManualsQdrantProfile, number>
}
/** The full built corpus: stats plus finalized manuals and chunks. */
export type ManualsQdrantCorpus = {
  generatedAt: string
  stats: ManualsQdrantCorpusStats
  manuals: ManualsQdrantManual[]
  chunks: ManualsQdrantChunk[]
}
/** One retrieval smoke-test case: a query plus expectations on the results. */
export type ManualsQdrantEvaluationCase = {
  id: string
  query: string
  profile: ManualsQdrantProfile
  expectedManufacturer?: string
  expectedChunkLabels?: ManualsQdrantChunkLabel[]
  disallowedChunkLabels?: ManualsQdrantChunkLabel[]
}
/** A scored search hit. */
export type ManualsQdrantSearchResult = {
  chunk: ManualsQdrantChunk
  score: number
}
/** Per-case outcomes plus summary pass counts for an evaluation run. */
export type ManualsQdrantEvaluationResult = {
  cases: Array<{
    id: string
    query: string
    profile: ManualsQdrantProfile
    // null means the case declared no expected manufacturer (not applicable).
    passedTop3Manufacturer: boolean | null
    passedTop5Label: boolean
    passedDisallowedCheck: boolean
    topManufacturers: string[]
    topLabels: ManualsQdrantChunkLabel[]
  }>
  summary: {
    totalCases: number
    top3ManufacturerPasses: number
    labelPasses: number
    disallowedPasses: number
  }
}
// --- Shapes of the on-disk JSON records (all fields optional: the
// extraction pipeline's output is not guaranteed complete). ---

/** A titled text section from a structured manual record. */
type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}
/** A problem/solution pair from a structured troubleshooting table. */
type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}
/** One parts-catalog entry. */
type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}
/** One per-manual structured JSON record (training_data/structured/*.json). */
type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  // Set by the loader to the JSON filename the record came from.
  sourceFilename?: string
  metadata?: {
    pageCount?: number
  }
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
    specifications?: Record<string, unknown>
  }
}
/** One OCR'd page from the extracted-content dump. */
type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}
/** A parts list detected on a page of an extracted manual. */
type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}
/** One manual's entry in the extracted-content dump. */
type ExtractedManualRecord = {
  filename?: string
  filepath?: string
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
  partsLists?: ExtractedPartList[]
  sections?: StructuredSection[]
}
/** A catalog manual with precomputed matching metadata. */
type CatalogManual = {
  manual: Manual
  manufacturerCanonical: string
  modelGuess: string | null
  // Normalized haystack built from filename, manufacturer, category, etc.
  searchText: string
  tokenSet: Set<string>
}
/**
 * In-progress manual during the corpus build: the finalized fields are
 * replaced by running totals/sets until finalizeManual() computes them.
 */
type ManualAccumulator = Omit<
  ManualsQdrantManual,
  | "metadataConfidence"
  | "parseQuality"
  | "duplicateRisk"
  | "chunkCount"
  | "highConfidenceChunkCount"
  | "profiles"
  | "embeddingTier"
  | "flags"
> & {
  metadataConfidenceTotal: number
  metadataConfidenceSamples: number
  hasStructured: boolean
  hasTroubleshooting: boolean
  hasOcrText: boolean
  chunks: ManualsQdrantChunk[]
  flagsSet: Set<string>
}
// Built-in retrieval smoke tests. Each case pins an expected manufacturer
// and/or chunk labels among the top search hits; the "wiring-risky" case
// additionally asserts that wiring content is kept out of the public_safe
// profile, and "ambiguous-bad-query" guards against brochure spam on
// low-signal queries.
const DEFAULT_EVAL_CASES: ManualsQdrantEvaluationCase[] = [
  {
    id: "rvv-660-service",
    query: "RVV 660 service manual",
    profile: "internal_tech",
    expectedManufacturer: "Royal Vendors",
  },
  {
    id: "narco-bevmax-cooling",
    query: "Narco bevmax not cooling",
    profile: "public_safe",
    expectedManufacturer: "Dixie-Narco",
    expectedChunkLabels: ["service", "troubleshooting"],
  },
  {
    id: "coin-mech-dollars",
    query: "coin mech not accepting dollars",
    profile: "public_safe",
    expectedChunkLabels: ["troubleshooting", "parts"],
    disallowedChunkLabels: ["brochure"],
  },
  {
    id: "royal-coins",
    query: "Royal machine not accepting coins",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    expectedChunkLabels: ["troubleshooting"],
  },
  {
    id: "wiring-risky",
    query: "Royal wiring diagram voltage issue",
    profile: "public_safe",
    expectedManufacturer: "Royal Vendors",
    disallowedChunkLabels: ["wiring"],
  },
  {
    id: "ambiguous-bad-query",
    query: "manual for strange mystery vendor",
    profile: "public_safe",
    disallowedChunkLabels: ["brochure"],
  },
]
/**
 * Returns the built-in retrieval evaluation cases.
 *
 * A shallow copy is returned so callers can reorder or append cases
 * without mutating the shared module-level defaults (the original
 * implementation leaked the mutable array itself).
 */
export function getDefaultManualsQdrantEvaluationCases() {
  return [...DEFAULT_EVAL_CASES]
}
// Module-level memo: building the corpus scans and parses the entire
// manuals dataset, so the in-flight promise is shared across callers.
let manualsQdrantCorpusPromise: Promise<ManualsQdrantCorpus> | null = null

/**
 * Returns the lazily-built, cached manuals corpus. Concurrent callers
 * share one build.
 *
 * If the build fails, the cached promise is cleared so a later call can
 * retry instead of replaying the cached rejection forever (the previous
 * implementation poisoned the cache on failure).
 */
export function getManualsQdrantCorpus() {
  if (!manualsQdrantCorpusPromise) {
    const pending = buildManualsQdrantCorpus()
    manualsQdrantCorpusPromise = pending
    pending.catch(() => {
      // Only clear the cache if the failed build is still the cached one
      // (a reset + rebuild may have replaced it in the meantime).
      if (manualsQdrantCorpusPromise === pending) {
        manualsQdrantCorpusPromise = null
      }
    })
  }
  return manualsQdrantCorpusPromise
}

/** Drops the cached corpus so the next call rebuilds from disk. */
export function resetManualsQdrantCorpusCache() {
  manualsQdrantCorpusPromise = null
}
/**
 * Builds the full corpus by merging three sources: the scanned manual
 * catalog, per-manual structured JSON records, and the OCR extraction
 * dump. Structured records are processed FIRST so that `hasStructured`
 * is set before the extracted pass runs — the extracted pass only
 * contributes OCR chunks to manuals that got no structured chunks.
 * Returns finalized manuals and chunks, each sorted by id for stable
 * output, plus aggregate stats.
 */
export async function buildManualsQdrantCorpus(): Promise<ManualsQdrantCorpus> {
  const catalogManuals = await loadCatalogManuals()
  const structuredRecords = await loadStructuredRecords()
  const extractedRecords = await loadExtractedRecords()
  const manuals = new Map<string, ManualAccumulator>()
  // Corpus-wide dedup set shared by both passes (see addChunkToManual).
  const chunkDedup = new Set<string>()
  // Pass 1: structured records (highest-quality source).
  for (const record of structuredRecords) {
    const catalogMatch = matchCatalogManual(
      [record.manualId, record.manufacturer, record.model]
        .filter(Boolean)
        .join(" "),
      catalogManuals,
      {
        manufacturerHint: record.manufacturer || null,
        modelHint: record.model || null,
      }
    )
    const filenameHint =
      catalogMatch?.manual.filename || record.sourceFilename || `${record.manualId || "manual"}.pdf`
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename: filenameHint,
      recordManufacturer: record.manufacturer || null,
      recordModel: record.model || null,
      manualTypeHint: detectManualType(
        `${record.manualType || ""} ${record.manualId || ""}`
      ),
      categoryHint: catalogMatch?.manual.category || null,
      // Confidence is much higher when the record maps to a catalog entry.
      metadataConfidence: catalogMatch ? 0.86 : 0.32,
      sourceRecordId: record.sourceFilename || record.manualId || filenameHint,
    })
    manual.hasStructured = true
    manual.hasTroubleshooting ||= Boolean(record.content?.troubleshooting?.length)
    for (const chunk of buildStructuredChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }
  // Pass 2: OCR extraction dump.
  for (const record of extractedRecords) {
    const filename = record.filename || basename(record.filepath || "manual.pdf")
    const catalogMatch = matchCatalogManual(filename, catalogManuals)
    const manual = getOrCreateManualAccumulator({
      manuals,
      catalogMatch,
      filename,
      recordManufacturer: null,
      recordModel: null,
      manualTypeHint: detectManualType(filename),
      categoryHint: catalogMatch?.manual.category || null,
      // 0.96 for an exact filename match, 0.78 for a fuzzy catalog match,
      // 0.36 when the manual is unknown to the catalog.
      metadataConfidence:
        catalogMatch && normalizeIdentifier(catalogMatch.manual.filename) ===
          normalizeIdentifier(filename)
          ? 0.96
          : catalogMatch
            ? 0.78
            : 0.36,
      sourceRecordId: record.filename || record.filepath || "unknown-extracted",
    })
    manual.hasOcrText ||= hasUsefulOcrText(record)
    // Prefer structured/manual chunks where they exist and use OCR pages only
    // as a fallback corpus for manuals we could not parse structurally.
    if (manual.hasStructured) {
      continue
    }
    for (const chunk of buildExtractedChunks(record, manual)) {
      addChunkToManual(manual, chunk, chunkDedup)
    }
  }
  // Finalize: compute quality scores/tiers, then sort for deterministic output.
  const finalizedManuals = Array.from(manuals.values())
    .map(finalizeManual)
    .sort((left, right) => left.manualId.localeCompare(right.manualId))
  const finalizedChunks = finalizedManuals
    .flatMap((manual) => manual.chunks)
    .sort((left, right) => left.chunkId.localeCompare(right.chunkId))
  const stats = buildCorpusStats({
    catalogManuals,
    structuredRecords,
    extractedRecords,
    manuals: finalizedManuals,
    chunks: finalizedChunks,
  })
  return {
    generatedAt: new Date().toISOString(),
    stats,
    manuals: finalizedManuals,
    chunks: finalizedChunks,
  }
}
/**
 * Lexical (token-overlap) search over the corpus chunks.
 *
 * Only chunks that are embeddable (not `exclude`), visible to the
 * requested profile, and non-empty are considered. Returns the
 * highest-scoring chunks, best first, capped at `limit` (default 5).
 */
export function searchManualsQdrantCorpus(
  corpus: ManualsQdrantCorpus,
  query: string,
  options?: {
    limit?: number
    profile?: ManualsQdrantProfile
  }
): ManualsQdrantSearchResult[] {
  const maxResults = options?.limit ?? 5
  const activeProfile = options?.profile ?? "internal_tech"
  const normalizedQuery = normalizeText(query)
  const queryTokens = tokenize(normalizedQuery)
  const loweredQuery = normalizedQuery.toLowerCase()
  const matches: ManualsQdrantSearchResult[] = []
  for (const chunk of corpus.chunks) {
    // Skip chunks that are excluded from embedding, hidden from this
    // profile, or effectively empty.
    if (chunk.embeddingTier === "exclude") {
      continue
    }
    if (!chunk.profiles.includes(activeProfile)) {
      continue
    }
    if (chunk.text.trim().length === 0) {
      continue
    }
    const score = scoreChunkForQuery(chunk, queryTokens, loweredQuery)
    if (score > 0) {
      matches.push({ chunk, score })
    }
  }
  matches.sort((left, right) => right.score - left.score)
  return matches.slice(0, maxResults)
}
export function evaluateManualsQdrantCorpus(
corpus: ManualsQdrantCorpus,
cases: ManualsQdrantEvaluationCase[] = DEFAULT_EVAL_CASES
): ManualsQdrantEvaluationResult {
const results = cases.map((evaluationCase) => {
const topResults = searchManualsQdrantCorpus(corpus, evaluationCase.query, {
limit: 5,
profile: evaluationCase.profile,
})
const topManufacturers = Array.from(
new Set(topResults.map((result) => result.chunk.manufacturer))
)
const topLabels = Array.from(
new Set(topResults.flatMap((result) => result.chunk.labels))
)
return {
id: evaluationCase.id,
query: evaluationCase.query,
profile: evaluationCase.profile,
passedTop3Manufacturer: evaluationCase.expectedManufacturer
? topManufacturers
.slice(0, 3)
.includes(evaluationCase.expectedManufacturer)
: null,
passedTop5Label: evaluationCase.expectedChunkLabels
? evaluationCase.expectedChunkLabels.some((label) =>
topLabels.includes(label)
)
: true,
passedDisallowedCheck: evaluationCase.disallowedChunkLabels
? !topLabels.some((label) =>
evaluationCase.disallowedChunkLabels?.includes(label)
)
: true,
topManufacturers,
topLabels,
}
})
return {
cases: results,
summary: {
totalCases: results.length,
top3ManufacturerPasses: results.filter(
(result) => result.passedTop3Manufacturer !== false
).length,
labelPasses: results.filter((result) => result.passedTop5Label).length,
disallowedPasses: results.filter(
(result) => result.passedDisallowedCheck
).length,
},
}
}
/**
 * Builds the corpus, runs the default evaluation, and writes every
 * corpus/evaluation artifact as pretty-printed JSON under `outputDir`
 * (default: `<cwd>/output/manuals-qdrant`).
 *
 * Returns the output directory plus the in-memory corpus and evaluation
 * so callers can inspect results without re-reading the files.
 */
export async function writeManualsQdrantArtifacts(args?: {
  outputDir?: string
}) {
  const outputDir = args?.outputDir || join(process.cwd(), "output", "manuals-qdrant")
  const corpus = await buildManualsQdrantCorpus()
  const evaluation = evaluateManualsQdrantCorpus(corpus)
  const internalTechChunks = corpus.chunks.filter((chunk) =>
    chunk.profiles.includes("internal_tech")
  )
  const publicSafeChunks = corpus.chunks.filter((chunk) =>
    chunk.profiles.includes("public_safe")
  )
  const highConfidenceChunks = corpus.chunks.filter(
    (chunk) => chunk.embeddingTier === "high_confidence"
  )
  await mkdir(outputDir, { recursive: true })
  // Serialize every artifact the same way (pretty-printed, 2-space indent).
  const writeJson = (filename: string, payload: unknown) =>
    writeFile(join(outputDir, filename), JSON.stringify(payload, null, 2))
  // The artifact files are independent of one another, so write them
  // concurrently instead of awaiting each write in sequence.
  await Promise.all([
    writeJson("summary.json", {
      generatedAt: corpus.generatedAt,
      stats: corpus.stats,
      evaluation: evaluation.summary,
    }),
    writeJson("manuals.json", corpus.manuals),
    writeJson("chunks.json", corpus.chunks),
    writeJson("chunks-internal-tech.json", internalTechChunks),
    writeJson("chunks-public-safe.json", publicSafeChunks),
    writeJson("chunks-high-confidence.json", highConfidenceChunks),
    writeJson("evaluation-cases.json", DEFAULT_EVAL_CASES),
    writeJson("evaluation-report.json", evaluation),
  ])
  return {
    outputDir,
    corpus,
    evaluation,
  }
}
/**
 * Loads the manual catalog and precomputes the matching metadata used by
 * matchCatalogManual: canonical manufacturer, a model guess, a normalized
 * search haystack, and its token set.
 */
async function loadCatalogManuals() {
  const manuals = await scanManuals()
  return manuals.map((manual) => {
    // Build the haystack from every searchable field on the catalog entry.
    const haystackParts = [
      manual.filename,
      manual.manufacturer,
      manual.category,
      ...(manual.searchTerms || []),
      ...(manual.commonNames || []),
    ].filter(Boolean)
    const title = haystackParts.join(" ")
    return {
      manual,
      manufacturerCanonical: normalizeManufacturer(manual.manufacturer),
      modelGuess: extractModel(title),
      searchText: normalizeText(title),
      tokenSet: new Set(tokenize(title)),
    } satisfies CatalogManual
  })
}
/**
 * Loads structured manual records. Reads the per-file records from the
 * structured directory, then — if the aggregated index file is non-empty —
 * returns the INDEX records instead, each annotated with the filename of
 * a matching per-file record so provenance is preserved.
 *
 * NOTE(review): this throws if the structured directory or index file is
 * missing — presumably the dataset is guaranteed present; confirm whether
 * a missing-directory fallback is wanted.
 */
async function loadStructuredRecords() {
  const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  const entries = await readdir(directory, { withFileTypes: true })
  const files = entries
    .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith(".json"))
    .map((entry) => entry.name)
  const records = await Promise.all(
    files.map(async (filename) => {
      const parsed = await readJsonFile<StructuredManualRecord>(
        join(directory, filename)
      )
      return {
        ...parsed,
        sourceFilename: filename,
        manualId: parsed.manualId || stripExtension(filename),
      }
    })
  )
  const indexRecords = await readJsonFile<StructuredManualRecord[]>(
    join(getManualsDataRoot(), ...STRUCTURED_MANUALS_INDEX_FILE)
  )
  // Empty index: fall back to the per-file records alone.
  if (indexRecords.length === 0) {
    return records
  }
  // Group per-file record filenames by match key. Several records can share
  // a key; the arrays stored here are consumed one entry at a time below.
  const recordsByKey = new Map<string, string[]>()
  for (const record of records) {
    const key = getStructuredRecordMatchKey(record)
    const existing = recordsByKey.get(key) || []
    existing.push(record.sourceFilename || `${record.manualId || "structured"}.json`)
    recordsByKey.set(key, existing)
  }
  return indexRecords.map((record, index) => {
    const key = getStructuredRecordMatchKey(record)
    const matchingFilenames = recordsByKey.get(key) || []
    return {
      ...record,
      // shift() mutates the array stored in the map, so multiple index
      // records with the same key each consume a DISTINCT filename; once
      // exhausted, a synthetic filename is generated instead.
      sourceFilename:
        matchingFilenames.shift() ||
        `${normalizeIdentifier(record.manualId || `structured-record-${index + 1}`)}.json`,
    }
  })
}
/** Reads the OCR/extraction dump for all manuals from the data root. */
async function loadExtractedRecords() {
  const dumpPath = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  return readJsonFile<ExtractedManualRecord[]>(dumpPath)
}
/**
 * Fuzzy-matches a free-form query (filename, id, or metadata string)
 * against the catalog.
 *
 * An exact filename-stem match wins immediately. Otherwise every catalog
 * entry is scored on manufacturer/model hints, substring containment, and
 * token overlap; the best-scoring entry is returned only if it reaches a
 * minimum score of 10, else null.
 */
function matchCatalogManual(
  rawQuery: string,
  catalogManuals: CatalogManual[],
  hints?: {
    manufacturerHint?: string | null
    modelHint?: string | null
  }
) {
  const normalizedQuery = normalizeText(rawQuery)
  const queryTokens = tokenize(normalizedQuery)
  const manufacturerHint = hints?.manufacturerHint
    ? normalizeManufacturer(hints.manufacturerHint)
    : null
  const modelHint = hints?.modelHint ? normalizeIdentifier(hints.modelHint) : null
  // Fast path: exact filename-stem equality.
  const queryStem = normalizeIdentifier(stripExtension(rawQuery))
  for (const candidate of catalogManuals) {
    if (normalizeIdentifier(stripExtension(candidate.manual.filename)) === queryStem) {
      return candidate
    }
  }
  // Scored path: track the best candidate (first one wins on ties, which
  // matches the stable-sort behavior of the scoring approach).
  let best: CatalogManual | null = null
  let bestScore = -Infinity
  for (const candidate of catalogManuals) {
    let score = 0
    if (manufacturerHint) {
      // Reward an exact canonical-manufacturer match, penalize a mismatch.
      score += candidate.manufacturerCanonical === manufacturerHint ? 16 : -4
    }
    if (modelHint) {
      if (candidate.modelGuess === modelHint) {
        score += 14
      } else if (candidate.searchText.includes(modelHint.replace(/-/g, " "))) {
        score += 8
      }
    }
    if (
      normalizedQuery &&
      candidate.searchText.includes(normalizedQuery.toLowerCase())
    ) {
      score += 20
    }
    for (const token of queryTokens) {
      if (candidate.tokenSet.has(token)) {
        score += 4
      } else if (token.length >= 4 && candidate.searchText.includes(token)) {
        score += 1.5
      }
    }
    if (score > bestScore) {
      bestScore = score
      best = candidate
    }
  }
  return best && bestScore >= 10 ? best : null
}
/**
 * Finds or creates the accumulator for the manual a source record belongs
 * to, keyed by a canonical manual id. When the id already exists, the new
 * source is MERGED into the existing accumulator: filenames are unioned,
 * confidence samples accumulated, and missing category/model/manufacturer
 * fields backfilled from the record. Returns the (shared, mutable)
 * accumulator.
 */
function getOrCreateManualAccumulator(args: {
  manuals: Map<string, ManualAccumulator>
  catalogMatch: CatalogManual | null
  filename: string
  recordManufacturer: string | null
  recordModel: string | null
  manualTypeHint: string
  categoryHint: string | null
  metadataConfidence: number
  sourceRecordId: string
}) {
  const manual = args.catalogMatch?.manual
  // Catalog metadata wins over record metadata; "Other" is the fallback.
  const manufacturer = humanizeManufacturer(
    manual?.manufacturer || args.recordManufacturer || "Other"
  )
  // Model preference: record value (unless a placeholder like "unknown"),
  // then the catalog guess, then whatever can be extracted from the filename.
  const model =
    args.recordModel && !isPlaceholderValue(args.recordModel)
      ? sanitizeModel(args.recordModel)
      : args.catalogMatch?.modelGuess || extractModel(args.filename)
  const manualType = args.manualTypeHint || detectManualType(args.filename)
  const manualId = buildCanonicalManualId({
    catalogManual: manual || null,
    manufacturer,
    model,
    manualType,
    filename: args.filename,
  })
  const existing = args.manuals.get(manualId)
  if (existing) {
    // Merge this source into the already-known manual.
    existing.sourceFilenames = Array.from(
      new Set([
        ...existing.sourceFilenames,
        args.filename,
        // Only JSON source-record ids are tracked as filenames.
        ...(args.sourceRecordId.toLowerCase().endsWith(".json")
          ? [args.sourceRecordId]
          : []),
      ])
    )
    existing.sourceRecordCount += 1
    existing.metadataConfidenceTotal += args.metadataConfidence
    existing.metadataConfidenceSamples += 1
    // Backfill fields the accumulator is still missing.
    if (args.categoryHint && !existing.category) {
      existing.category = args.categoryHint
    }
    if (args.recordModel && !existing.model && !isPlaceholderValue(args.recordModel)) {
      existing.model = sanitizeModel(args.recordModel)
    }
    if (args.recordManufacturer && existing.manufacturer === "Other") {
      existing.manufacturer = humanizeManufacturer(args.recordManufacturer)
      existing.manufacturerCanonical = normalizeManufacturer(existing.manufacturer)
    }
    existing.flagsSet.add(
      args.sourceRecordId === manualId ? "merged-duplicate-source" : "merged-source"
    )
    return existing
  }
  // First time this manual id is seen: create a fresh accumulator.
  const created: ManualAccumulator = {
    manualId,
    title: humanizeTitle(stripExtension(manual?.filename || args.filename)),
    manufacturer,
    manufacturerCanonical: normalizeManufacturer(manufacturer),
    model: model || null,
    manualType,
    category: args.categoryHint,
    manualUrl: manual ? getManualUrl(manual) : null,
    thumbnailUrl: manual ? getThumbnailUrl(manual) : null,
    sourceFilenames: Array.from(
      new Set([
        args.filename,
        ...(args.sourceRecordId.toLowerCase().endsWith(".json")
          ? [args.sourceRecordId]
          : []),
      ])
    ),
    sourceRecordCount: 1,
    metadataConfidenceTotal: args.metadataConfidence,
    metadataConfidenceSamples: 1,
    flagsSet: new Set(
      args.catalogMatch ? [] : ["catalog-match-missing"]
    ),
    hasStructured: false,
    hasTroubleshooting: false,
    hasOcrText: false,
    chunks: [],
  }
  if (!args.catalogMatch && isPlaceholderValue(args.recordManufacturer || "")) {
    created.flagsSet.add("metadata-manufacturer-placeholder")
  }
  args.manuals.set(manualId, created)
  return created
}
/**
 * Converts one structured manual record into chunks: one per non-empty
 * section, one per troubleshooting entry (problem + solution combined),
 * one per page of the parts database (capped at 12 parts), and one for
 * the specifications map if present.
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []
  for (const section of record.content?.sections || []) {
    const text = cleanText(section.text || "")
    if (!text) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: section.pageNumber ?? null,
        sectionTitle: cleanText(section.title || "") || null,
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "structured_section",
      })
    )
  }
  for (const item of record.content?.troubleshooting || []) {
    const problem = cleanText(item.problem || "")
    const solution = cleanText(item.solution || "")
    // Combine problem and solution into one retrievable passage.
    const text = cleanText(
      [
        problem ? `Problem: ${problem}` : "",
        solution ? `Likely cause or solution: ${solution}` : "",
      ]
        .filter(Boolean)
        .join("\n")
    )
    if (!text) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: item.pageNumber ?? null,
        sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting",
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "troubleshooting",
      })
    )
  }
  // Group parts by page so each page yields at most one chunk. Page 0 is
  // used as the bucket for entries without a page number.
  const partsByPage = new Map<number, string[]>()
  for (const item of record.content?.partsDatabase || []) {
    const partNumber = cleanText(item.partNumber || "")
    const description = cleanText(item.description || "")
    // Skip entries with neither a usable part number nor a description.
    if (partNumber.length < 2 && description.length < 4) {
      continue
    }
    const pageNumber = item.pageNumber ?? 0
    const parts = partsByPage.get(pageNumber) || []
    parts.push(description ? `Part ${partNumber}: ${description}` : `Part ${partNumber}`)
    partsByPage.set(pageNumber, parts)
  }
  for (const [pageNumber, parts] of partsByPage.entries()) {
    chunks.push(
      createChunk({
        manual,
        // Cap at 12 parts per page to keep chunks embedding-sized.
        text: parts.slice(0, 12).join("\n"),
        // The page-0 bucket maps back to a null page number here.
        pageNumber: pageNumber || null,
        sectionTitle: "Parts reference",
        sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
        sourceKind: "parts_database",
      })
    )
  }
  if (record.content?.specifications) {
    const specsText = cleanText(
      Object.entries(record.content.specifications)
        .map(([key, value]) => `${humanizeTitle(key)}: ${String(value)}`)
        .join("\n")
    )
    if (specsText) {
      chunks.push(
        createChunk({
          manual,
          text: specsText,
          pageNumber: null,
          sectionTitle: "Specifications",
          sourceFilename: record.sourceFilename || manual.sourceFilenames[0] || null,
          sourceKind: "structured_section",
        })
      )
    }
  }
  return chunks
}
/**
 * Converts one OCR-extracted manual record into chunks: one per page with
 * usable text, plus one per detected parts list (capped at 12 entries).
 */
function buildExtractedChunks(
  record: ExtractedManualRecord,
  manual: ManualAccumulator
) {
  const chunks: ManualsQdrantChunk[] = []
  for (const page of record.text?.pages || []) {
    const text = cleanText(page.text || "")
    // NOTE(review): `page.wordCount || 0` means a page with text but a
    // MISSING wordCount field is also skipped — confirm that the extractor
    // always populates wordCount, otherwise this silently drops pages.
    if (!text || (page.wordCount || 0) === 0) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text,
        pageNumber: page.pageNumber ?? null,
        sectionTitle: page.pageNumber ? `Page ${page.pageNumber}` : "OCR page",
        sourceFilename: record.filename || null,
        sourceKind: "ocr_page",
      })
    )
  }
  for (const list of record.partsLists || []) {
    // Format each part entry; drop empties and cap at 12 per list.
    const parts = (list.parts || [])
      .map((part) => {
        const partNumber = cleanText(part.partNumber || "")
        const description = cleanText(part.description || "")
        return description
          ? `Part ${partNumber}: ${description}`
          : partNumber
            ? `Part ${partNumber}`
            : ""
      })
      .filter(Boolean)
      .slice(0, 12)
    if (parts.length === 0) {
      continue
    }
    chunks.push(
      createChunk({
        manual,
        text: parts.join("\n"),
        pageNumber: list.pageNumber ?? null,
        sectionTitle: "Parts reference",
        sourceFilename: record.filename || null,
        sourceKind: "parts_database",
      })
    )
  }
  return chunks
}
/**
 * Appends a chunk to its manual unless an equivalent chunk — same manual,
 * page, normalized section title, and first 180 chars of text — was
 * already registered anywhere in the corpus. Collapsed duplicates are
 * recorded as a manual-level flag.
 */
function addChunkToManual(
  manual: ManualAccumulator,
  chunk: ManualsQdrantChunk,
  chunkDedup: Set<string>
) {
  const keyParts = [
    chunk.manualId,
    chunk.pageNumber ?? "na",
    normalizeIdentifier(chunk.sectionTitle || ""),
    normalizeIdentifier(chunk.text.slice(0, 180)),
  ]
  const dedupKey = keyParts.join("::")
  if (!chunkDedup.has(dedupKey)) {
    chunkDedup.add(dedupKey)
    manual.chunks.push(chunk)
    return
  }
  manual.flagsSet.add("duplicate-chunk-collapsed")
}
/**
 * Converts an accumulator into a finalized manual: averages the collected
 * confidence samples, scores parse quality from a weighted blend of
 * signals, assigns an embedding tier, and rewrites every chunk with the
 * manual's final (possibly backfilled) metadata and recomputed profiles.
 */
function finalizeManual(manual: ManualAccumulator): ManualsQdrantManual & {
  chunks: ManualsQdrantChunk[]
} {
  const metadataConfidence = clamp(
    manual.metadataConfidenceTotal / manual.metadataConfidenceSamples
  )
  // More merged source records -> higher risk the manual is a duplicate
  // (saturates at 5 records).
  const duplicateRisk = clamp((manual.sourceRecordCount - 1) / 4)
  const highConfidenceChunkCount = manual.chunks.filter(
    (chunk) => chunk.embeddingTier === "high_confidence"
  ).length
  // Weighted blend: metadata 40%, structured/troubleshooting/OCR presence
  // bonuses, chunk-quality bonus (saturating at 8 chunks), minus a
  // duplicate-risk penalty.
  const parseQuality = clamp(
    metadataConfidence * 0.4 +
      (manual.hasStructured ? 0.2 : 0) +
      (manual.hasTroubleshooting ? 0.15 : 0) +
      (manual.hasOcrText ? 0.1 : 0) +
      clamp(highConfidenceChunkCount / 8) * 0.25 -
      duplicateRisk * 0.15
  )
  const embeddingTier: ManualsEmbeddingTier =
    parseQuality >= 0.72 && highConfidenceChunkCount > 0
      ? "high_confidence"
      : parseQuality >= 0.46 && manual.chunks.length > 0
        ? "fallback"
        : "exclude"
  // Manual-level profiles derived from the union of all chunk labels.
  const profiles = buildProfiles({
    labels: Array.from(new Set(manual.chunks.flatMap((chunk) => chunk.labels))),
    embeddingTier,
    overallQuality: parseQuality,
    isRisky: manual.chunks.some((chunk) => chunk.isRisky),
  })
  // Re-stamp each chunk with the manual's final metadata (which may have
  // been backfilled after the chunk was created) and recompute profiles.
  const finalizedChunks = manual.chunks.map((chunk) => {
    return {
      ...chunk,
      manufacturer: manual.manufacturer,
      manufacturerCanonical: manual.manufacturerCanonical,
      model: manual.model,
      manualType: manual.manualType,
      category: manual.category,
      manualUrl: manual.manualUrl,
      thumbnailUrl: manual.thumbnailUrl,
      profiles: buildProfiles({
        labels: chunk.labels,
        embeddingTier: chunk.embeddingTier,
        overallQuality: chunk.overallQuality,
        isRisky: chunk.isRisky,
      }),
    }
  })
  return {
    manualId: manual.manualId,
    title: manual.title,
    manufacturer: manual.manufacturer,
    manufacturerCanonical: manual.manufacturerCanonical,
    model: manual.model,
    manualType: manual.manualType,
    category: manual.category,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    sourceFilenames: Array.from(new Set(manual.sourceFilenames)).sort(),
    sourceRecordCount: manual.sourceRecordCount,
    metadataConfidence,
    parseQuality,
    duplicateRisk,
    chunkCount: finalizedChunks.length,
    highConfidenceChunkCount,
    profiles,
    embeddingTier,
    flags: Array.from(manual.flagsSet).sort(),
    chunks: finalizedChunks,
  }
}
/**
 * Computes aggregate counts for a built corpus: record/manual/chunk
 * totals, manuals per manufacturer, chunks per label and per profile,
 * and chunks per embedding tier.
 */
function buildCorpusStats(args: {
  catalogManuals: CatalogManual[]
  structuredRecords: StructuredManualRecord[]
  extractedRecords: ExtractedManualRecord[]
  manuals: Array<ManualsQdrantManual & { chunks: ManualsQdrantChunk[] }>
  chunks: ManualsQdrantChunk[]
}): ManualsQdrantCorpusStats {
  const manualsByManufacturer: Record<string, number> = {}
  const chunksByLabel: Record<string, number> = {}
  const profileCounts: Record<ManualsQdrantProfile, number> = {
    public_safe: 0,
    internal_tech: 0,
  }
  let highConfidenceChunks = 0
  let fallbackChunks = 0
  let excludedChunks = 0
  for (const manual of args.manuals) {
    const maker = manual.manufacturer
    manualsByManufacturer[maker] = (manualsByManufacturer[maker] || 0) + 1
  }
  // Single pass over chunks: tally labels, profiles, and embedding tiers.
  for (const chunk of args.chunks) {
    for (const label of chunk.labels) {
      chunksByLabel[label] = (chunksByLabel[label] || 0) + 1
    }
    for (const profile of chunk.profiles) {
      profileCounts[profile] += 1
    }
    if (chunk.embeddingTier === "high_confidence") {
      highConfidenceChunks += 1
    }
    if (chunk.embeddingTier === "fallback") {
      fallbackChunks += 1
    }
    if (chunk.embeddingTier === "exclude") {
      excludedChunks += 1
    }
  }
  return {
    catalogManuals: args.catalogManuals.length,
    structuredRecords: args.structuredRecords.length,
    extractedRecords: args.extractedRecords.length,
    normalizedManuals: args.manuals.length,
    chunkCount: args.chunks.length,
    highConfidenceChunks,
    fallbackChunks,
    excludedChunks,
    manualsByManufacturer,
    chunksByLabel,
    profileCounts,
  }
}
/**
 * Builds one chunk from cleaned text plus its manual's current metadata:
 * derives content labels, scores text/metadata/overall quality, marks
 * risky (electrical/refrigeration) content, assigns an embedding tier,
 * and computes the visibility profiles. The chunk id is a normalized
 * composite of manual id, page, section, and a text prefix.
 */
function createChunk(args: {
  manual: ManualAccumulator
  text: string
  pageNumber: number | null
  sectionTitle: string | null
  sourceFilename: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
}): ManualsQdrantChunk {
  const cleanedText = cleanText(args.text)
  const labels = deriveChunkLabels({
    text: cleanedText,
    sectionTitle: args.sectionTitle,
    sourceKind: args.sourceKind,
    manualType: args.manual.manualType,
  })
  // Running average of the manual's confidence samples collected so far.
  const metadataConfidence = clamp(
    args.manual.metadataConfidenceTotal / args.manual.metadataConfidenceSamples
  )
  const textQuality = scoreTextQuality(cleanedText, labels)
  // Overall quality weights text quality over metadata confidence.
  const overallQuality = clamp(textQuality * 0.65 + metadataConfidence * 0.35)
  const isRisky =
    labels.includes("wiring") ||
    TECH_RISK_KEYWORDS.some((keyword) =>
      normalizeText(cleanedText).includes(normalizeText(keyword))
    )
  const embeddingTier = deriveEmbeddingTier({
    labels,
    overallQuality,
    sourceKind: args.sourceKind,
    isRisky,
  })
  return {
    // First 80 chars of text are folded in so near-identical sections on
    // the same page still get distinct ids.
    chunkId: normalizeIdentifier(
      `${args.manual.manualId} ${args.pageNumber ?? "na"} ${args.sectionTitle || ""} ${cleanedText.slice(0, 80)}`
    ),
    manualId: args.manual.manualId,
    title: args.manual.title,
    manufacturer: args.manual.manufacturer,
    manufacturerCanonical: args.manual.manufacturerCanonical,
    model: args.manual.model,
    manualType: args.manual.manualType,
    category: args.manual.category,
    pageNumber: args.pageNumber,
    sectionTitle: args.sectionTitle,
    text: cleanedText,
    sourceFilename: args.sourceFilename,
    sourceKind: args.sourceKind,
    labels,
    manualUrl: args.manual.manualUrl,
    thumbnailUrl: args.manual.thumbnailUrl,
    metadataConfidence,
    textQuality,
    overallQuality,
    embeddingTier,
    profiles: buildProfiles({
      labels,
      embeddingTier,
      overallQuality,
      isRisky,
    }),
    isRisky,
    flags: buildChunkFlags(cleanedText, labels, overallQuality),
  }
}
/**
 * Scores a chunk against a tokenized query (higher is better).
 *
 * The score starts from the chunk's overall quality, adds token/substring
 * overlap with a haystack built from the chunk's metadata and text, adds
 * intent bonuses (troubleshooting/parts/service/wiring phrasing in the
 * query matching the chunk's labels), and penalizes brochure, table-of-
 * contents, and flowchart chunks.
 */
function scoreChunkForQuery(
  chunk: ManualsQdrantChunk,
  queryTokens: string[],
  queryLower: string
) {
  const haystackFields = [
    chunk.title,
    chunk.manufacturer,
    chunk.model,
    chunk.sectionTitle,
    chunk.text,
    ...chunk.labels,
  ].filter(Boolean)
  const haystack = normalizeText(haystackFields.join(" "))
  const haystackTokens = new Set(tokenize(haystack))
  // Baseline: prefer higher-quality chunks before any term matching.
  let score = chunk.overallQuality * 10
  for (const token of queryTokens) {
    if (haystackTokens.has(token)) {
      score += 3.5
    } else if (token.length >= 4 && haystack.includes(token)) {
      // Partial credit for substring hits on longer tokens.
      score += 1
    }
  }
  const hasLabel = (label: string) => chunk.labels.includes(label)
  const queryMentionsAny = (needles: string[]) =>
    needles.some((needle) => queryLower.includes(needle))
  // Intent bonuses: query phrasing aligned with the chunk's labels.
  if (
    hasLabel("troubleshooting") &&
    queryMentionsAny(["error", "not ", "wont", "won t"])
  ) {
    score += 10
  }
  if (hasLabel("parts") && queryMentionsAny(["parts", "part", "coin", "bill"])) {
    score += 7
  }
  if (hasLabel("service") && queryMentionsAny(["manual", "service"])) {
    score += 5
  }
  if (hasLabel("wiring") && queryLower.includes("wiring")) {
    score += 6
  }
  // Penalize low-value content regardless of the query.
  if (hasLabel("brochure")) {
    score -= 5
  }
  if (hasLabel("toc") || hasLabel("flowchart")) {
    score -= 8
  }
  return score
}
/**
 * Assigns content labels to a chunk based on its source kind, section
 * title, body text, and the manual's declared type. A chunk can carry
 * several labels; "general" is the fallback when nothing matches.
 * Returned sorted for deterministic output.
 */
function deriveChunkLabels(args: {
  text: string
  sectionTitle: string | null
  sourceKind: ManualsQdrantChunk["sourceKind"]
  manualType: string
}): ManualsQdrantChunkLabel[] {
  const haystack = normalizeText(
    [args.sectionTitle, args.text, args.manualType].filter(Boolean).join(" ")
  )
  // True when any keyword (normalized) appears in the haystack.
  const matchesAny = (keywords: string[]) =>
    keywords.some((keyword) => haystack.includes(normalizeText(keyword)))
  const labels = new Set<ManualsQdrantChunkLabel>()
  if (args.sourceKind === "troubleshooting" || matchesAny(TROUBLESHOOTING_KEYWORDS)) {
    labels.add("troubleshooting")
    labels.add("service")
  }
  if (args.sourceKind === "parts_database" || matchesAny(PARTS_KEYWORDS)) {
    labels.add("parts")
  }
  if (args.manualType === "operator" || matchesAny(OPERATOR_KEYWORDS)) {
    labels.add("operator")
  }
  if (
    args.manualType === "service" ||
    haystack.includes("technical manual") ||
    haystack.includes("repair")
  ) {
    labels.add("service")
  }
  if (matchesAny(SPECS_KEYWORDS)) {
    labels.add("specs")
  }
  if (
    haystack.includes("table of contents") ||
    haystack.includes("list of figures") ||
    haystack.startsWith("contents")
  ) {
    labels.add("toc")
  }
  if (
    haystack.includes("flow chart") ||
    haystack.includes("flowchart") ||
    looksLikeFlowchart(args.text)
  ) {
    labels.add("flowchart")
  }
  const wiringTerms = ["wiring", "electrical", "schematic", "voltage"]
  if (wiringTerms.some((term) => haystack.includes(term))) {
    labels.add("wiring")
  }
  if (args.manualType === "brochure" || matchesAny(MARKETING_KEYWORDS)) {
    labels.add("brochure")
  }
  if (labels.size === 0) {
    labels.add("general")
  }
  return Array.from(labels).sort()
}
/**
 * Decide how a chunk participates in embedding:
 *  - "exclude": structural noise (toc/flowchart), very low quality, or
 *    unreadable brochure copy;
 *  - "fallback": usable but weak (noisy OCR, risky technical content,
 *    or merely average quality);
 *  - "high_confidence": clean, trustworthy content.
 */
function deriveEmbeddingTier(args: {
  labels: ManualsQdrantChunkLabel[]
  overallQuality: number
  sourceKind: ManualsQdrantChunk["sourceKind"]
  isRisky: boolean
}): ManualsEmbeddingTier {
  const { labels, overallQuality, sourceKind, isRisky } = args
  const structuralNoise = labels.includes("toc") || labels.includes("flowchart")
  if (overallQuality < 0.34 || structuralNoise) {
    return "exclude"
  }
  if (labels.includes("brochure")) {
    // Marketing copy is only kept as a last resort, and only if readable.
    return overallQuality >= 0.62 ? "fallback" : "exclude"
  }
  const weakOcr =
    sourceKind === "ocr_page" &&
    overallQuality < 0.58 &&
    !labels.includes("troubleshooting")
  if (weakOcr) {
    return "fallback"
  }
  if (isRisky && overallQuality < 0.7) {
    return "fallback"
  }
  return overallQuality >= 0.64 ? "high_confidence" : "fallback"
}
/**
 * Choose which retrieval profiles a chunk may serve.
 * Excluded chunks get none; internal techs see everything except
 * brochures/TOCs; the public profile additionally requires non-risky,
 * reasonably high-quality content with no flowchart or wiring labels.
 */
function buildProfiles(args: {
  labels: ManualsQdrantChunkLabel[]
  embeddingTier: ManualsEmbeddingTier
  overallQuality: number
  isRisky: boolean
}): ManualsQdrantProfile[] {
  const { labels, embeddingTier, overallQuality, isRisky } = args
  if (embeddingTier === "exclude") {
    return []
  }
  const has = (label: ManualsQdrantChunkLabel) => labels.includes(label)
  const profiles: ManualsQdrantProfile[] = []
  if (!has("brochure") && !has("toc")) {
    profiles.push("internal_tech")
  }
  const publicSafe =
    !isRisky &&
    overallQuality >= 0.56 &&
    !has("brochure") &&
    !has("flowchart") &&
    !has("toc") &&
    !has("wiring")
  if (publicSafe) {
    profiles.push("public_safe")
  }
  return profiles.sort()
}
/**
 * Derive human-readable review flags for a chunk: low quality,
 * marketing-heavy, risky technical content, and noisy OCR text.
 * Returned sorted for stable serialization.
 */
function buildChunkFlags(
  text: string,
  labels: ManualsQdrantChunkLabel[],
  overallQuality: number
) {
  const flags: string[] = []
  if (overallQuality < 0.5) {
    flags.push("low-quality")
  }
  if (labels.includes("brochure")) {
    flags.push("marketing-heavy")
  }
  if (labels.includes("wiring")) {
    flags.push("risky-technical")
  }
  if (looksLikeOcrGarbage(text)) {
    flags.push("ocr-noisy")
  }
  return flags.sort()
}
/**
 * Heuristic 0..1 quality score for extracted chunk text: a weighted
 * blend of the letter ratio, token volume, sentence punctuation, and an
 * OCR-noise bonus, nudged by label-specific adjustments and clamped.
 */
function scoreTextQuality(
  text: string,
  labels: ManualsQdrantChunkLabel[]
) {
  const letterCount = text.replace(/[^a-z]/gi, "").length
  const nonSpaceCount = text.replace(/\s+/g, "").length || 1
  const letterRatio = letterCount / nonSpaceCount
  const tokenCount = tokenize(text).length
  const capsRuns = (text.match(/\b[A-Z]{4,}\b/g) || []).length
  const sentenceMarks = (text.match(/[.!?]/g) || []).length

  // Weights: letters 35%, token volume 30%, sentences 15%, clean OCR 20%.
  let score =
    clamp(letterRatio) * 0.35 +
    clamp(tokenCount / 120) * 0.3 +
    clamp(sentenceMarks / 8) * 0.15 +
    (looksLikeOcrGarbage(text) ? 0 : 0.2)

  if (labels.includes("troubleshooting")) {
    score += 0.12 // troubleshooting text is disproportionately useful
  }
  if (labels.includes("brochure")) {
    score -= 0.1
  }
  if (capsRuns > 18) {
    score -= 0.12 // heavy ALL-CAPS usually means headings/OCR artifacts
  }
  return clamp(score)
}
/**
 * Bucket a free-form manual-type string into one of:
 * "brochure", "parts", "operator", "service", or the default "manual".
 * Checks run in priority order; the first match wins.
 */
function detectManualType(value: string) {
  const normalized = normalizeText(value)
  const mentionsAny = (needles: string[]) =>
    needles.some((needle) => normalized.includes(needle))
  if (mentionsAny(["brochure", "product notice", "warranty"])) {
    return "brochure"
  }
  if (normalized.includes("parts")) {
    return "parts"
  }
  if (mentionsAny(["operator", "user guide"])) {
    return "operator"
  }
  if (mentionsAny(["service", "repair", "technical"])) {
    return "service"
  }
  return "manual"
}
/**
 * True when the extracted record's OCR pages contain at least one word
 * in total; records with no text payload or empty pages count as zero.
 */
function hasUsefulOcrText(record: ExtractedManualRecord) {
  let totalWords = 0
  for (const page of record.text?.pages || []) {
    totalWords += page.wordCount || 0
  }
  return totalWords > 0
}
/**
 * Heuristic for OCR'd flowchart pages: key-sequence markers ("* # #"),
 * canned flowchart phrasing, or many lines made up only of caps,
 * digits, and box-drawing-ish symbols.
 *
 * Bug fix: the previous implementation collapsed ALL whitespace —
 * including newlines — into single spaces and then split the collapsed
 * string on "\n", so the "many symbol-only lines" heuristic could never
 * fire (the split always produced exactly one element). Lines are now
 * taken from the original, un-flattened text.
 */
function looksLikeFlowchart(text: string) {
  const flattened = text.replace(/\s+/g, " ").trim()
  if (
    flattened.includes("* # #") ||
    flattened.includes("press selection number")
  ) {
    return true
  }
  // Count lines consisting solely of flowchart-ish symbols/caps/digits.
  const symbolLineCount = text
    .split("\n")
    .filter((line) => /^[*#A-Z0-9 ()/-]+$/.test(line.trim())).length
  return symbolLineCount > 8
}
/**
 * Heuristic OCR-noise detector: flags text with more than six
 * non-printable/non-ASCII characters, or any run of five-plus isolated
 * capital letters (a classic OCR artifact from column-split scans).
 */
function looksLikeOcrGarbage(text: string) {
  const flattened = text.replace(/\s+/g, " ").trim()
  const nonPrintableCount = (flattened.match(/[^\x20-\x7E\n\r\t]/g) || []).length
  if (nonPrintableCount > 6) {
    return true
  }
  return /\b[A-Z](?:\s+[A-Z]){4,}\b/.test(flattened)
}
/**
 * Pull the first model-number-like token (letters + 2+ digits + suffix)
 * out of a free-form string, sanitized; null when nothing matches.
 */
function extractModel(value: string) {
  const candidates = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9-]*\b/g)
  const first = candidates?.[0]
  return first ? sanitizeModel(first) : null
}
/**
 * Normalize a raw model string into an identifier, dropping a leading
 * "unknown"/"unknown-" marker; returns null when nothing remains.
 */
function sanitizeModel(value: string) {
  const cleaned = normalizeIdentifier(value).replace(/^unknown-?/, "")
  return cleaned.length > 0 ? cleaned : null
}
/**
 * Map a free-form manufacturer string onto a canonical name from the
 * alias table (exact canonical match or alias substring hit).
 * Unmatched placeholder-ish values (empty, "unknown", leading digit)
 * collapse to "Other"; anything else is title-cased as-is.
 */
function normalizeManufacturer(value: string | null | undefined): string {
  const normalized = normalizeText(value || "")
  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    if (canonical.toLowerCase() === normalized) {
      return canonical
    }
    if (aliases.some((alias) => normalized.includes(normalizeText(alias)))) {
      return canonical
    }
  }
  const unusable =
    !normalized || isPlaceholderValue(normalized) || /^\d/.test(normalized)
  return unusable ? "Other" : toTitleCase(normalized)
}
/**
 * Readability alias for call sites that present a manufacturer name to
 * humans; delegates entirely to normalizeManufacturer.
 */
function humanizeManufacturer(value: string): string {
  const canonical = normalizeManufacturer(value)
  return canonical
}
/**
 * Turn a filename-ish title into readable text: dashes/underscores
 * become spaces, whitespace runs collapse, and edges are trimmed.
 */
function humanizeTitle(value: string) {
  const spaced = value.replace(/[-_]+/g, " ")
  return spaced.replace(/\s+/g, " ").trim()
}
/**
 * True for values that carry no real identity after normalization:
 * empty strings, the literals "unknown"/"manual", or pure digit runs.
 */
function isPlaceholderValue(value: string) {
  const normalized = normalizeText(value)
  if (!normalized) {
    return true
  }
  if (normalized === "unknown" || normalized === "manual") {
    return true
  }
  return /^\d+$/.test(normalized)
}
/**
 * Strip soft hyphens (U+00AD) and collapse all whitespace runs to
 * single spaces, trimming the result.
 */
function cleanText(value: string) {
  const withoutSoftHyphens = value.replace(/\u00ad/g, "")
  return withoutSoftHyphens.replace(/\s+/g, " ").trim()
}
/**
 * Canonical text form for matching: NFKD-fold, replace every run of
 * non-alphanumeric characters with a single space, lowercase, and trim.
 * Note: combining marks produced by NFKD are non-alphanumeric, so
 * accented letters split into two tokens (existing behavior).
 */
function normalizeText(value: string) {
  const folded = value.normalize("NFKD")
  return folded.replace(/[^a-zA-Z0-9]+/g, " ").trim().toLowerCase()
}
/**
 * Split normalized text into search tokens, dropping single characters,
 * stopwords, and pure-digit tokens.
 */
function tokenize(value: string) {
  const tokens: string[] = []
  for (const raw of normalizeText(value).split(" ")) {
    const token = raw.trim()
    if (token.length > 1 && !STOPWORDS.has(token) && !/^\d+$/.test(token)) {
      tokens.push(token)
    }
  }
  return tokens
}
function normalizeIdentifier(value: string) {
return normalizeText(stripExtension(value)).replace(/\s+/g, "-")
}
/**
 * Remove a trailing ".pdf" and then a trailing ".json" (case-insensitive,
 * in that order — so "x.json.pdf" strips both); other extensions stay.
 */
function stripExtension(value: string) {
  const withoutPdf = value.replace(/\.pdf$/i, "")
  return withoutPdf.replace(/\.json$/i, "")
}
/**
 * Composite "::"-delimited key used to match a structured record against
 * catalog manuals: id, canonical manufacturer, sanitized model (or
 * "unknown"), and detected manual type.
 */
function getStructuredRecordMatchKey(record: StructuredManualRecord) {
  const idPart = normalizeIdentifier(record.manualId || "")
  const manufacturerPart = normalizeManufacturer(record.manufacturer)
  const modelPart = sanitizeModel(record.model || "") || "unknown"
  const typePart = detectManualType(record.manualType || "")
  return `${idPart}::${manufacturerPart}::${modelPart}::${typePart}`
}
function buildCanonicalManualId(args: {
catalogManual: Manual | null
manufacturer: string
model: string | null
manualType: string
filename: string
}) {
if (args.catalogManual) {
return normalizeIdentifier(args.catalogManual.path || args.catalogManual.filename)
}
const normalizedManufacturer = normalizeManufacturer(args.manufacturer)
const hasReliableIdentity =
normalizedManufacturer !== "Other" || Boolean(args.model)
if (hasReliableIdentity) {
return normalizeIdentifier(
`${normalizedManufacturer} ${args.model || "unknown"} ${args.manualType}`
)
}
return normalizeIdentifier(`${args.filename} ${args.manualType}`)
}
/**
 * Uppercase the first letter of each space-separated word; empty
 * segments from repeated spaces are dropped.
 */
function toTitleCase(value: string) {
  const words: string[] = []
  for (const part of value.split(" ")) {
    if (part) {
      words.push(part.charAt(0).toUpperCase() + part.slice(1))
    }
  }
  return words.join(" ")
}
/** Clamp a number into the closed interval [0, 1]; NaN passes through. */
function clamp(value: number) {
  if (value < 0) {
    return 0
  }
  if (value > 1) {
    return 1
  }
  return value
}
/**
 * Read a UTF-8 file and parse it as JSON.
 *
 * Improvement: explicit `Promise<T>` return annotation (previously
 * inferred) so the body is checked against the declared contract.
 *
 * NOTE(review): the result is cast to T without runtime validation —
 * callers must trust the file's shape matches T (no schema check).
 *
 * @param path filesystem path to the JSON file
 * @returns the parsed JSON value, asserted (not validated) to be T
 * @throws any fs error from readFile, or SyntaxError on malformed JSON
 */
async function readJsonFile<T>(path: string): Promise<T> {
  const raw = await readFile(path, "utf8")
  return JSON.parse(raw) as T
}