// Rocky_Mountain_Vending/lib/manuals-knowledge.ts

import { existsSync } from "node:fs"
import { readFile, readdir } from "node:fs/promises"
import { basename, join } from "node:path"
import { listConvexManuals } from "@/lib/convex-service"
import { scanManuals } from "@/lib/manuals"
import { getManualsDataRoot } from "@/lib/manuals-paths"
import {
  buildManualAssetUrl,
  buildThumbnailAssetUrl,
} from "@/lib/manuals-storage"
import type { Manual } from "@/lib/manuals-types"
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"

// Path segments shared by the two locations below.
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]
// Path segments (joined onto getManualsDataRoot()) of the directory that is
// scanned for per-manual structured *.json records.
const STRUCTURED_MANUALS_DIR = [
  ...MANUALS_OPTIMIZED_ROOT,
  "training_data",
  "structured",
]
// Path segments (joined onto getManualsDataRoot()) of the single JSON file
// that holds the extracted manual content records.
const EXTRACTED_CONTENT_FILE = [
  ...MANUALS_OPTIMIZED_ROOT,
  "extracted_content",
  "manuals_extracted_content.json",
]
// Tenant id used when no tenant environment variable provides a usable value.
const DEFAULT_MANUALS_PLATFORM_TENANT_ID = "rocky-mountain-vending"

// Low-signal words, including domain filler such as "machine", "manual" and
// "service".
// NOTE(review): not referenced in the visible part of this file; presumably
// consumed by the tokenizer defined further down — confirm.
const STOPWORDS = new Set([
  "a",
  "an",
  "and",
  "are",
  "at",
  "be",
  "by",
  "for",
  "from",
  "help",
  "how",
  "i",
  "in",
  "is",
  "it",
  "machine",
  "manual",
  "me",
  "my",
  "of",
  "on",
  "or",
  "our",
  "please",
  "service",
  "that",
  "the",
  "this",
  "to",
  "up",
  "with",
])

// Words and phrases associated with electrical, refrigeration or teardown work.
// NOTE(review): the consumer is not visible in this part of the file;
// presumably isRiskyManualQuery matches queries against this list — confirm.
const RISKY_MANUAL_KEYWORDS = [
  "wiring",
  "diagram",
  "voltage",
  "compressor",
  "refrigerant",
  "bypass",
  "jumper",
  "board level",
  "schematic",
  "electrical",
  "rewire",
  "disassemble",
  "tear down",
]

// Words and phrases that mark a chat query as manual-related.
// shouldUseManualKnowledgeForChat tests each entry as a plain substring of the
// normalized query, so shorter entries ("part", "bill") also cover the longer
// ones that contain them ("parts", "bill acceptor").
const MANUAL_QUERY_HINTS = [
  "manual",
  "model",
  "serial",
  "error",
  "code",
  "parts",
  "part",
  "troubleshoot",
  "troubleshooting",
  "not cooling",
  "not vending",
  "coin",
  "bill acceptor",
  "bill",
  "coin mech",
  "validator",
  "jam",
  "stuck",
  "door",
  "display",
  "keypad",
  "compressor",
  "motor",
  "sensor",
]

// Canonical manufacturer name -> spellings, abbreviations and product-line
// names that imply it. prepareQuery adds the canonical key to the query's
// manufacturer filters when one of its aliases is found in the normalized
// query text.
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  "automatic products": ["automatic products", "automatic-products", "ap"],
  "coinco": ["coinco"],
  "crane": [
    "crane",
    "national",
    "national vendors",
    "merchant",
    "merchant series",
    "shopper",
  ],
  "dixie narco": [
    "dixie",
    "narco",
    "dixie narco",
    "dixie-narco",
    "dn",
    "bevmax",
  ],
  "gpl": ["gpl", "general products"],
  "mei mars": ["mei", "mars", "mei mars", "bill validator"],
  "royal vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  "rowe": ["rowe"],
  "seaga": ["seaga"],
  "usi": ["usi", "u select it", "u-select-it", "uselectit"],
  "vendo": ["vendo", "sanden"],
}

/** A manual offered as a possible match for a query, with its ranking figures. */
export type ManualCandidate = {
  manualId: string
  filename: string
  manufacturer: string
  category: string
  manualUrl: string | null
  thumbnailUrl: string | null
  score: number
  confidence: number
}

/** A text excerpt taken from a manual, plus the metadata needed to cite it. */
export type ManualKnowledgeChunk = {
  manualId: string
  filename: string
  manufacturer: string
  model: string | null
  manualType: string
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  manualUrl: string | null
  thumbnailUrl: string | null
  /** Trust placed in the source the excerpt was built from. */
  sourceConfidence: number
  /** Relevance to the query; retrieveManualContext fills it from the chunk score. */
  matchScore: number
  citation: string
}

/**
 * Optional hints that refine a retrieval. prepareQuery adds `manufacturer` to
 * the manufacturer filters, turns `manualId` into the manual-id filter and
 * adds `model` as an extra query token.
 */
export type ManualKnowledgeFilters = {
  manufacturer?: string | null
  manualId?: string | null
  model?: string | null
}

/** Everything a retrieval produced for one query. */
export type RetrieveManualContextResult = {
  query: string
  bestManual: ManualCandidate | null
  manualCandidates: ManualCandidate[]
  topChunks: ManualKnowledgeChunk[]
  /** True when no chunk matched or the best match/candidate is weak. */
  needsClarification: boolean
  isRisky: boolean
}

/** Excerpts of one manual, optionally restricted to a page, with the manual itself. */
export type ManualCitationContext = {
  manual: ManualCandidate | null
  citations: ManualKnowledgeChunk[]
}

/**
 * Compact record of a retrieval attempt, as built by summarizeManualRetrieval.
 * `needsClarification` and `isRisky` are null when retrieval did not run or
 * produced no result.
 */
export type ManualRetrievalSummary = {
  ran: boolean
  query: string
  bestManualId: string | null
  manualCandidateIds: string[]
  topChunkCitations: string[]
  needsClarification: boolean | null
  isRisky: boolean | null
  error: string | null
}

/** Contract implemented by any source of manual search and citation data. */
export interface ManualKnowledgeProvider {
  /** Ranks manuals that may match the free-text query. */
  findManualCandidates(query: string): Promise<ManualCandidate[]>
  /** Finds candidate manuals and the best matching excerpts for the query. */
  retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult>
  /** Returns excerpts of one manual, limited to `pageNumber` when given. */
  getManualCitationContext(
    manualId: string,
    pageNumber?: number
  ): Promise<ManualCitationContext>
}

// Shapes of the JSON files read from the local manuals data root. Every field
// is optional because the files are parsed without validation.

/** One titled section of a structured manual record. */
type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}

/** One problem/solution pair of a structured manual record. */
type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}

/** One part entry (number and/or description). */
type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}

/** A per-manual structured JSON file from STRUCTURED_MANUALS_DIR. */
type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
  }
}

/** Text of a single page in the extracted-content file. */
type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}

/** Parts found on one page; parts without their own page inherit this one. */
type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}

/** One entry of the EXTRACTED_CONTENT_FILE array, keyed by `filename`. */
type ExtractedManualRecord = {
  filename?: string
  sections?: StructuredSection[]
  partsLists?: ExtractedPartList[]
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
}

/** One entry of the tenant's manuals.json produced by the shared manuals platform. */
type SharedPlatformManualRecord = {
  manualId: string
  filename: string
  manufacturer: string
  category: string | null
  model: string | null
  manualType: string
  /** Used as the manual URL source when `manualUrl` is absent. */
  sourcePath?: string | null
  manualUrl?: string | null
  thumbnailUrl?: string | null
  sourceFilenames?: string[]
}

/** One entry of the tenant's chunks.json produced by the shared manuals platform. */
type SharedPlatformChunkRecord = {
  manualId: string
  title?: string
  manufacturer: string
  model: string | null
  manualType: string
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  manualUrl?: string | null
  thumbnailUrl?: string | null
  /** Preferred source for the chunk's sourceConfidence. */
  metadataConfidence?: number
  /** Fallback source for sourceConfidence when metadataConfidence is absent. */
  overallQuality?: number
}

/** A query after normalization, tokenization and filter extraction (see prepareQuery). */
type PreparedQuery = {
  normalized: string
  tokens: string[]
  modelTokens: string[]
  /** Canonical manufacturer names detected in the query or passed as a filter. */
  manufacturerFilters: string[]
  manualIdFilter: string | null
}

/** Catalog entry for one manual, with precomputed search fields. */
type ManualInternal = {
  manualId: string
  filename: string
  manufacturer: string
  category: string
  model: string | null
  manualType: string
  manualUrl: string | null
  thumbnailUrl: string | null
  searchText: string
  tokenSet: Set<string>
  normalizedManufacturer: string
}

/** A response chunk extended with the precomputed search fields used for scoring. */
type ChunkInternal = Omit<ManualKnowledgeChunk, "matchScore"> & {
  searchText: string
  tokenSet: Set<string>
  matchScore: number
}

/** The in-memory index: all manuals and chunks plus lookups by manual id. */
type ManualKnowledgeArtifact = {
  manuals: ManualInternal[]
  manualById: Map<string, ManualInternal>
  chunks: ChunkInternal[]
  chunksByManualId: Map<string, ChunkInternal[]>
}

/** A provider that can also drop its cached data (see resetManualKnowledgeCache). */
type ResettableManualKnowledgeProvider = ManualKnowledgeProvider & {
  resetCache(): void
}

/** Module-wide provider instance; stays null until it is first requested. */
let defaultProvider: ManualKnowledgeProvider | null = null

/**
 * Returns the shared manual knowledge provider, creating a
 * LocalManualKnowledgeProvider the first time it is asked for.
 */
export function getManualKnowledgeProvider(): ManualKnowledgeProvider {
  const provider = defaultProvider ?? new LocalManualKnowledgeProvider()
  defaultProvider = provider
  return provider
}

/** Ranks manuals for `query` using the shared provider. */
export async function findManualCandidates(query: string) {
  const provider = getManualKnowledgeProvider()
  const candidates = await provider.findManualCandidates(query)
  return candidates
}

/** Retrieves candidate manuals and matching excerpts for `query` using the shared provider. */
export async function retrieveManualContext(
  query: string,
  filters?: ManualKnowledgeFilters
) {
  const provider = getManualKnowledgeProvider()
  const context = await provider.retrieveManualContext(query, filters)
  return context
}

/** Fetches excerpts of one manual (optionally a single page) using the shared provider. */
export async function getManualCitationContext(
  manualId: string,
  pageNumber?: number
) {
  const provider = getManualKnowledgeProvider()
  const context = await provider.getManualCitationContext(manualId, pageNumber)
  return context
}

/**
 * Asks the shared provider to drop its cached data. Does nothing when no
 * provider has been created yet or when the provider has no callable
 * `resetCache` member.
 */
export function resetManualKnowledgeCache() {
  if (!defaultProvider || !("resetCache" in defaultProvider)) {
    return
  }

  const resettable = defaultProvider as ResettableManualKnowledgeProvider
  if (typeof resettable.resetCache === "function") {
    resettable.resetCache()
  }
}

/**
 * Condenses a retrieval attempt into a flat, loggable summary.
 *
 * @param args.ran whether retrieval was attempted at all
 * @param args.query the query that was (or would have been) used
 * @param args.result the retrieval result, when one exists
 * @param args.error an Error or string describing a failure; any other value is reported as null
 * @returns ids and citations limited to the first four entries; when `ran` is
 *   false every result-derived field is empty or null
 */
export function summarizeManualRetrieval(args: {
  ran: boolean
  query: string
  result?: RetrieveManualContextResult | null
  error?: unknown
}): ManualRetrievalSummary {
  const { ran, query, result } = args

  let error: string | null = null
  if (args.error instanceof Error) {
    error = args.error.message
  } else if (typeof args.error === "string") {
    error = args.error
  }

  // Shape returned as-is when retrieval never ran.
  const skipped: ManualRetrievalSummary = {
    ran: false,
    query,
    bestManualId: null,
    manualCandidateIds: [],
    topChunkCitations: [],
    needsClarification: null,
    isRisky: null,
    error,
  }

  if (!ran) {
    return skipped
  }

  const candidates = result?.manualCandidates || []
  const chunks = result?.topChunks || []

  return {
    ...skipped,
    ran: true,
    bestManualId: result?.bestManual?.manualId || null,
    manualCandidateIds: candidates.slice(0, 4).map((candidate) => candidate.manualId),
    topChunkCitations: chunks.slice(0, 4).map((chunk) => chunk.citation),
    needsClarification: result?.needsClarification ?? null,
    isRisky: result?.isRisky ?? null,
  }
}

/**
 * Decides whether a chat message should trigger manual retrieval.
 *
 * @param intent the classified chat intent, if any
 * @param query the user's message
 * @returns false for an empty query; otherwise true when the query looks like
 *   it contains a model token or includes one of MANUAL_QUERY_HINTS
 */
export function shouldUseManualKnowledgeForChat(
  intent: string | null | undefined,
  query: string
) {
  const intentText = normalizeText(intent || "")
  const queryText = normalizeText(query)
  const mentionsHint = (hint: string) => queryText.includes(hint)
  const hasManualHints =
    looksLikeModelTokenQuery(queryText) || MANUAL_QUERY_HINTS.some(mentionsHint)

  if (!queryText) {
    return false
  }

  // NOTE(review): this branch returns the same value as the fall-through at
  // the bottom, so the intent currently never changes the outcome.
  const intentIsManualRelated = ["manual", "repair", "parts"].some((keyword) =>
    intentText.includes(keyword)
  )
  if (intentIsManualRelated) {
    return hasManualHints
  }

  return looksLikeModelTokenQuery(queryText) ? true : hasManualHints
}

/**
 * Renders a retrieval result as newline-separated instructions and excerpts
 * for inclusion in a model prompt.
 *
 * At most three candidates and three excerpts are listed, and each excerpt is
 * passed through truncateText with a limit of 420. When there are no
 * excerpts, the text ends with a request for identifying details instead.
 */
export function formatManualContextForPrompt(
  result: RetrieveManualContextResult
) {
  const hasExcerpts = result.topChunks.length > 0
  const lines: string[] = ["Manual knowledge context:"]

  lines.push(
    hasExcerpts
      ? "- Use only the excerpts below for any manuals, parts, or troubleshooting reply."
      : "- No reliable manual excerpt was found."
  )
  lines.push(
    result.isRisky
      ? "- The question looks technical or risky. Stay high-level and safe, and do not provide procedural repair steps."
      : "- Stay limited to simple identification, likely issue category, and very basic safe checks."
  )

  if (result.manualCandidates.length > 0) {
    const shortlist = result.manualCandidates.slice(0, 3)
    lines.push(
      "Likely manual candidates:",
      ...shortlist.map(
        (candidate) =>
          `- ${buildManualLabel(candidate.filename, candidate.manufacturer)}`
      )
    )
  }

  if (!hasExcerpts) {
    lines.push(
      "- Ask for the brand on the front, model sticker, or a clear photo/video, and offer texting it in for the team to review."
    )
    return lines.join("\n")
  }

  if (result.needsClarification) {
    lines.push(
      "- Confidence is limited. Ask for the brand on the front, the model sticker, or a clear photo/video before sounding certain."
    )
  }

  const excerpts = result.topChunks
    .slice(0, 3)
    .map((chunk) => `- ${chunk.citation}: ${truncateText(chunk.text, 420)}`)
  lines.push("Grounded excerpts:", ...excerpts)

  return lines.join("\n")
}

/**
 * Knowledge provider backed by an in-memory artifact (manuals plus text
 * chunks) that is built on first use by buildArtifact and then reused.
 */
class LocalManualKnowledgeProvider implements ManualKnowledgeProvider {
  /** In-flight or completed artifact build; null when nothing is cached. */
  private artifactPromise: Promise<ManualKnowledgeArtifact> | null = null

  /**
   * Scores every manual against the query and returns up to six candidates
   * with a positive score, ordered by compareByScore.
   */
  async findManualCandidates(query: string) {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query)

    return artifact.manuals
      .map((manual) => buildCandidate(manual, scoreManual(manual, prepared)))
      .filter((candidate) => candidate.score > 0)
      .sort(compareByScore)
      .slice(0, 6)
  }

  /**
   * Ranks manuals for the query, then ranks chunks using the score of their
   * manual as a prior.
   *
   * @returns up to four candidates and five chunks (chunk score above 10), the
   *   best manual (the one owning the top chunk when it is a candidate,
   *   otherwise the top candidate) and the clarification/risk flags
   */
  async retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult> {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query, filters)

    const manualCandidates = artifact.manuals
      .map((manual) => {
        const score = scoreManual(manual, prepared)
        return { manual, score }
      })
      .filter((entry) => entry.score > 0)
      .sort((left, right) => right.score - left.score)

    // Score lookup by manual id, built once instead of searching the candidate
    // list for every chunk. The first (highest-scoring) entry wins if an id
    // occurs more than once, which is what a linear search would have found.
    const scoreByManualId = new Map<string, number>()
    for (const entry of manualCandidates) {
      if (!scoreByManualId.has(entry.manual.manualId)) {
        scoreByManualId.set(entry.manual.manualId, entry.score)
      }
    }

    const candidateIds = new Set(
      manualCandidates.slice(0, 10).map((entry) => entry.manual.manualId)
    )

    const topChunks = artifact.chunks
      .map((chunk) => {
        const manualScore = scoreByManualId.get(chunk.manualId) ?? 0

        // Chunks of manuals outside the top ten keep only part of the prior.
        const score = scoreChunk(
          chunk,
          prepared,
          candidateIds.has(chunk.manualId) ? manualScore : manualScore * 0.35
        )

        return { chunk, score }
      })
      .filter((entry) => entry.score > 10)
      .sort((left, right) => right.score - left.score)
      .slice(0, 5)
      .map(({ chunk, score }) => ({
        ...stripChunkForResponse(chunk),
        matchScore: score,
      }))

    const needsClarification =
      topChunks.length === 0 ||
      topChunks[0].matchScore < 24 ||
      (manualCandidates[0]
        ? buildCandidate(manualCandidates[0].manual, manualCandidates[0].score)
            .confidence < 0.48
        : true)

    const bestManualFromChunks =
      topChunks.length > 0
        ? manualCandidates.find(
            (entry) => entry.manual.manualId === topChunks[0].manualId
          )
        : null
    const bestManual = bestManualFromChunks
      ? buildCandidate(bestManualFromChunks.manual, bestManualFromChunks.score)
      : manualCandidates.length > 0
        ? buildCandidate(manualCandidates[0].manual, manualCandidates[0].score)
        : null

    return {
      query,
      bestManual,
      manualCandidates: manualCandidates
        .slice(0, 4)
        .map(({ manual, score }) => buildCandidate(manual, score)),
      topChunks,
      needsClarification,
      isRisky: isRiskyManualQuery(query),
    }
  }

  /**
   * Returns up to five chunks of the given manual, restricted to `pageNumber`
   * when one is provided, together with the manual (or null when unknown).
   */
  async getManualCitationContext(manualId: string, pageNumber?: number) {
    const artifact = await this.getArtifact()
    const manual = artifact.manualById.get(manualId) || null
    const chunks = artifact.chunksByManualId.get(manualId) || []
    const citations = chunks
      .filter((chunk) =>
        typeof pageNumber === "number" ? chunk.pageNumber === pageNumber : true
      )
      .slice(0, 5)
      .map(stripChunkForResponse)

    return {
      manual: manual ? buildCandidate(manual, 1) : null,
      citations,
    }
  }

  /**
   * Returns the cached artifact, starting a build when none is cached.
   * A failed build is not kept: the rejection still reaches the caller that
   * awaited it, but the next call starts a fresh build.
   */
  private async getArtifact() {
    let pending = this.artifactPromise

    if (!pending) {
      const started = buildArtifact()
      pending = started
      this.artifactPromise = started

      started.catch(() => {
        // Only clear our own entry; resetCache or a newer build may have
        // replaced it in the meantime.
        if (this.artifactPromise === started) {
          this.artifactPromise = null
        }
      })
    }

    return await pending
  }

  /** Drops the cached artifact so the next request rebuilds it. */
  resetCache() {
    this.artifactPromise = null
  }
}

/**
 * Builds the in-memory knowledge artifact.
 *
 * The shared platform export is used verbatim when it is available. Otherwise
 * the artifact is assembled locally: the manual catalog is loaded first,
 * structured records contribute chunks next, and extracted page text fills in
 * only for manuals that received no structured chunks. Records that match no
 * catalog manual get a placeholder manual entry.
 */
async function buildArtifact(): Promise<ManualKnowledgeArtifact> {
  const shared = await loadSharedPlatformArtifact()
  if (shared) {
    return shared
  }

  const manuals = await loadManualCatalog()
  const manualById = new Map<string, ManualInternal>(
    manuals.map((manual): [string, ManualInternal] => [manual.manualId, manual])
  )

  const chunks: ChunkInternal[] = []
  const chunksByManualId = new Map<string, ChunkInternal[]>()
  const extractedByFilename = await loadExtractedContentByFilename()
  const structuredRecords = await loadStructuredManualRecords()
  const coveredByStructured = new Set<string>()

  for (const record of structuredRecords) {
    const matched = matchStructuredRecordToManual(record, manuals)
    const manual = ensureManualEntry({
      manualById,
      manuals,
      matchedManual: matched,
      fallbackManualId: normalizeIdentifier(record.manualId || "unknown-manual"),
      filename: matched?.filename || `${record.manualId || "manual"}.pdf`,
      manufacturer:
        matched?.manufacturer || humanizeToken(record.manufacturer || "Unknown"),
      category: matched?.category || record.manualType || "manual",
      model: normalizeNullable(record.model),
      manualType: normalizeManualType(record.manualType),
      manualUrl: matched?.manualUrl || null,
      thumbnailUrl: matched?.thumbnailUrl || null,
    })

    const structuredChunks = buildStructuredChunks(record, manual)
    if (structuredChunks.length === 0) {
      continue
    }

    coveredByStructured.add(manual.manualId)
    addChunks(chunks, chunksByManualId, structuredChunks)
  }

  for (const extracted of extractedByFilename.values()) {
    // ensureManualEntry hands back a matched manual unchanged, so it only
    // needs to be called when no catalog manual matched.
    const sourceManual =
      matchExtractedRecordToManual(extracted, manuals) ??
      ensureManualEntry({
        manualById,
        manuals,
        matchedManual: null,
        fallbackManualId: normalizeIdentifier(
          stripExtension(extracted.filename || "manual")
        ),
        filename: extracted.filename || "manual.pdf",
        manufacturer: "Unknown",
        category: "manual",
        model: null,
        manualType: "manual",
        manualUrl: null,
        thumbnailUrl: null,
      })

    // Structured content wins; raw page text is only a fallback.
    if (coveredByStructured.has(sourceManual.manualId)) {
      continue
    }

    const fallbackChunks = buildExtractedFallbackChunks(extracted, sourceManual)
    if (fallbackChunks.length > 0) {
      addChunks(chunks, chunksByManualId, fallbackChunks)
    }
  }

  return { manuals, manualById, chunks, chunksByManualId }
}

/**
 * Loads the artifact exported by the shared manuals platform for the current
 * tenant.
 *
 * @returns null when no platform root is found or when either
 *   `output/tenants/<tenant>/manuals.json` or `chunks.json` is missing;
 *   otherwise the manuals and chunks of those files converted to the internal
 *   shapes
 */
async function loadSharedPlatformArtifact(): Promise<ManualKnowledgeArtifact | null> {
  const platformRoot = resolveManualsPlatformRoot()
  if (!platformRoot) {
    return null
  }

  const tenantDir = join(
    platformRoot,
    "output",
    "tenants",
    getManualsPlatformTenantId()
  )
  const manualsPath = join(tenantDir, "manuals.json")
  const chunksPath = join(tenantDir, "chunks.json")

  if (!(existsSync(manualsPath) && existsSync(chunksPath))) {
    return null
  }

  const [manualRecords, chunkRecords] = await Promise.all([
    readJsonFile<SharedPlatformManualRecord[]>(manualsPath),
    readJsonFile<SharedPlatformChunkRecord[]>(chunksPath),
  ])

  const manuals: ManualInternal[] = []
  const manualById = new Map<string, ManualInternal>()

  for (const record of manualRecords) {
    const searchTerms = [
      record.filename,
      record.sourcePath,
      record.manufacturer,
      record.category,
      record.model,
      record.manualType,
      ...(record.sourceFilenames || []),
    ]
      .filter(Boolean)
      .join(" ")

    const entry: ManualInternal = {
      manualId: record.manualId,
      filename: record.filename,
      manufacturer: record.manufacturer,
      category: record.category || record.manualType || "manual",
      model: record.model || null,
      manualType: normalizeManualType(record.manualType),
      manualUrl: toSiteManualUrl(record.manualUrl || record.sourcePath || null),
      thumbnailUrl: toSiteThumbnailUrl(record.thumbnailUrl || null),
      searchText: normalizeText(searchTerms),
      tokenSet: new Set(tokenize(searchTerms)),
      normalizedManufacturer: canonicalManufacturer(record.manufacturer),
    }

    manuals.push(entry)
    manualById.set(entry.manualId, entry)
  }

  const chunks: ChunkInternal[] = []
  const chunksByManualId = new Map<string, ChunkInternal[]>()

  for (const record of chunkRecords) {
    // Values from the owning manual take precedence over the chunk's own copy.
    const owner = manualById.get(record.manualId) || null
    const filename =
      owner?.filename || humanizeToken(record.title || record.manualId)
    const manufacturer = owner?.manufacturer || record.manufacturer
    const manualUrl = owner?.manualUrl || toSiteManualUrl(record.manualUrl || null)
    const thumbnailUrl =
      owner?.thumbnailUrl || toSiteThumbnailUrl(record.thumbnailUrl || null)

    const searchParts = [
      filename,
      manufacturer,
      record.model,
      record.sectionTitle,
      record.text,
      record.manualType,
    ]
    const searchText = normalizeText(searchParts.filter(Boolean).join(" "))

    const entry: ChunkInternal = {
      manualId: record.manualId,
      filename,
      manufacturer,
      model: record.model,
      manualType: normalizeManualType(record.manualType),
      pageNumber: record.pageNumber,
      sectionTitle: record.sectionTitle,
      text: record.text,
      manualUrl,
      thumbnailUrl,
      sourceConfidence: clamp(
        record.metadataConfidence ?? record.overallQuality ?? 0.76
      ),
      matchScore: 0,
      citation: buildCitation(filename, manufacturer, record.pageNumber),
      searchText,
      tokenSet: new Set(tokenize(searchText)),
    }

    chunks.push(entry)

    const bucket = chunksByManualId.get(entry.manualId)
    if (bucket) {
      bucket.push(entry)
    } else {
      chunksByManualId.set(entry.manualId, [entry])
    }
  }

  return { manuals, manualById, chunks, chunksByManualId }
}

/**
 * Finds the manuals platform checkout.
 *
 * Checked in order: the MANUALS_PLATFORM_ROOT environment variable, a
 * `manuals-platform` directory next to the working directory, then one inside
 * it.
 *
 * @returns the first of those paths that exists, or null when none does
 */
function resolveManualsPlatformRoot() {
  const workingDir = process.cwd()
  const searchOrder = [
    process.env.MANUALS_PLATFORM_ROOT,
    join(workingDir, "..", "manuals-platform"),
    join(workingDir, "manuals-platform"),
  ].filter((location): location is string => Boolean(location))

  return searchOrder.find((location) => existsSync(location)) ?? null
}

/**
 * Resolves the tenant id used to locate the shared platform export.
 *
 * MANUALS_PLATFORM_TENANT_ID is preferred over SITE_MANUALS_TENANT_ID. A
 * variable that is unset, empty or only whitespace is skipped, so a blank
 * first variable no longer hides a usable second one.
 *
 * @returns the trimmed value of the first usable variable, or
 *   DEFAULT_MANUALS_PLATFORM_TENANT_ID when neither is usable
 */
function getManualsPlatformTenantId() {
  const configured = [
    process.env.MANUALS_PLATFORM_TENANT_ID,
    process.env.SITE_MANUALS_TENANT_ID,
  ]

  for (const value of configured) {
    const trimmed = value?.trim()
    if (trimmed) {
      return trimmed
    }
  }

  return DEFAULT_MANUALS_PLATFORM_TENANT_ID
}

/**
 * Converts a stored manual location into a URL for this site.
 *
 * @param value an absolute http(s) URL, a path, or null
 * @returns null for an empty value, the value itself when it is already an
 *   http(s) URL, otherwise the asset URL of the path with a leading
 *   `manuals/` or `/manuals/` removed
 */
function toSiteManualUrl(value: string | null) {
  if (!value) {
    return null
  }

  const isAbsoluteUrl = /^https?:\/\//i.test(value)
  if (isAbsoluteUrl) {
    return value
  }

  const withoutBarePrefix = value.replace(/^manuals\//i, "")
  const relativePath = withoutBarePrefix.replace(/^\/manuals\//i, "")
  return buildManualAssetUrl(relativePath)
}

/**
 * Converts a stored thumbnail location into a URL for this site.
 *
 * @param value an absolute http(s) URL, a path, or null
 * @returns null for an empty value, the value itself when it is already an
 *   http(s) URL, otherwise the asset URL of the path with a leading
 *   `thumbnails/` or `/thumbnails/` removed
 */
function toSiteThumbnailUrl(value: string | null) {
  if (!value) {
    return null
  }

  const isAbsoluteUrl = /^https?:\/\//i.test(value)
  if (isAbsoluteUrl) {
    return value
  }

  const withoutBarePrefix = value.replace(/^thumbnails\//i, "")
  const relativePath = withoutBarePrefix.replace(/^\/thumbnails\//i, "")
  return buildThumbnailAssetUrl(relativePath)
}

/**
 * Loads the manual catalog and converts it to internal entries.
 *
 * Convex is asked first; the filesystem scan runs only when Convex yields no
 * manuals. A failure of either source is treated as an empty list. Entries are
 * keyed by the normalized path (or filename), so a later manual with the same
 * key replaces an earlier one.
 */
async function loadManualCatalog() {
  const fromConvex = await listConvexManuals().catch(() => [] as Manual[])
  const sourceManuals =
    fromConvex.length > 0
      ? fromConvex
      : await scanManuals().catch(() => [] as Manual[])

  const catalog = new Map<string, ManualInternal>()

  for (const source of sourceManuals) {
    const manualId = normalizeIdentifier(source.path || source.filename)
    const manufacturer = source.manufacturer || "Unknown"
    const filename = source.filename || basename(source.path)
    const category = source.category || "manual"
    const model = guessModelFromManual(source)

    const termParts = [
      filename,
      stripExtension(filename),
      source.path,
      manufacturer,
      category,
      ...(source.searchTerms || []),
      ...(source.commonNames || []),
      ...aliasesForManufacturer(manufacturer),
      model || "",
    ]
    const searchTerms = termParts.filter(Boolean).join(" ")

    const entry: ManualInternal = {
      manualId,
      filename,
      manufacturer,
      category,
      model,
      manualType: normalizeManualType(category),
      manualUrl: getManualUrl(source),
      thumbnailUrl: getThumbnailUrl(source),
      searchText: normalizeText(searchTerms),
      tokenSet: new Set(tokenize(searchTerms)),
      normalizedManufacturer: canonicalManufacturer(manufacturer),
    }
    catalog.set(manualId, entry)
  }

  return [...catalog.values()]
}

/**
 * Reads every `*.json` file in the structured manuals directory.
 *
 * A missing directory yields an empty list rather than a rejection, so the
 * rest of the artifact can still be built when this optional data set is not
 * deployed. Unreadable or malformed files still reject.
 *
 * @returns the parsed records; a record without its own `manualId` gets the
 *   file name without extension
 */
async function loadStructuredManualRecords(): Promise<
  Array<StructuredManualRecord & { manualId: string }>
> {
  const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  if (!existsSync(directory)) {
    return []
  }

  const entries = await readdir(directory, { withFileTypes: true })
  const files = entries
    .filter((entry) => entry.isFile() && entry.name.toLowerCase().endsWith(".json"))
    .map((entry) => entry.name)

  return await Promise.all(
    files.map(async (filename) => {
      const path = join(directory, filename)
      const parsed = await readJsonFile<StructuredManualRecord>(path)
      return {
        ...parsed,
        manualId: parsed.manualId || stripExtension(filename),
      }
    })
  )
}

/**
 * Reads the extracted-content file and indexes its records by normalized
 * filename.
 *
 * A missing file yields an empty map rather than a rejection, so the rest of
 * the artifact can still be built when this optional data set is not
 * deployed. An unreadable or malformed file still rejects.
 *
 * @returns records keyed by normalizeIdentifier(filename); records without a
 *   filename are dropped and a later duplicate replaces an earlier one
 */
async function loadExtractedContentByFilename() {
  const map = new Map<string, ExtractedManualRecord>()
  const path = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  if (!existsSync(path)) {
    return map
  }

  const records = await readJsonFile<ExtractedManualRecord[]>(path)

  for (const record of records) {
    if (!record.filename) {
      continue
    }

    map.set(normalizeIdentifier(record.filename), record)
  }

  return map
}

/**
 * Finds the catalog manual a structured record most likely belongs to.
 *
 * The record's id, manufacturer, model and type are scored against every
 * manual as if they were a query.
 *
 * @returns the highest-scoring manual (the earliest one on a tie) when its
 *   score is at least 12, otherwise null
 */
function matchStructuredRecordToManual(
  record: StructuredManualRecord,
  manuals: ManualInternal[]
) {
  const descriptors = [
    record.manualId,
    record.manufacturer,
    record.model,
    record.manualType,
  ]
  const prepared = prepareQuery(descriptors.filter(Boolean).join(" "), {
    manufacturer: record.manufacturer,
    model: record.model,
  })

  let winner: ManualInternal | null = null
  let winningScore = Number.NEGATIVE_INFINITY

  for (const manual of manuals) {
    const score = scoreManual(manual, prepared)
    // Strictly greater keeps the earliest manual on equal scores.
    if (score > winningScore) {
      winner = manual
      winningScore = score
    }
  }

  return winner && winningScore >= 12 ? winner : null
}

/**
 * Finds the catalog manual an extracted-content record belongs to.
 *
 * An exact match on the normalized filename or manual id is preferred.
 * Failing that, the filename without extension is scored against every manual
 * as a query.
 *
 * @returns the exact match, or the highest-scoring manual (the earliest one on
 *   a tie) when its score is at least 10, otherwise null
 */
function matchExtractedRecordToManual(
  record: ExtractedManualRecord,
  manuals: ManualInternal[]
) {
  const filename = record.filename || ""
  const wanted = normalizeIdentifier(filename)

  for (const manual of manuals) {
    const sameFilename = normalizeIdentifier(manual.filename) === wanted
    if (sameFilename || normalizeIdentifier(manual.manualId) === wanted) {
      return manual
    }
  }

  const prepared = prepareQuery(stripExtension(filename))

  let winner: ManualInternal | null = null
  let winningScore = Number.NEGATIVE_INFINITY

  for (const manual of manuals) {
    const score = scoreManual(manual, prepared)
    // Strictly greater keeps the earliest manual on equal scores.
    if (score > winningScore) {
      winner = manual
      winningScore = score
    }
  }

  return winner && winningScore >= 10 ? winner : null
}

/**
 * Turns a structured manual record into chunks attributed to `manual`.
 *
 * Output order: sections (source confidence 0.92), troubleshooting entries
 * (0.95), then parts grouped by page (0.8). Sections and troubleshooting
 * entries whose cleaned text fails isUsefulChunkText are left out.
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualInternal
) {
  const content = record.content
  const chunks: ChunkInternal[] = []

  for (const section of content?.sections || []) {
    const text = cleanChunkText(section.text || "")
    if (isUsefulChunkText(text)) {
      chunks.push(
        makeChunk({
          manual,
          pageNumber: section.pageNumber ?? null,
          sectionTitle: cleanSectionTitle(section.title),
          text,
          sourceConfidence: 0.92,
        })
      )
    }
  }

  for (const entry of content?.troubleshooting || []) {
    const problem = cleanChunkText(entry.problem || "")
    const solution = cleanChunkText(entry.solution || "")

    const parts: string[] = []
    if (problem) {
      parts.push(`Problem: ${problem}`)
    }
    if (solution) {
      parts.push(`Likely cause or solution: ${solution}`)
    }

    const text = cleanChunkText(parts.join("\n"))
    if (isUsefulChunkText(text)) {
      chunks.push(
        makeChunk({
          manual,
          pageNumber: entry.pageNumber ?? null,
          sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting",
          text,
          sourceConfidence: 0.95,
        })
      )
    }
  }

  for (const partChunk of buildPartChunks(content?.partsDatabase || [], manual, 0.8)) {
    chunks.push(partChunk)
  }

  return chunks
}

/**
 * Turns raw extracted content into chunks attributed to `manual`.
 *
 * Every page whose cleaned text passes isUsefulChunkText becomes a chunk
 * (source confidence 0.72), followed by the chunks built from the record's
 * part lists.
 */
function buildExtractedFallbackChunks(
  record: ExtractedManualRecord,
  manual: ManualInternal
) {
  const chunks: ChunkInternal[] = []

  for (const page of record.text?.pages || []) {
    const text = cleanChunkText(page.text || "")
    if (isUsefulChunkText(text)) {
      const sectionTitle = page.pageNumber
        ? `Page ${page.pageNumber}`
        : "Manual page"

      chunks.push(
        makeChunk({
          manual,
          pageNumber: page.pageNumber ?? null,
          sectionTitle,
          text,
          sourceConfidence: 0.72,
        })
      )
    }
  }

  for (const partChunk of buildExtractedPartListChunks(record.partsLists || [], manual)) {
    chunks.push(partChunk)
  }

  return chunks
}

/**
 * Groups parts by page and builds one "Parts reference" chunk per page.
 *
 * A part is skipped when its number is shorter than two characters and its
 * description is shorter than six. Each page keeps at most its first eight
 * parts. Parts without a page number share one chunk whose page is null.
 *
 * @param parts part entries, in document order
 * @param manual the manual the chunks are attributed to
 * @param sourceConfidence confidence recorded on every produced chunk
 * @returns one chunk per page, in order of first appearance, with one line
 *   per part
 */
function buildPartChunks(
  parts: StructuredPart[],
  manual: ManualInternal,
  sourceConfidence: number
) {
  const partsByPage = new Map<number, string[]>()

  for (const part of parts) {
    const partNumber = cleanChunkText(part.partNumber || "")
    const description = cleanChunkText(part.description || "")
    if (partNumber.length < 2 && description.length < 6) {
      continue
    }

    // 0 stands in for "no page" so that such parts share a single bucket.
    const pageNumber = part.pageNumber ?? 0

    // A part may have a description but no number; avoid rendering "Part : …".
    let summary: string
    if (!partNumber) {
      summary = `Part: ${description}`
    } else if (description) {
      summary = `Part ${partNumber}: ${description}`
    } else {
      summary = `Part ${partNumber}`
    }

    const bucket = partsByPage.get(pageNumber) || []
    if (bucket.length < 8) {
      bucket.push(summary)
      partsByPage.set(pageNumber, bucket)
    }
  }

  return Array.from(partsByPage.entries()).map(([pageNumber, summaries]) =>
    makeChunk({
      manual,
      pageNumber: pageNumber || null,
      sectionTitle: "Parts reference",
      text: summaries.join("\n"),
      sourceConfidence,
    })
  )
}

/**
 * Flattens per-page part lists into a single list and builds part chunks from
 * it with a source confidence of 0.76. A part without its own page number
 * takes the page number of the list it came from.
 */
function buildExtractedPartListChunks(
  partLists: ExtractedPartList[],
  manual: ManualInternal
) {
  const flattened = partLists.flatMap((partList) =>
    (partList.parts || []).map(
      (part): StructuredPart => ({
        ...part,
        pageNumber: part.pageNumber ?? partList.pageNumber,
      })
    )
  )

  return buildPartChunks(flattened, manual, 0.76)
}

/**
 * Appends `chunks` to `target` and files each of them under its manual id in
 * `chunksByManualId`. Both collections are modified in place.
 */
function addChunks(
  target: ChunkInternal[],
  chunksByManualId: Map<string, ChunkInternal[]>,
  chunks: ChunkInternal[]
) {
  for (const chunk of chunks) {
    target.push(chunk)

    const bucket = chunksByManualId.get(chunk.manualId)
    if (bucket) {
      bucket.push(chunk)
    } else {
      chunksByManualId.set(chunk.manualId, [chunk])
    }
  }
}

/**
 * Resolves the manual a record should be attached to.
 *
 * Order of preference: the already matched manual, an existing entry under the
 * normalized fallback id, and finally a new placeholder entry built from the
 * remaining arguments. A new placeholder is appended to `args.manuals` and
 * registered in `args.manualById`.
 */
function ensureManualEntry(args: {
  manualById: Map<string, ManualInternal>
  manuals: ManualInternal[]
  matchedManual: ManualInternal | null
  fallbackManualId: string
  filename: string
  manufacturer: string
  category: string
  model: string | null
  manualType: string
  manualUrl: string | null
  thumbnailUrl: string | null
}) {
  const { matchedManual, manualById, manuals } = args
  if (matchedManual) {
    return matchedManual
  }

  const manualId = normalizeIdentifier(args.fallbackManualId)
  const known = manualById.get(manualId)
  if (known) {
    return known
  }

  const { filename, manufacturer, category, model, manualType } = args
  // One descriptive string feeds both the search text and the token set.
  const descriptor = [filename, manufacturer, category, model, manualType]
    .filter(Boolean)
    .join(" ")

  const placeholder: ManualInternal = {
    manualId,
    filename,
    manufacturer,
    category,
    model,
    manualType,
    manualUrl: args.manualUrl,
    thumbnailUrl: args.thumbnailUrl,
    searchText: normalizeText(descriptor),
    tokenSet: new Set(tokenize(descriptor)),
    normalizedManufacturer: canonicalManufacturer(manufacturer),
  }

  manuals.push(placeholder)
  manualById.set(placeholder.manualId, placeholder)
  return placeholder
}

/**
 * Normalizes a free-text query and derives everything the scoring functions
 * need from it.
 *
 * Manufacturer aliases are matched as whole words or phrases, not as raw
 * substrings, so short aliases such as "ap", "dn" or "usi" no longer fire
 * inside unrelated words ("cheap", "using"). An alias directly followed by
 * digits ("dn5800") still counts.
 *
 * @param query the user's text
 * @param filters optional explicit manufacturer, manual id and model hints
 * @returns the normalized text, its tokens (plus the model filter, if any),
 *   the detected model tokens, the canonical manufacturers mentioned or
 *   requested, and the normalized manual id filter
 */
function prepareQuery(query: string, filters?: ManualKnowledgeFilters): PreparedQuery {
  const normalized = normalizeText(query)
  const tokenSet = new Set(tokenize(normalized))
  const modelTokens = extractModelTokens(normalized)
  const manufacturerFilters = new Set<string>()

  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    const mentioned = aliases.some((alias) =>
      queryMentionsAlias(normalized, normalizeText(alias))
    )
    if (mentioned) {
      manufacturerFilters.add(canonical)
    }
  }

  if (filters?.manufacturer) {
    manufacturerFilters.add(canonicalManufacturer(filters.manufacturer))
  }

  if (filters?.model) {
    tokenSet.add(normalizeText(filters.model))
  }

  return {
    normalized,
    tokens: Array.from(tokenSet),
    modelTokens,
    manufacturerFilters: Array.from(manufacturerFilters),
    manualIdFilter: filters?.manualId
      ? normalizeIdentifier(filters.manualId)
      : null,
  }
}

/** True when `alias` occurs in `text` without a letter directly before or after it. */
function queryMentionsAlias(text: string, alias: string) {
  if (!alias) {
    return false
  }

  const escaped = alias.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
  // Only letters block a match; digits may touch the alias ("ap113").
  return new RegExp(`(?<!\\p{L})${escaped}(?!\\p{L})`, "u").test(text)
}

/**
 * Computes a relevance score for one manual against a prepared query.
 *
 * Returns 0 for an empty query. Otherwise the score is the sum of:
 * - manual-id filter: +28 on match, -12 on mismatch;
 * - manufacturer filters: +14 when the manual's manufacturer is listed, -6 otherwise;
 * - whole normalized query found in the manual's search text: +24;
 * - per query token: +8 (model token) or +4 when it is in the manual's token
 *   set, else +1.5 when it is at least 4 chars and occurs in the search text;
 * - per model token occurring in the search text: +8;
 * - +6 when the query mentions "error", "not ", "coin" or "bill" and the
 *   manual type contains "repair", "service" or "parts".
 *
 * @param manual - Indexed manual to score.
 * @param query - Output of `prepareQuery`.
 * @returns The (possibly negative) relevance score.
 */
function scoreManual(manual: ManualInternal, query: PreparedQuery) {
  const {
    normalized,
    tokens,
    modelTokens,
    manufacturerFilters,
    manualIdFilter,
  } = query

  if (!normalized) {
    return 0
  }

  let total = 0

  if (manualIdFilter) {
    total += manual.manualId === manualIdFilter ? 28 : -12
  }

  if (manufacturerFilters.length > 0) {
    const sameManufacturer = manufacturerFilters.includes(
      manual.normalizedManufacturer
    )
    total += sameManufacturer ? 14 : -6
  }

  if (manual.searchText.includes(normalized)) {
    total += 24
  }

  for (const token of tokens) {
    if (!token) {
      continue
    }

    if (manual.tokenSet.has(token)) {
      // Exact token hits count double when the token looks like a model number.
      total += modelTokens.includes(token) ? 8 : 4
    } else if (token.length >= 4 && manual.searchText.includes(token)) {
      // Partial (substring) hit; short tokens are skipped to limit noise.
      total += 1.5
    }
  }

  for (const modelToken of modelTokens) {
    if (manual.searchText.includes(modelToken)) {
      total += 8
    }
  }

  const symptomHints = ["error", "not ", "coin", "bill"]
  const boostedTypes = ["repair", "service", "parts"]
  const mentionsSymptom = symptomHints.some((hint) => normalized.includes(hint))

  if (
    mentionsSymptom &&
    boostedTypes.some((type) => manual.manualType.includes(type))
  ) {
    total += 6
  }

  return total
}

/**
 * Computes a relevance score for one chunk, starting from the score already
 * given to the manual that owns it.
 *
 * Returns `manualScore` unchanged for an empty query. Otherwise adds:
 * - `sourceConfidence * 8`;
 * - manufacturer filters: +6 when the chunk's manufacturer is listed, -4 otherwise;
 * - whole normalized query found in the chunk's search text: +18;
 * - per query token: +7 (model token) or +3 when it is in the chunk's token
 *   set, else +1 when it is at least 5 chars and occurs in the search text;
 * - +6 for a section title containing "parts" when the query mentions
 *   "parts", "part", "bill" or "coin";
 * - +5 for a section title containing "troubleshooting" when the query
 *   mentions "error", "not ", "won t" or "wont".
 *
 * @param chunk - Indexed chunk to score.
 * @param query - Output of `prepareQuery`.
 * @param manualScore - Score of the chunk's manual, used as the base.
 * @returns The combined relevance score.
 */
function scoreChunk(
  chunk: ChunkInternal,
  query: PreparedQuery,
  manualScore: number
) {
  const { normalized, tokens, modelTokens, manufacturerFilters } = query

  if (!normalized) {
    return manualScore
  }

  let total = manualScore + chunk.sourceConfidence * 8

  if (manufacturerFilters.length > 0) {
    const sameManufacturer = manufacturerFilters.includes(
      canonicalManufacturer(chunk.manufacturer)
    )
    total += sameManufacturer ? 6 : -4
  }

  if (chunk.searchText.includes(normalized)) {
    total += 18
  }

  for (const token of tokens) {
    if (!token) {
      continue
    }

    if (chunk.tokenSet.has(token)) {
      total += modelTokens.includes(token) ? 7 : 3
    } else if (token.length >= 5 && chunk.searchText.includes(token)) {
      total += 1
    }
  }

  const mentionsAny = (hints: string[]) =>
    hints.some((hint) => normalized.includes(hint))
  const titleText = (chunk.sectionTitle || "").toLowerCase()

  if (
    mentionsAny(["parts", "part", "bill", "coin"]) &&
    titleText.includes("parts")
  ) {
    total += 6
  }

  if (
    mentionsAny(["error", "not ", "won t", "wont"]) &&
    titleText.includes("troubleshooting")
  ) {
    total += 5
  }

  return total
}

/**
 * Builds the candidate record for a scored manual.
 *
 * @param manual - Indexed manual the candidate describes.
 * @param score - Raw relevance score for the manual.
 * @returns The manual's identifying fields plus `score` and a `confidence`
 *   equal to `score / 38` clamped into [0, 1].
 */
function buildCandidate(manual: ManualInternal, score: number): ManualCandidate {
  const {
    manualId,
    filename,
    manufacturer,
    category,
    manualUrl,
    thumbnailUrl,
  } = manual
  const confidence = clamp(score / 38)

  return {
    manualId,
    filename,
    manufacturer,
    category,
    manualUrl,
    thumbnailUrl,
    score,
    confidence,
  }
}

/**
 * Creates an internal, searchable chunk for a piece of manual text.
 *
 * The chunk copies the owning manual's identifying fields, starts with a
 * `matchScore` of 0, and carries a normalized `searchText` (filename,
 * manufacturer, model, section title and text joined together) along with
 * the token set derived from it.
 *
 * @param args.manual - Manual the text belongs to.
 * @param args.pageNumber - Page the text came from, or `null`.
 * @param args.sectionTitle - Section heading, or `null`.
 * @param args.text - Chunk text.
 * @param args.sourceConfidence - Confidence assigned to the text's source.
 * @returns The assembled chunk.
 */
function makeChunk(args: {
  manual: ManualInternal
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceConfidence: number
}) {
  const { manual, pageNumber, sectionTitle, text, sourceConfidence } = args
  const { filename, manufacturer, model } = manual

  // Missing parts (null / empty) are dropped before joining.
  const searchableParts = [filename, manufacturer, model, sectionTitle, text]
  const searchText = normalizeText(searchableParts.filter(Boolean).join(" "))
  const tokenSet = new Set(tokenize(searchText))
  const citation = buildCitation(filename, manufacturer, pageNumber)

  return {
    manualId: manual.manualId,
    filename,
    manufacturer,
    model,
    manualType: manual.manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    sourceConfidence,
    matchScore: 0,
    citation,
    searchText,
    tokenSet,
  } satisfies ChunkInternal
}

/**
 * Projects an internal chunk onto the response shape, leaving out the
 * search-only `searchText` and `tokenSet` fields.
 *
 * @param chunk - Internal chunk.
 * @returns A new object holding only the response fields.
 */
function stripChunkForResponse(chunk: ChunkInternal): ManualKnowledgeChunk {
  const {
    manualId,
    filename,
    manufacturer,
    model,
    manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl,
    thumbnailUrl,
    sourceConfidence,
    matchScore,
    citation,
  } = chunk

  return {
    manualId,
    filename,
    manufacturer,
    model,
    manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl,
    thumbnailUrl,
    sourceConfidence,
    matchScore,
    citation,
  }
}

/**
 * Tidies extracted text: removes soft hyphens (U+00AD), collapses every run
 * of whitespace to a single space and trims both ends.
 *
 * @param value - Raw text.
 * @returns The cleaned text.
 */
function cleanChunkText(value: string) {
  const withoutSoftHyphens = value.replace(/\u00ad/g, "")
  const singleSpaced = withoutSoftHyphens.replace(/\s+/g, " ")
  return singleSpaced.trim()
}

/**
 * Cleans a section title with `cleanChunkText`.
 *
 * @param value - Raw title; may be omitted.
 * @returns The cleaned title, or `null` when nothing is left after cleaning.
 */
function cleanSectionTitle(value?: string) {
  const title = cleanChunkText(value || "")
  return title.length > 0 ? title : null
}

/**
 * Decides whether a piece of text is worth keeping as a chunk.
 *
 * - Empty text is rejected.
 * - Text that is exactly "table of contents" or "contact us"
 *   (case-insensitive) is rejected.
 * - Text shorter than 70 characters is kept only when it mentions one of the
 *   symptom / part keywords.
 * - Longer text is kept when it contains at least 25 ASCII letters.
 *
 * @param text - Candidate chunk text.
 * @returns `true` when the text should be kept.
 */
function isUsefulChunkText(text: string) {
  if (!text) {
    return false
  }

  // Both boilerplate headings are shorter than 70 characters and contain none
  // of the keywords below, so testing for them up front gives the same
  // answer as the short-text rule would.
  const isBoilerplateHeading = /^(table of contents|contact us)$/i.test(text)
  if (isBoilerplateHeading) {
    return false
  }

  const isShort = text.length < 70
  if (isShort) {
    return /error|vend|coin|bill|cool|stuck|motor|sensor|jam|door|part/i.test(
      text
    )
  }

  // Long text must be mostly words rather than numbers or punctuation.
  const letterCount = text.replace(/[^a-z]/gi, "").length
  return letterCount >= 25
}

/**
 * Normalizes text for matching: lower-case ASCII letters and digits, with
 * every run of other characters collapsed to a single space and the ends
 * trimmed.
 *
 * Accented letters are folded to their base letter. NFKD decomposition
 * splits "è" into "e" plus a combining mark; the marks are removed before
 * the non-alphanumeric replacement so they do not turn into spaces in the
 * middle of a word ("Crème" becomes "creme", not "cre me").
 *
 * @param value - Raw text.
 * @returns The normalized text; may be empty.
 */
function normalizeText(value: string) {
  return value
    .normalize("NFKD")
    .replace(/\p{M}+/gu, "")
    .replace(/[^a-zA-Z0-9]+/g, " ")
    .toLowerCase()
    .trim()
}

/**
 * Splits text into search tokens.
 *
 * The text is normalized and split on spaces; tokens of one character or
 * less, stopwords, and page markers ("page", "page12", ...) are discarded.
 *
 * @param value - Raw text.
 * @returns The remaining tokens, in order of appearance (duplicates kept).
 */
function tokenize(value: string) {
  const kept: string[] = []

  for (const piece of normalizeText(value).split(" ")) {
    const token = piece.trim()

    if (token.length <= 1) {
      continue
    }

    if (STOPWORDS.has(token)) {
      continue
    }

    if (/^page\d*$/.test(token)) {
      continue
    }

    kept.push(token)
  }

  return kept
}

/**
 * Builds a hyphen-separated identifier from a filename or id: a trailing
 * `.pdf` / `.json` is removed, the rest is normalized, and whitespace runs
 * become hyphens.
 *
 * @param value - Filename or identifier.
 * @returns The normalized identifier.
 */
function normalizeIdentifier(value: string) {
  const stem = stripExtension(value)
  const words = normalizeText(stem)
  return words.replace(/\s+/g, "-")
}

/**
 * Removes a trailing `.pdf` and then a trailing `.json` (both
 * case-insensitive) from a name.
 *
 * The two removals run in that order, so "a.json.pdf" yields "a" while
 * "a.pdf.json" yields "a.pdf".
 *
 * @param value - Filename or identifier.
 * @returns The name without those extensions.
 */
function stripExtension(value: string) {
  const extensionPatterns = [/\.pdf$/i, /\.json$/i]
  return extensionPatterns.reduce(
    (stem, pattern) => stem.replace(pattern, ""),
    value
  )
}

/**
 * Maps a manufacturer name to its canonical key in `MANUFACTURER_ALIASES`.
 *
 * A name resolves to a canonical key when its normalized form equals the key
 * or contains one of the key's aliases as a whole word or phrase. Entries are
 * tried in declaration order and the first match wins.
 *
 * @param value - Manufacturer name as found in data or user input.
 * @returns The canonical key, or the normalized input when nothing matches.
 */
function canonicalManufacturer(value: string) {
  const normalized = normalizeText(value)

  // normalizeText separates words with single spaces, so padding both sides
  // restricts alias hits to whole words. A bare substring test would map
  // "international" to the owner of the "national" alias and any name
  // containing "ap" to the owner of the "ap" alias.
  const padded = ` ${normalized} `

  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    if (canonical === normalized) {
      return canonical
    }

    const matchesAlias = aliases.some((alias) => {
      const normalizedAlias = normalizeText(alias)
      return (
        normalizedAlias.length > 0 && padded.includes(` ${normalizedAlias} `)
      )
    })

    if (matchesAlias) {
      return canonical
    }
  }

  return normalized
}

/**
 * Looks up the alias list for a manufacturer.
 *
 * @param value - Manufacturer name.
 * @returns The aliases registered under the name's canonical key, or a
 *   one-element list holding the original name when there is no entry.
 */
function aliasesForManufacturer(value: string) {
  const knownAliases = MANUFACTURER_ALIASES[canonicalManufacturer(value)]
  return knownAliases || [value]
}

/**
 * Guesses a model identifier from a manual's filename.
 *
 * @param manual - Manual whose filename is inspected.
 * @returns The first model-like token in the extension-less, normalized
 *   filename, or `null` when there is none.
 */
function guessModelFromManual(manual: Manual) {
  const stem = normalizeText(stripExtension(manual.filename || ""))
  const [firstModelToken] = extractModelTokens(stem)
  return firstModelToken || null
}

/**
 * Reduces a free-form manual type to a short label.
 *
 * Missing or empty input yields "manual". Otherwise the first matching rule
 * wins: contains "part" -> "parts", contains "operator" -> "operator",
 * contains "service" -> "service". Anything else is returned normalized with
 * whitespace replaced by hyphens.
 *
 * @param value - Raw manual type; may be omitted or `null`.
 * @returns The normalized manual type label.
 */
function normalizeManualType(value?: string | null) {
  const text = normalizeText(value || "")

  if (text === "") {
    return "manual"
  }

  // Checked in order, so a type mentioning both parts and service is "parts".
  const rules: ReadonlyArray<readonly [string, string]> = [
    ["part", "parts"],
    ["operator", "operator"],
    ["service", "service"],
  ]

  for (const [needle, label] of rules) {
    if (text.includes(needle)) {
      return label
    }
  }

  return text.replace(/\s+/g, "-")
}

/**
 * Cleans an optional string with `cleanChunkText`.
 *
 * @param value - Raw value; may be omitted or `null`.
 * @returns The cleaned string, or `null` when nothing is left after cleaning.
 */
function normalizeNullable(value?: string | null) {
  const cleaned = cleanChunkText(value || "")
  return cleaned.length > 0 ? cleaned : null
}

/**
 * Finds model-like tokens in text: words made of optional leading letters,
 * at least two consecutive digits, then any further letters or digits.
 *
 * @param value - Raw text; it is normalized before matching.
 * @returns The distinct matches in order of first appearance.
 */
function extractModelTokens(value: string) {
  const found = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9]*\b/g)

  if (found === null) {
    return []
  }

  return [...new Set(found)]
}

/**
 * Tells whether text contains at least one model-like token.
 *
 * @param value - Raw text.
 * @returns `true` when `extractModelTokens` finds anything.
 */
function looksLikeModelTokenQuery(value: string) {
  const modelTokens = extractModelTokens(value)
  return modelTokens.length > 0
}

/**
 * Tells whether a query mentions any entry of `RISKY_MANUAL_KEYWORDS`.
 *
 * Both the query and each keyword are normalized, and a keyword counts as
 * mentioned when it occurs anywhere in the normalized query.
 *
 * @param value - Raw query text.
 * @returns `true` when at least one keyword occurs.
 */
function isRiskyManualQuery(value: string) {
  const haystack = normalizeText(value)

  for (const keyword of RISKY_MANUAL_KEYWORDS) {
    if (haystack.includes(normalizeText(keyword))) {
      return true
    }
  }

  return false
}

/**
 * Builds a human-readable citation for a manual page.
 *
 * @param filename - Manual filename.
 * @param manufacturer - Manufacturer name; may be empty.
 * @param pageNumber - Page number; `null` (or 0) leaves the page part out.
 * @returns The manual label, followed by ", page N" when a page is given.
 */
function buildCitation(
  filename: string,
  manufacturer: string,
  pageNumber: number | null
) {
  const label = buildManualLabel(filename, manufacturer)

  if (!pageNumber) {
    return label
  }

  return `${label}, page ${pageNumber}`
}

/**
 * Builds a display label for a manual: the manufacturer (when present)
 * followed by the humanized, extension-less filename.
 *
 * @param filename - Manual filename.
 * @param manufacturer - Manufacturer name; may be empty.
 * @returns The trimmed label.
 */
function buildManualLabel(filename: string, manufacturer: string) {
  const stem = humanizeToken(stripExtension(filename))
  const parts = manufacturer ? [manufacturer, stem] : [stem]
  return parts.join(" ").trim()
}

/**
 * Makes a slug-like string readable: runs of hyphens and underscores become
 * spaces, whitespace runs collapse to a single space, and the ends are
 * trimmed.
 *
 * @param value - Slug or filename stem.
 * @returns The humanized text.
 */
function humanizeToken(value: string) {
  const separatorsAsSpaces = value.replace(/[-_]+/g, " ")
  const singleSpaced = separatorsAsSpaces.replace(/\s+/g, " ")
  return singleSpaced.trim()
}

/**
 * Sort comparator that orders candidates by descending `score`.
 *
 * @param left - First candidate.
 * @param right - Second candidate.
 * @returns A positive number when `right` scores higher, negative when
 *   `left` does, and 0 on a tie.
 */
function compareByScore(left: ManualCandidate, right: ManualCandidate) {
  const { score: leftScore } = left
  const { score: rightScore } = right
  return rightScore - leftScore
}

/**
 * Clamps a number into the closed interval [0, 1].
 *
 * @param value - Number to clamp.
 * @returns 0 for values below 0, 1 for values above 1, otherwise the value.
 */
function clamp(value: number) {
  const atLeastZero = Math.max(0, value)
  return Math.min(1, atLeastZero)
}

/**
 * Shortens text to at most `maxLength` UTF-16 code units, ending the
 * shortened form with an ellipsis.
 *
 * @param value - Text to shorten.
 * @param maxLength - Maximum length of the result, ellipsis included.
 * @returns `value` unchanged when it already fits; an empty string when
 *   `maxLength` is 0 or negative; otherwise the leading part of `value`
 *   (trailing whitespace removed) followed by "…".
 */
function truncateText(value: string, maxLength: number) {
  if (value.length <= maxLength) {
    return value
  }

  // No room even for the ellipsis. Without this guard slice(0, maxLength - 1)
  // receives a negative end index and returns nearly the whole string.
  if (maxLength <= 0) {
    return ""
  }

  let head = value.slice(0, maxLength - 1)

  // Do not leave half of a surrogate pair (e.g. a cut-off emoji) before the
  // ellipsis; drop the dangling high surrogate instead.
  const lastUnit = head.charCodeAt(head.length - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1)
  }

  return `${head.trimEnd()}…`
}

/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * The parsed value is only cast to `T`; it is not validated. The promise
 * rejects when the file cannot be read or does not contain valid JSON.
 *
 * @param path - Path of the JSON file.
 * @returns The parsed content, typed as `T`.
 */
async function readJsonFile<T>(path: string) {
  const raw = await readFile(path, "utf8")
  const parsed: unknown = JSON.parse(raw)
  return parsed as T
}