1604 lines
39 KiB
TypeScript
1604 lines
39 KiB
TypeScript
import { existsSync } from "node:fs"
|
|
import { readFile, readdir } from "node:fs/promises"
|
|
import { basename, join } from "node:path"
|
|
import { listConvexManuals } from "@/lib/convex-service"
|
|
import { scanManuals } from "@/lib/manuals"
|
|
import { getManualsDataRoot } from "@/lib/manuals-paths"
|
|
import {
|
|
buildManualAssetUrl,
|
|
buildThumbnailAssetUrl,
|
|
} from "@/lib/manuals-storage"
|
|
import type { Manual } from "@/lib/manuals-types"
|
|
import { getManualUrl, getThumbnailUrl } from "@/lib/manuals-types"
|
|
|
|
/** Path segments (below the manuals data root) of the optimized-manuals folder. */
const MANUALS_OPTIMIZED_ROOT = ["manuals_optimized"]

/** Path segments of the directory holding one structured JSON record per manual. */
const STRUCTURED_MANUALS_DIR = MANUALS_OPTIMIZED_ROOT.concat(
  "training_data",
  "structured"
)

/** Path segments of the single JSON file holding extracted manual content. */
const EXTRACTED_CONTENT_FILE = MANUALS_OPTIMIZED_ROOT.concat(
  "extracted_content",
  "manuals_extracted_content.json"
)

/** Tenant id used when no tenant environment variable supplies one. */
const DEFAULT_MANUALS_PLATFORM_TENANT_ID = "rocky-mountain-vending"
|
|
|
|
/**
 * Low-signal words for query matching.
 * NOTE(review): the consumer is not visible in this part of the file —
 * presumably the tokenizer drops these; confirm before relying on it.
 */
const STOPWORDS = new Set([
  "a", "an", "and", "are", "at", "be", "by", "for",
  "from", "help", "how", "i", "in", "is", "it", "machine",
  "manual", "me", "my", "of", "on", "or", "our", "please",
  "service", "that", "the", "this", "to", "up", "with",
])
|
|
|
|
/**
 * Phrases that mark a question as technical or hazardous.
 * NOTE(review): presumably read by isRiskyManualQuery, which is defined
 * outside the visible part of the file — confirm.
 */
const RISKY_MANUAL_KEYWORDS = [
  "wiring", "diagram", "voltage", "compressor", "refrigerant",
  "bypass", "jumper", "board level", "schematic", "electrical",
  "rewire", "disassemble", "tear down",
]
|
|
|
|
/**
 * Phrases whose presence in a normalized chat query signals that manual
 * knowledge is relevant. shouldUseManualKnowledgeForChat tests them with a
 * plain substring check.
 */
const MANUAL_QUERY_HINTS = [
  "manual", "model", "serial", "error", "code", "parts",
  "part", "troubleshoot", "troubleshooting", "not cooling", "not vending", "coin",
  "bill acceptor", "bill", "coin mech", "validator", "jam", "stuck",
  "door", "display", "keypad", "compressor", "motor", "sensor",
]
|
|
|
|
/**
 * Canonical manufacturer name mapped to the spellings, abbreviations and
 * product-line names that should resolve to it. prepareQuery scans the
 * normalized query for these aliases to derive manufacturer filters.
 */
const MANUFACTURER_ALIASES: Record<string, string[]> = {
  "automatic products": ["automatic products", "automatic-products", "ap"],
  coinco: ["coinco"],
  crane: [
    "crane",
    "national",
    "national vendors",
    "merchant",
    "merchant series",
    "shopper",
  ],
  "dixie narco": ["dixie", "narco", "dixie narco", "dixie-narco", "dn", "bevmax"],
  gpl: ["gpl", "general products"],
  "mei mars": ["mei", "mars", "mei mars", "bill validator"],
  "royal vendors": ["royal", "royal vendors", "royal vendor", "rvv"],
  rowe: ["rowe"],
  seaga: ["seaga"],
  usi: ["usi", "u select it", "u-select-it", "uselectit"],
  vendo: ["vendo", "sanden"],
}
|
|
|
|
/** A manual offered as a possible match for a query. */
export type ManualCandidate = {
  manualId: string
  filename: string
  manufacturer: string
  category: string
  /** Link to the manual document, or null when none is known. */
  manualUrl: string | null
  /** Link to a preview image, or null when none is known. */
  thumbnailUrl: string | null
  /** Relevance score for the query; the provider keeps only scores above 0. */
  score: number
  /** Confidence in the match; the provider asks for clarification below 0.48. */
  confidence: number
}
|
|
|
|
/** An excerpt of manual text together with the manual it came from. */
export type ManualKnowledgeChunk = {
  manualId: string
  filename: string
  manufacturer: string
  model: string | null
  manualType: string
  /** Page the excerpt came from, or null when the page is unknown. */
  pageNumber: number | null
  sectionTitle: string | null
  /** The excerpt itself. */
  text: string
  manualUrl: string | null
  thumbnailUrl: string | null
  /** How trustworthy the source of this excerpt is considered to be. */
  sourceConfidence: number
  /** Relevance score of this excerpt for the query it was retrieved for. */
  matchScore: number
  /** Human-readable reference to the excerpt's source. */
  citation: string
}
|
|
|
|
/** Optional hints that bias retrieval toward a manufacturer, manual or model. */
export type ManualKnowledgeFilters = {
  manufacturer?: string | null
  manualId?: string | null
  model?: string | null
}
|
|
|
|
/** Everything retrieved for a single manual-knowledge query. */
export type RetrieveManualContextResult = {
  /** The query exactly as it was passed in. */
  query: string
  /** The manual judged most relevant, or null when nothing scored. */
  bestManual: ManualCandidate | null
  manualCandidates: ManualCandidate[]
  /** Highest-scoring excerpts, best first. */
  topChunks: ManualKnowledgeChunk[]
  /** True when the match is too weak to answer without asking for more detail. */
  needsClarification: boolean
  /** True when the query was classified as technical or hazardous. */
  isRisky: boolean
}
|
|
|
|
/** A manual (null when the id is unknown) and the excerpts cited from it. */
export type ManualCitationContext = {
  manual: ManualCandidate | null
  citations: ManualKnowledgeChunk[]
}
|
|
|
|
/** Compact, log-friendly description of one retrieval attempt. */
export type ManualRetrievalSummary = {
  /** False when retrieval was skipped; the result fields are then empty or null. */
  ran: boolean
  query: string
  bestManualId: string | null
  manualCandidateIds: string[]
  topChunkCitations: string[]
  /** Null when retrieval did not run or produced no result. */
  needsClarification: boolean | null
  /** Null when retrieval did not run or produced no result. */
  isRisky: boolean | null
  /** Message describing the failure, or null when no error was supplied. */
  error: string | null
}
|
|
|
|
/** Contract for anything that can answer manual-knowledge lookups. */
export interface ManualKnowledgeProvider {
  /** Finds manuals that may match a free-text query. */
  findManualCandidates(query: string): Promise<ManualCandidate[]>

  /** Retrieves candidate manuals and excerpts for a query, optionally filtered. */
  retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult>

  /** Looks up one manual and its excerpts, optionally limited to a single page. */
  getManualCitationContext(
    manualId: string,
    pageNumber?: number
  ): Promise<ManualCitationContext>
}
|
|
|
|
/** One titled section of a structured manual record; every field may be absent. */
type StructuredSection = {
  title?: string
  pageNumber?: number
  text?: string
}

/** One problem/solution pair from a structured manual record. */
type StructuredTroubleshooting = {
  problem?: string
  solution?: string
  pageNumber?: number
}

/** One part entry (number and/or description) from a manual. */
type StructuredPart = {
  partNumber?: string
  description?: string
  pageNumber?: number
}

/** Shape of a structured per-manual JSON file as read from disk. */
type StructuredManualRecord = {
  manualId?: string
  manufacturer?: string
  model?: string
  manualType?: string
  content?: {
    sections?: StructuredSection[]
    troubleshooting?: StructuredTroubleshooting[]
    partsDatabase?: StructuredPart[]
  }
}
|
|
|
|
/** Raw text of a single page in the extracted-content file. */
type ExtractedPage = {
  pageNumber?: number
  text?: string
  wordCount?: number
}

/** Parts found on one page; the page number acts as a fallback for each part. */
type ExtractedPartList = {
  pageNumber?: number
  parts?: StructuredPart[]
}

/** One manual's entry in the extracted-content JSON file. */
type ExtractedManualRecord = {
  filename?: string
  sections?: StructuredSection[]
  partsLists?: ExtractedPartList[]
  text?: {
    fullText?: string
    pages?: ExtractedPage[]
  }
}
|
|
|
|
/** One entry of the shared platform's per-tenant manuals.json. */
type SharedPlatformManualRecord = {
  manualId: string
  filename: string
  manufacturer: string
  category: string | null
  model: string | null
  manualType: string
  sourcePath?: string | null
  manualUrl?: string | null
  thumbnailUrl?: string | null
  sourceFilenames?: string[]
}

/** One entry of the shared platform's per-tenant chunks.json. */
type SharedPlatformChunkRecord = {
  manualId: string
  title?: string
  manufacturer: string
  model: string | null
  manualType: string
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  manualUrl?: string | null
  thumbnailUrl?: string | null
  /** Preferred source for the chunk's confidence when present. */
  metadataConfidence?: number
  /** Used for the chunk's confidence when metadataConfidence is absent. */
  overallQuality?: number
}
|
|
|
|
/** A query after normalization, tokenization and filter resolution. */
type PreparedQuery = {
  normalized: string
  tokens: string[]
  modelTokens: string[]
  /** Canonical manufacturer names detected in the query or supplied as a filter. */
  manufacturerFilters: string[]
  manualIdFilter: string | null
}

/** A manual plus the precomputed text and token index used for scoring. */
type ManualInternal = {
  manualId: string
  filename: string
  manufacturer: string
  category: string
  model: string | null
  manualType: string
  manualUrl: string | null
  thumbnailUrl: string | null
  searchText: string
  tokenSet: Set<string>
  normalizedManufacturer: string
}

/** A response chunk extended with the search index fields used for scoring. */
type ChunkInternal = Omit<ManualKnowledgeChunk, "matchScore"> & {
  searchText: string
  tokenSet: Set<string>
  matchScore: number
}

/** The complete in-memory index: manuals, chunks, and lookups by manual id. */
type ManualKnowledgeArtifact = {
  manuals: ManualInternal[]
  manualById: Map<string, ManualInternal>
  chunks: ChunkInternal[]
  chunksByManualId: Map<string, ChunkInternal[]>
}

/** A provider that can also drop its cached index. */
type ResettableManualKnowledgeProvider = ManualKnowledgeProvider & {
  resetCache(): void
}
|
|
|
|
/** Module-level provider instance; stays null until first requested. */
let defaultProvider: ManualKnowledgeProvider | null = null

/**
 * Returns the shared manual knowledge provider, creating the local
 * implementation the first time it is requested.
 */
export function getManualKnowledgeProvider(): ManualKnowledgeProvider {
  if (defaultProvider !== null) {
    return defaultProvider
  }

  const created = new LocalManualKnowledgeProvider()
  defaultProvider = created
  return created
}
|
|
|
|
/** Finds candidate manuals for a query using the shared provider. */
export async function findManualCandidates(query: string) {
  const provider = getManualKnowledgeProvider()
  const candidates = await provider.findManualCandidates(query)
  return candidates
}
|
|
|
|
/** Retrieves manual context for a query (optionally filtered) using the shared provider. */
export async function retrieveManualContext(
  query: string,
  filters?: ManualKnowledgeFilters
) {
  const provider = getManualKnowledgeProvider()
  const context = await provider.retrieveManualContext(query, filters)
  return context
}
|
|
|
|
/** Looks up a manual and its excerpts (optionally for one page) using the shared provider. */
export async function getManualCitationContext(
  manualId: string,
  pageNumber?: number
) {
  const provider = getManualKnowledgeProvider()
  const context = await provider.getManualCitationContext(manualId, pageNumber)
  return context
}
|
|
|
|
/**
 * Drops the shared provider's cached index when that provider supports it.
 * Does nothing if no provider exists yet or it has no resetCache method.
 */
export function resetManualKnowledgeCache() {
  const provider =
    defaultProvider as Partial<ResettableManualKnowledgeProvider> | null

  if (!provider) {
    return
  }

  if (typeof provider.resetCache === "function") {
    provider.resetCache()
  }
}
|
|
|
|
/**
 * Converts whatever was thrown or passed as an error into a message.
 *
 * Returns null only when no error was supplied (null/undefined). Error
 * instances yield their message and strings are returned unchanged. Other
 * values — plain objects, numbers, and so on — are still reported, so a
 * failure can never be summarized as "no error".
 */
function describeRetrievalError(error: unknown): string | null {
  if (error === null || error === undefined) {
    return null
  }

  if (error instanceof Error) {
    return error.message
  }

  if (typeof error === "string") {
    return error
  }

  if (typeof error === "object" && "message" in error) {
    const message = (error as { message: unknown }).message
    if (typeof message === "string") {
      return message
    }
  }

  try {
    // JSON.stringify returns undefined for functions and symbols.
    return JSON.stringify(error) ?? String(error)
  } catch {
    // Circular structures and bigint values make JSON.stringify throw.
    return String(error)
  }
}

/**
 * Builds a compact summary of a retrieval attempt.
 *
 * @param args.ran - Whether retrieval was attempted. When false, every result
 *   field is empty or null regardless of `args.result`.
 * @param args.query - The query, copied into the summary unchanged.
 * @param args.result - The retrieval result, if any; only the first four
 *   candidates and the first four chunk citations are kept.
 * @param args.error - Anything thrown or reported by the retrieval.
 * @returns The summary; `error` is null only when no error was supplied.
 */
export function summarizeManualRetrieval(args: {
  ran: boolean
  query: string
  result?: RetrieveManualContextResult | null
  error?: unknown
}): ManualRetrievalSummary {
  const error = describeRetrievalError(args.error)

  if (!args.ran) {
    return {
      ran: false,
      query: args.query,
      bestManualId: null,
      manualCandidateIds: [],
      topChunkCitations: [],
      needsClarification: null,
      isRisky: null,
      error,
    }
  }

  const result = args.result

  return {
    ran: true,
    query: args.query,
    bestManualId: result?.bestManual?.manualId || null,
    manualCandidateIds: (result?.manualCandidates ?? [])
      .slice(0, 4)
      .map((candidate) => candidate.manualId),
    topChunkCitations: (result?.topChunks ?? [])
      .slice(0, 4)
      .map((chunk) => chunk.citation),
    needsClarification: result?.needsClarification ?? null,
    isRisky: result?.isRisky ?? null,
    error,
  }
}
|
|
|
|
/**
 * Decides whether a chat message should trigger manual-knowledge retrieval.
 *
 * An empty query (after normalization) never does. Otherwise the answer is
 * true when the query looks like it contains a model token or contains any
 * of MANUAL_QUERY_HINTS as a substring.
 *
 * NOTE(review): the manual/repair/parts intent branch and the fall-through
 * path return the same value, so `intent` does not currently change the
 * outcome — confirm whether that is intended.
 */
export function shouldUseManualKnowledgeForChat(
  intent: string | null | undefined,
  query: string
) {
  const normalizedIntent = normalizeText(intent || "")
  const normalizedQuery = normalizeText(query)

  if (!normalizedQuery) {
    return false
  }

  const mentionsModel = looksLikeModelTokenQuery(normalizedQuery)
  const hasManualHints =
    mentionsModel ||
    MANUAL_QUERY_HINTS.some((hint) => normalizedQuery.includes(hint))

  const intentIsManualRelated = ["manual", "repair", "parts"].some((keyword) =>
    normalizedIntent.includes(keyword)
  )
  if (intentIsManualRelated) {
    return hasManualHints
  }

  return mentionsModel ? true : hasManualHints
}
|
|
|
|
/**
 * Renders a retrieval result as newline-separated instructions for a prompt.
 *
 * The output starts with a header, a grounding rule and a safety rule, then
 * lists up to three candidate manuals. With no excerpts it ends by asking
 * for identifying details; otherwise it adds an optional low-confidence
 * notice followed by up to three excerpts truncated to 420 characters.
 */
export function formatManualContextForPrompt(
  result: RetrieveManualContextResult
) {
  const hasExcerpts = result.topChunks.length > 0
  const lines: string[] = ["Manual knowledge context:"]

  lines.push(
    hasExcerpts
      ? "- Use only the excerpts below for any manuals, parts, or troubleshooting reply."
      : "- No reliable manual excerpt was found."
  )
  lines.push(
    result.isRisky
      ? "- The question looks technical or risky. Stay high-level and safe, and do not provide procedural repair steps."
      : "- Stay limited to simple identification, likely issue category, and very basic safe checks."
  )

  const shownCandidates = result.manualCandidates.slice(0, 3)
  if (shownCandidates.length > 0) {
    lines.push("Likely manual candidates:")
    shownCandidates.forEach((candidate) => {
      lines.push(
        `- ${buildManualLabel(candidate.filename, candidate.manufacturer)}`
      )
    })
  }

  if (!hasExcerpts) {
    lines.push(
      "- Ask for the brand on the front, model sticker, or a clear photo/video, and offer texting it in for the team to review."
    )
    return lines.join("\n")
  }

  if (result.needsClarification) {
    lines.push(
      "- Confidence is limited. Ask for the brand on the front, the model sticker, or a clear photo/video before sounding certain."
    )
  }

  lines.push("Grounded excerpts:")
  result.topChunks.slice(0, 3).forEach((chunk) => {
    lines.push(`- ${chunk.citation}: ${truncateText(chunk.text, 420)}`)
  })

  return lines.join("\n")
}
|
|
|
|
/**
 * Provider backed by an in-memory index that is built once on first use and
 * reused until resetCache() is called or the build fails.
 */
class LocalManualKnowledgeProvider implements ManualKnowledgeProvider {
  /** In-flight or completed index build; null when nothing is cached. */
  private artifactPromise: Promise<ManualKnowledgeArtifact> | null = null

  /** Returns up to six manuals with a positive score, ordered by compareByScore. */
  async findManualCandidates(query: string) {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query)

    return artifact.manuals
      .map((manual) => buildCandidate(manual, scoreManual(manual, prepared)))
      .filter((candidate) => candidate.score > 0)
      .sort(compareByScore)
      .slice(0, 6)
  }

  /**
   * Scores every manual and chunk against the query and returns the best
   * manual, up to four candidates and up to five excerpts.
   *
   * `needsClarification` is set when there are no excerpts, the best excerpt
   * scores below 24, or the top manual's confidence is below 0.48.
   */
  async retrieveManualContext(
    query: string,
    filters?: ManualKnowledgeFilters
  ): Promise<RetrieveManualContextResult> {
    const artifact = await this.getArtifact()
    const prepared = prepareQuery(query, filters)

    const manualCandidates = artifact.manuals
      .map((manual) => ({ manual, score: scoreManual(manual, prepared) }))
      .filter((entry) => entry.score > 0)
      .sort((left, right) => right.score - left.score)

    // Index scores by manual id once so each chunk needs a single lookup
    // instead of a scan over every candidate. The list is sorted best-first
    // and the first entry per id wins, which matches a find() over it.
    const scoreByManualId = new Map<string, number>()
    for (const entry of manualCandidates) {
      if (!scoreByManualId.has(entry.manual.manualId)) {
        scoreByManualId.set(entry.manual.manualId, entry.score)
      }
    }

    const candidateIds = new Set(
      manualCandidates.slice(0, 10).map((entry) => entry.manual.manualId)
    )

    const topChunks = artifact.chunks
      .map((chunk) => {
        const manualScore = scoreByManualId.get(chunk.manualId) ?? 0
        // Chunks from manuals outside the ten best candidates contribute
        // only 35% of their manual's score.
        const manualContribution = candidateIds.has(chunk.manualId)
          ? manualScore
          : manualScore * 0.35

        return { chunk, score: scoreChunk(chunk, prepared, manualContribution) }
      })
      .filter((entry) => entry.score > 10)
      .sort((left, right) => right.score - left.score)
      .slice(0, 5)
      .map(({ chunk, score }) => ({
        ...stripChunkForResponse(chunk),
        matchScore: score,
      }))

    const topEntry = manualCandidates[0]
    const leadingCandidate = topEntry
      ? buildCandidate(topEntry.manual, topEntry.score)
      : null
    const topChunk = topChunks[0]

    const needsClarification =
      !topChunk ||
      topChunk.matchScore < 24 ||
      leadingCandidate === null ||
      leadingCandidate.confidence < 0.48

    // Prefer the manual that owns the best excerpt; otherwise fall back to
    // the highest-scoring manual.
    const bestManualFromChunks = topChunk
      ? manualCandidates.find(
          (entry) => entry.manual.manualId === topChunk.manualId
        )
      : null
    const bestManual = bestManualFromChunks
      ? buildCandidate(bestManualFromChunks.manual, bestManualFromChunks.score)
      : leadingCandidate

    return {
      query,
      bestManual,
      manualCandidates: manualCandidates
        .slice(0, 4)
        .map(({ manual, score }) => buildCandidate(manual, score)),
      topChunks,
      needsClarification,
      isRisky: isRiskyManualQuery(query),
    }
  }

  /**
   * Returns the manual for `manualId` (null if unknown) and up to five of its
   * chunks, restricted to `pageNumber` when a number is given.
   */
  async getManualCitationContext(manualId: string, pageNumber?: number) {
    const artifact = await this.getArtifact()
    const manual = artifact.manualById.get(manualId) || null
    const chunks = artifact.chunksByManualId.get(manualId) || []
    const citations = chunks
      .filter((chunk) =>
        typeof pageNumber === "number" ? chunk.pageNumber === pageNumber : true
      )
      .slice(0, 5)
      .map(stripChunkForResponse)

    return {
      manual: manual ? buildCandidate(manual, 1) : null,
      citations,
    }
  }

  /**
   * Returns the cached index, starting a build if none is cached.
   *
   * A failed build is removed from the cache so the next call retries
   * instead of replaying the same rejection until resetCache() is called.
   */
  private async getArtifact() {
    let pending = this.artifactPromise

    if (!pending) {
      const started = buildArtifact()
      this.artifactPromise = started
      // Only clear the slot if it still holds this build; resetCache() or a
      // newer build may have replaced it in the meantime.
      started.catch(() => {
        if (this.artifactPromise === started) {
          this.artifactPromise = null
        }
      })
      pending = started
    }

    return await pending
  }

  /** Discards the cached index so the next lookup rebuilds it. */
  resetCache() {
    this.artifactPromise = null
  }
}
|
|
|
|
/**
 * Builds the in-memory manual index.
 *
 * The shared platform artifact is returned as-is when it is available.
 * Otherwise the index is assembled from the manual catalog plus structured
 * records and, for manuals that produced no structured chunks, chunks built
 * from the extracted-content file.
 */
async function buildArtifact(): Promise<ManualKnowledgeArtifact> {
  const sharedArtifact = await loadSharedPlatformArtifact()
  if (sharedArtifact) {
    return sharedArtifact
  }

  // The three sources do not depend on one another, so load them together.
  const [manuals, extractedByFilename, structuredRecords] = await Promise.all([
    loadManualCatalog(),
    loadExtractedContentByFilename(),
    loadStructuredManualRecords(),
  ])

  const manualById = new Map<string, ManualInternal>()
  for (const manual of manuals) {
    manualById.set(manual.manualId, manual)
  }

  const chunks: ChunkInternal[] = []
  const chunksByManualId = new Map<string, ChunkInternal[]>()
  const manualsWithStructuredChunks = new Set<string>()

  for (const record of structuredRecords) {
    const matchedManual = matchStructuredRecordToManual(record, manuals)
    // A record that matches no catalog manual gets a synthetic entry, which
    // ensureManualEntry also adds to `manuals` and `manualById`.
    const manual = ensureManualEntry({
      manualById,
      manuals,
      matchedManual,
      fallbackManualId: normalizeIdentifier(record.manualId || "unknown-manual"),
      filename: matchedManual?.filename || `${record.manualId || "manual"}.pdf`,
      manufacturer:
        matchedManual?.manufacturer ||
        humanizeToken(record.manufacturer || "Unknown"),
      category: matchedManual?.category || record.manualType || "manual",
      model: normalizeNullable(record.model),
      manualType: normalizeManualType(record.manualType),
      manualUrl: matchedManual?.manualUrl || null,
      thumbnailUrl: matchedManual?.thumbnailUrl || null,
    })

    const structuredChunks = buildStructuredChunks(record, manual)
    if (structuredChunks.length > 0) {
      manualsWithStructuredChunks.add(manual.manualId)
      addChunks(chunks, chunksByManualId, structuredChunks)
    }
  }

  for (const extracted of extractedByFilename.values()) {
    const matchedManual = matchExtractedRecordToManual(extracted, manuals)
    // A matched manual is used directly; only unmatched records need a
    // synthetic catalog entry.
    const sourceManual =
      matchedManual ??
      ensureManualEntry({
        manualById,
        manuals,
        matchedManual: null,
        fallbackManualId: normalizeIdentifier(
          stripExtension(extracted.filename || "manual")
        ),
        filename: extracted.filename || "manual.pdf",
        manufacturer: "Unknown",
        category: "manual",
        model: null,
        manualType: "manual",
        manualUrl: null,
        thumbnailUrl: null,
      })

    // Structured chunks take precedence over extracted text for a manual.
    if (manualsWithStructuredChunks.has(sourceManual.manualId)) {
      continue
    }

    const fallbackChunks = buildExtractedFallbackChunks(extracted, sourceManual)
    if (fallbackChunks.length > 0) {
      addChunks(chunks, chunksByManualId, fallbackChunks)
    }
  }

  return {
    manuals,
    manualById,
    chunks,
    chunksByManualId,
  }
}
|
|
|
|
/**
 * Loads the prebuilt index published by the shared manuals platform.
 *
 * Reads `output/tenants/<tenantId>/manuals.json` and `chunks.json` under the
 * platform root. Returns null when no platform root can be resolved or
 * either file is missing.
 */
async function loadSharedPlatformArtifact(): Promise<ManualKnowledgeArtifact | null> {
  const platformRoot = resolveManualsPlatformRoot()
  if (!platformRoot) {
    return null
  }

  const tenantDir = join(
    platformRoot,
    "output",
    "tenants",
    getManualsPlatformTenantId()
  )
  const manualsPath = join(tenantDir, "manuals.json")
  const chunksPath = join(tenantDir, "chunks.json")

  const hasBothFiles = existsSync(manualsPath) && existsSync(chunksPath)
  if (!hasBothFiles) {
    return null
  }

  const [manualRecords, chunkRecords] = await Promise.all([
    readJsonFile<SharedPlatformManualRecord[]>(manualsPath),
    readJsonFile<SharedPlatformChunkRecord[]>(chunksPath),
  ])

  const manuals: ManualInternal[] = []
  const manualById = new Map<string, ManualInternal>()

  for (const record of manualRecords) {
    // Everything searchable about the manual, joined into one string.
    const searchTerms = [
      record.filename,
      record.sourcePath,
      record.manufacturer,
      record.category,
      record.model,
      record.manualType,
      ...(record.sourceFilenames || []),
    ]
      .filter(Boolean)
      .join(" ")

    const manual: ManualInternal = {
      manualId: record.manualId,
      filename: record.filename,
      manufacturer: record.manufacturer,
      category: record.category || record.manualType || "manual",
      model: record.model || null,
      manualType: normalizeManualType(record.manualType),
      manualUrl: toSiteManualUrl(record.manualUrl || record.sourcePath || null),
      thumbnailUrl: toSiteThumbnailUrl(record.thumbnailUrl || null),
      searchText: normalizeText(searchTerms),
      tokenSet: new Set(tokenize(searchTerms)),
      normalizedManufacturer: canonicalManufacturer(record.manufacturer),
    }

    manuals.push(manual)
    manualById.set(manual.manualId, manual)
  }

  const chunks: ChunkInternal[] = []
  const chunksByManualId = new Map<string, ChunkInternal[]>()

  for (const record of chunkRecords) {
    // Prefer the owning manual's metadata; fall back to the chunk's own.
    const owner = manualById.get(record.manualId) || null
    const filename =
      owner?.filename || humanizeToken(record.title || record.manualId)
    const manufacturer = owner?.manufacturer || record.manufacturer
    const manualUrl =
      owner?.manualUrl || toSiteManualUrl(record.manualUrl || null)
    const thumbnailUrl =
      owner?.thumbnailUrl || toSiteThumbnailUrl(record.thumbnailUrl || null)

    const searchParts = [
      filename,
      manufacturer,
      record.model,
      record.sectionTitle,
      record.text,
      record.manualType,
    ]
    const searchText = normalizeText(searchParts.filter(Boolean).join(" "))

    const chunk: ChunkInternal = {
      manualId: record.manualId,
      filename,
      manufacturer,
      model: record.model,
      manualType: normalizeManualType(record.manualType),
      pageNumber: record.pageNumber,
      sectionTitle: record.sectionTitle,
      text: record.text,
      manualUrl,
      thumbnailUrl,
      sourceConfidence: clamp(
        record.metadataConfidence ?? record.overallQuality ?? 0.76
      ),
      matchScore: 0,
      citation: buildCitation(filename, manufacturer, record.pageNumber),
      searchText,
      tokenSet: new Set(tokenize(searchText)),
    }

    chunks.push(chunk)

    const bucket = chunksByManualId.get(chunk.manualId)
    if (bucket) {
      bucket.push(chunk)
    } else {
      chunksByManualId.set(chunk.manualId, [chunk])
    }
  }

  return { manuals, manualById, chunks, chunksByManualId }
}
|
|
|
|
/**
 * Finds the manuals platform checkout.
 *
 * Tries, in order: the MANUALS_PLATFORM_ROOT environment variable, a sibling
 * `manuals-platform` directory next to the working directory, and a
 * `manuals-platform` directory inside it. Returns the first path that
 * exists, or null when none does.
 */
function resolveManualsPlatformRoot() {
  const workingDirectory = process.cwd()

  const candidates = [
    process.env.MANUALS_PLATFORM_ROOT,
    join(workingDirectory, "..", "manuals-platform"),
    join(workingDirectory, "manuals-platform"),
  ].filter((candidate): candidate is string => Boolean(candidate))

  const found = candidates.find((candidate) => existsSync(candidate))
  return found ?? null
}
|
|
|
|
/**
 * Resolves the tenant id used to locate the shared platform output.
 *
 * Checks MANUALS_PLATFORM_TENANT_ID, then SITE_MANUALS_TENANT_ID, and returns
 * the first value that is non-empty after trimming. Each variable is trimmed
 * before it is tested, so a blank or whitespace-only value falls through to
 * the next source instead of masking it.
 *
 * @returns The trimmed tenant id, or DEFAULT_MANUALS_PLATFORM_TENANT_ID.
 */
function getManualsPlatformTenantId() {
  const configuredValues = [
    process.env.MANUALS_PLATFORM_TENANT_ID,
    process.env.SITE_MANUALS_TENANT_ID,
  ]

  for (const value of configuredValues) {
    const trimmed = value?.trim()
    if (trimmed) {
      return trimmed
    }
  }

  return DEFAULT_MANUALS_PLATFORM_TENANT_ID
}
|
|
|
|
/**
 * Converts a stored manual location into a site URL.
 *
 * Empty values yield null and absolute http(s) URLs pass through unchanged.
 * Anything else has a leading `manuals/` and then a leading `/manuals/`
 * stripped (both case-insensitive) before buildManualAssetUrl is applied.
 */
function toSiteManualUrl(value: string | null) {
  if (!value) {
    return null
  }

  const isAbsoluteUrl = /^https?:\/\//i.test(value)
  if (isAbsoluteUrl) {
    return value
  }

  const withoutBarePrefix = value.replace(/^manuals\//i, "")
  const relativePath = withoutBarePrefix.replace(/^\/manuals\//i, "")
  return buildManualAssetUrl(relativePath)
}
|
|
|
|
/**
 * Converts a stored thumbnail location into a site URL.
 *
 * Empty values yield null and absolute http(s) URLs pass through unchanged.
 * Anything else has a leading `thumbnails/` and then a leading
 * `/thumbnails/` stripped (both case-insensitive) before
 * buildThumbnailAssetUrl is applied.
 */
function toSiteThumbnailUrl(value: string | null) {
  if (!value) {
    return null
  }

  const isAbsoluteUrl = /^https?:\/\//i.test(value)
  if (isAbsoluteUrl) {
    return value
  }

  const withoutBarePrefix = value.replace(/^thumbnails\//i, "")
  const relativePath = withoutBarePrefix.replace(/^\/thumbnails\//i, "")
  return buildThumbnailAssetUrl(relativePath)
}
|
|
|
|
/**
 * Loads the manual catalog and converts it to searchable internal entries.
 *
 * Convex is queried first; the filesystem scan runs only when Convex returns
 * nothing. A failure of either source is treated as an empty list. Entries
 * are keyed by normalized path (or filename), so a later duplicate replaces
 * an earlier one.
 */
async function loadManualCatalog() {
  const convexManuals = await listConvexManuals().catch(() => [] as Manual[])
  const sourceManuals =
    convexManuals.length > 0
      ? convexManuals
      : await scanManuals().catch(() => [] as Manual[])

  const byManualId = new Map<string, ManualInternal>()

  for (const manual of sourceManuals) {
    const manualId = normalizeIdentifier(manual.path || manual.filename)
    const manufacturer = manual.manufacturer || "Unknown"
    const filename = manual.filename || basename(manual.path)
    const category = manual.category || "manual"
    const model = guessModelFromManual(manual)

    // Every string a user might search by, joined into one blob.
    const searchParts = [
      filename,
      stripExtension(filename),
      manual.path,
      manufacturer,
      category,
      ...(manual.searchTerms || []),
      ...(manual.commonNames || []),
      ...aliasesForManufacturer(manufacturer),
      model || "",
    ]
    const searchTerms = searchParts.filter(Boolean).join(" ")

    const entry: ManualInternal = {
      manualId,
      filename,
      manufacturer,
      category,
      model,
      manualType: normalizeManualType(category),
      manualUrl: getManualUrl(manual),
      thumbnailUrl: getThumbnailUrl(manual),
      searchText: normalizeText(searchTerms),
      tokenSet: new Set(tokenize(searchTerms)),
      normalizedManufacturer: canonicalManufacturer(manufacturer),
    }
    byManualId.set(manualId, entry)
  }

  return [...byManualId.values()]
}
|
|
|
|
/**
 * Reads every `.json` file in the structured manuals directory.
 *
 * Each record keeps its own manualId when it has one; otherwise the file
 * name without its extension is used. Files are read concurrently.
 */
async function loadStructuredManualRecords() {
  const directory = join(getManualsDataRoot(), ...STRUCTURED_MANUALS_DIR)
  const entries = await readdir(directory, { withFileTypes: true })

  const jsonFilenames: string[] = []
  for (const entry of entries) {
    if (entry.isFile() && entry.name.toLowerCase().endsWith(".json")) {
      jsonFilenames.push(entry.name)
    }
  }

  const readRecord = async (filename: string) => {
    const parsed = await readJsonFile<StructuredManualRecord>(
      join(directory, filename)
    )
    return {
      ...parsed,
      manualId: parsed.manualId || stripExtension(filename),
    }
  }

  return await Promise.all(jsonFilenames.map((filename) => readRecord(filename)))
}
|
|
|
|
/**
 * Reads the extracted-content file and indexes its records by normalized
 * filename. Records without a filename are ignored; when two records
 * normalize to the same key, the later one wins.
 */
async function loadExtractedContentByFilename() {
  const contentPath = join(getManualsDataRoot(), ...EXTRACTED_CONTENT_FILE)
  const records = await readJsonFile<ExtractedManualRecord[]>(contentPath)

  const byFilename = new Map<string, ExtractedManualRecord>()
  records.forEach((record) => {
    if (record.filename) {
      byFilename.set(normalizeIdentifier(record.filename), record)
    }
  })

  return byFilename
}
|
|
|
|
/**
 * Finds the catalog manual that best matches a structured record.
 *
 * The record's id, manufacturer, model and type form the query, with the
 * manufacturer and model also applied as filters. The highest-scoring manual
 * (the earliest one on ties) is returned when it scores at least 12;
 * otherwise null.
 */
function matchStructuredRecordToManual(
  record: StructuredManualRecord,
  manuals: ManualInternal[]
) {
  const queryParts = [
    record.manualId,
    record.manufacturer,
    record.model,
    record.manualType,
  ]
  const prepared = prepareQuery(queryParts.filter(Boolean).join(" "), {
    manufacturer: record.manufacturer,
    model: record.model,
  })

  let bestManual: ManualInternal | null = null
  let bestScore = 0

  for (const manual of manuals) {
    const score = scoreManual(manual, prepared)
    // Strict comparison keeps the earliest manual when scores tie.
    if (bestManual === null || score > bestScore) {
      bestManual = manual
      bestScore = score
    }
  }

  return bestManual !== null && bestScore >= 12 ? bestManual : null
}
|
|
|
|
/**
 * Finds the catalog manual an extracted-content record belongs to.
 *
 * An exact match on normalized filename or manual id wins. Failing that, the
 * filename without its extension is scored against every manual and the
 * highest scorer (the earliest one on ties) is returned when it reaches 10;
 * otherwise null.
 */
function matchExtractedRecordToManual(
  record: ExtractedManualRecord,
  manuals: ManualInternal[]
) {
  const filename = record.filename || ""
  const normalizedFilename = normalizeIdentifier(filename)

  for (const manual of manuals) {
    const matchesFilename =
      normalizeIdentifier(manual.filename) === normalizedFilename
    if (
      matchesFilename ||
      normalizeIdentifier(manual.manualId) === normalizedFilename
    ) {
      return manual
    }
  }

  const prepared = prepareQuery(stripExtension(filename))

  let bestManual: ManualInternal | null = null
  let bestScore = 0

  for (const manual of manuals) {
    const score = scoreManual(manual, prepared)
    // Strict comparison keeps the earliest manual when scores tie.
    if (bestManual === null || score > bestScore) {
      bestManual = manual
      bestScore = score
    }
  }

  return bestManual !== null && bestScore >= 10 ? bestManual : null
}
|
|
|
|
/**
 * Turns a structured record into chunks for `manual`: one per useful
 * section (confidence 0.92), one per useful troubleshooting item (0.95),
 * followed by parts-reference chunks (0.8).
 */
function buildStructuredChunks(
  record: StructuredManualRecord,
  manual: ManualInternal
) {
  const content = record.content
  const sections = content?.sections || []
  const troubleshooting = content?.troubleshooting || []
  const parts = content?.partsDatabase || []

  const sectionChunks = sections.flatMap((section) => {
    const text = cleanChunkText(section.text || "")
    if (!isUsefulChunkText(text)) {
      return []
    }

    return [
      makeChunk({
        manual,
        pageNumber: section.pageNumber ?? null,
        sectionTitle: cleanSectionTitle(section.title),
        text,
        sourceConfidence: 0.92,
      }),
    ]
  })

  const troubleshootingChunks = troubleshooting.flatMap((item) => {
    const problem = cleanChunkText(item.problem || "")
    const solution = cleanChunkText(item.solution || "")

    // Label whichever halves are present and put each on its own line.
    const labelled: string[] = []
    if (problem) {
      labelled.push(`Problem: ${problem}`)
    }
    if (solution) {
      labelled.push(`Likely cause or solution: ${solution}`)
    }

    const text = cleanChunkText(labelled.join("\n"))
    if (!isUsefulChunkText(text)) {
      return []
    }

    return [
      makeChunk({
        manual,
        pageNumber: item.pageNumber ?? null,
        sectionTitle: problem ? `Troubleshooting: ${problem}` : "Troubleshooting",
        text,
        sourceConfidence: 0.95,
      }),
    ]
  })

  const partChunks = buildPartChunks(parts, manual, 0.8)

  return [...sectionChunks, ...troubleshootingChunks, ...partChunks]
}
|
|
|
|
/**
 * Builds chunks for `manual` from raw extracted content: one per page with
 * useful text (confidence 0.72), followed by chunks for the record's part
 * lists.
 */
function buildExtractedFallbackChunks(
  record: ExtractedManualRecord,
  manual: ManualInternal
) {
  const pages = record.text?.pages || []

  const pageChunks = pages.flatMap((page) => {
    const text = cleanChunkText(page.text || "")
    if (!isUsefulChunkText(text)) {
      return []
    }

    return [
      makeChunk({
        manual,
        pageNumber: page.pageNumber ?? null,
        sectionTitle: page.pageNumber ? `Page ${page.pageNumber}` : "Manual page",
        text,
        sourceConfidence: 0.72,
      }),
    ]
  })

  const partChunks = buildExtractedPartListChunks(record.partsLists || [], manual)

  return [...pageChunks, ...partChunks]
}
|
|
|
|
/**
 * Groups parts by page and emits one "Parts reference" chunk per page.
 *
 * A part is skipped when its cleaned number is shorter than 2 characters
 * and its cleaned description is shorter than 6. Each page keeps at most
 * eight part summaries. Parts without a page number are grouped together
 * and reported with a null page.
 *
 * @param parts - Parts to summarize.
 * @param manual - Manual the chunks belong to.
 * @param sourceConfidence - Confidence assigned to every produced chunk.
 * @returns One chunk per page that has at least one usable part.
 */
function buildPartChunks(
  parts: StructuredPart[],
  manual: ManualInternal,
  sourceConfidence: number
) {
  const partsByPage = new Map<number, string[]>()

  for (const part of parts) {
    const partNumber = cleanChunkText(part.partNumber || "")
    const description = cleanChunkText(part.description || "")
    if (partNumber.length < 2 && description.length < 6) {
      continue
    }

    // 0 is the bucket key for "page unknown"; it becomes null again below.
    const pageNumber = part.pageNumber ?? 0

    let summary: string
    if (!partNumber) {
      // Only the description is known. Leave out the empty number rather
      // than emitting "Part : description".
      summary = `Part: ${description}`
    } else if (description) {
      summary = `Part ${partNumber}: ${description}`
    } else {
      summary = `Part ${partNumber}`
    }

    const bucket = partsByPage.get(pageNumber) || []
    if (bucket.length < 8) {
      bucket.push(summary)
      partsByPage.set(pageNumber, bucket)
    }
  }

  return Array.from(partsByPage.entries()).map(([pageNumber, summaries]) =>
    makeChunk({
      manual,
      pageNumber: pageNumber || null,
      sectionTitle: "Parts reference",
      text: summaries.join("\n"),
      sourceConfidence,
    })
  )
}
|
|
|
|
/**
 * Flattens extracted part lists into one list of parts and builds
 * parts-reference chunks from it (confidence 0.76). A part without its own
 * page number takes the page number of the list it came from.
 */
function buildExtractedPartListChunks(
  partLists: ExtractedPartList[],
  manual: ManualInternal
) {
  const flattened: StructuredPart[] = partLists.flatMap((partList) =>
    (partList.parts || []).map((part) => ({
      ...part,
      pageNumber: part.pageNumber ?? partList.pageNumber,
    }))
  )

  return buildPartChunks(flattened, manual, 0.76)
}
|
|
|
|
/**
 * Appends `chunks` to `target` and files each one under its manual id in
 * `chunksByManualId`, creating the per-manual list when needed. Both
 * collections are mutated in place.
 */
function addChunks(
  target: ChunkInternal[],
  chunksByManualId: Map<string, ChunkInternal[]>,
  chunks: ChunkInternal[]
) {
  chunks.forEach((chunk) => {
    target.push(chunk)

    const bucket = chunksByManualId.get(chunk.manualId)
    if (bucket) {
      bucket.push(chunk)
      return
    }

    chunksByManualId.set(chunk.manualId, [chunk])
  })
}
|
|
|
|
/**
 * Returns the manual entry to attach chunks to.
 *
 * Resolution order: the already-matched manual, then an existing entry
 * whose id equals the normalized fallback id, then a newly created entry
 * built from the supplied fields. A new entry is appended to `args.manuals`
 * and registered in `args.manualById`.
 */
function ensureManualEntry(args: {
  manualById: Map<string, ManualInternal>
  manuals: ManualInternal[]
  matchedManual: ManualInternal | null
  fallbackManualId: string
  filename: string
  manufacturer: string
  category: string
  model: string | null
  manualType: string
  manualUrl: string | null
  thumbnailUrl: string | null
}) {
  const { matchedManual, manualById, manuals } = args
  if (matchedManual) {
    return matchedManual
  }

  const manualId = normalizeIdentifier(args.fallbackManualId)
  const known = manualById.get(manualId)
  if (known) {
    return known
  }

  // The same joined text feeds both the normalized blob and the token set.
  const searchSource = [
    args.filename,
    args.manufacturer,
    args.category,
    args.model,
    args.manualType,
  ]
    .filter(Boolean)
    .join(" ")

  const created: ManualInternal = {
    manualId,
    filename: args.filename,
    manufacturer: args.manufacturer,
    category: args.category,
    model: args.model,
    manualType: args.manualType,
    manualUrl: args.manualUrl,
    thumbnailUrl: args.thumbnailUrl,
    searchText: normalizeText(searchSource),
    tokenSet: new Set(tokenize(searchSource)),
    normalizedManufacturer: canonicalManufacturer(args.manufacturer),
  }

  manuals.push(created)
  manualById.set(created.manualId, created)
  return created
}
|
|
|
|
/**
 * Turns a raw query string plus optional filters into the pre-computed form
 * used by the scoring functions.
 *
 * - `normalized`: the query passed through `normalizeText`.
 * - `tokens`: tokens of the normalized query; when `filters.model` is set its
 *   normalized text is added as one extra entry (it is not tokenized).
 * - `modelTokens`: model-like tokens extracted from the normalized query.
 * - `manufacturerFilters`: every canonical manufacturer with an alias that
 *   occurs in the query, plus the canonical form of `filters.manufacturer`.
 * - `manualIdFilter`: normalized `filters.manualId`, or `null` when absent.
 */
function prepareQuery(query: string, filters?: ManualKnowledgeFilters): PreparedQuery {
  const normalized = normalizeText(query)
  const tokenSet = new Set(tokenize(normalized))
  const modelTokens = extractModelTokens(normalized)
  const manufacturerFilters = new Set<string>()

  for (const [canonical, aliases] of Object.entries(MANUFACTURER_ALIASES)) {
    const mentioned = aliases.some((alias) =>
      normalized.includes(normalizeText(alias))
    )
    if (mentioned) {
      manufacturerFilters.add(canonical)
    }
  }

  const manufacturer = filters?.manufacturer
  if (manufacturer) {
    manufacturerFilters.add(canonicalManufacturer(manufacturer))
  }

  const model = filters?.model
  if (model) {
    tokenSet.add(normalizeText(model))
  }

  let manualIdFilter: string | null = null
  if (filters?.manualId) {
    manualIdFilter = normalizeIdentifier(filters.manualId)
  }

  return {
    normalized,
    tokens: Array.from(tokenSet),
    modelTokens,
    manufacturerFilters: Array.from(manufacturerFilters),
    manualIdFilter,
  }
}
|
|
|
|
/**
 * Scores how well a manual matches a prepared query. Higher is better; an
 * empty normalized query always scores 0.
 *
 * Contributions, applied in order:
 * - manual id filter: +28 on an exact match, -12 otherwise;
 * - manufacturer filters: +14 when the manual's normalized manufacturer is
 *   listed, -6 otherwise;
 * - whole normalized query found in the manual's search text: +24;
 * - per query token: +8 (model token) or +4 when it is in the manual's token
 *   set, else +1.5 when it is at least 4 chars and a substring of the search
 *   text;
 * - per model token found in the search text: +8;
 * - +6 when the query mentions "error", "not ", "coin" or "bill" and the
 *   manual type contains "repair", "service" or "parts".
 */
function scoreManual(manual: ManualInternal, query: PreparedQuery) {
  const { normalized, tokens, modelTokens, manufacturerFilters, manualIdFilter } =
    query

  if (!normalized) {
    return 0
  }

  let total = 0

  if (manualIdFilter) {
    total += manualIdFilter === manual.manualId ? 28 : -12
  }

  if (manufacturerFilters.length > 0) {
    const sameMaker = manufacturerFilters.includes(manual.normalizedManufacturer)
    total += sameMaker ? 14 : -6
  }

  if (manual.searchText.includes(normalized)) {
    total += 24
  }

  for (const term of tokens) {
    if (!term) {
      continue
    }

    if (manual.tokenSet.has(term)) {
      total += modelTokens.includes(term) ? 8 : 4
    } else if (term.length >= 4 && manual.searchText.includes(term)) {
      total += 1.5
    }
  }

  for (const modelToken of modelTokens) {
    if (manual.searchText.includes(modelToken)) {
      total += 8
    }
  }

  const mentionsSymptom = ["error", "not ", "coin", "bill"].some((hint) =>
    normalized.includes(hint)
  )
  if (mentionsSymptom) {
    const isRepairOriented = ["repair", "service", "parts"].some((kind) =>
      manual.manualType.includes(kind)
    )
    if (isRepairOriented) {
      total += 6
    }
  }

  return total
}
|
|
|
|
/**
 * Scores a single chunk against a prepared query, starting from the score of
 * the manual it belongs to. With an empty normalized query the manual score is
 * returned unchanged.
 *
 * Contributions, applied in order on top of `manualScore`:
 * - `sourceConfidence * 8`;
 * - manufacturer filters: +6 when the chunk's canonical manufacturer is
 *   listed, -4 otherwise;
 * - whole normalized query found in the chunk's search text: +18;
 * - per query token: +7 (model token) or +3 when it is in the chunk's token
 *   set, else +1 when it is at least 5 chars and a substring of the search
 *   text;
 * - +6 for a section title containing "parts" when the query mentions
 *   "parts", "part", "bill" or "coin";
 * - +5 for a section title containing "troubleshooting" when the query
 *   mentions "error", "not ", "won t" or "wont".
 */
function scoreChunk(
  chunk: ChunkInternal,
  query: PreparedQuery,
  manualScore: number
) {
  const { normalized, tokens, modelTokens, manufacturerFilters } = query

  if (!normalized) {
    return manualScore
  }

  // Additions are kept in this exact sequence so floating-point results match.
  let total = manualScore + chunk.sourceConfidence * 8

  if (manufacturerFilters.length > 0) {
    const maker = canonicalManufacturer(chunk.manufacturer)
    total += manufacturerFilters.includes(maker) ? 6 : -4
  }

  if (chunk.searchText.includes(normalized)) {
    total += 18
  }

  for (const term of tokens) {
    if (!term) {
      continue
    }

    if (chunk.tokenSet.has(term)) {
      total += modelTokens.includes(term) ? 7 : 3
    } else if (term.length >= 5 && chunk.searchText.includes(term)) {
      total += 1
    }
  }

  const heading = (chunk.sectionTitle || "").toLowerCase()
  const queryMentions = (hints: string[]) =>
    hints.some((hint) => normalized.includes(hint))

  if (
    queryMentions(["parts", "part", "bill", "coin"]) &&
    heading.includes("parts")
  ) {
    total += 6
  }

  if (
    queryMentions(["error", "not ", "won t", "wont"]) &&
    heading.includes("troubleshooting")
  ) {
    total += 5
  }

  return total
}
|
|
|
|
/**
 * Builds the candidate record for a scored manual: the manual's identifying
 * fields and URLs, the raw `score`, and a `confidence` equal to `score / 38`
 * passed through `clamp`.
 */
function buildCandidate(manual: ManualInternal, score: number): ManualCandidate {
  const { manualId, filename, manufacturer, category, manualUrl, thumbnailUrl } =
    manual
  const confidence = clamp(score / 38)

  return {
    manualId,
    filename,
    manufacturer,
    category,
    manualUrl,
    thumbnailUrl,
    score,
    confidence,
  }
}
|
|
|
|
/**
 * Creates an internal chunk for a piece of manual text.
 *
 * The chunk copies the identifying fields and URLs from `manual`, starts with
 * a `matchScore` of 0, and carries:
 * - `searchText`: normalized join of filename, manufacturer, model, section
 *   title and text (empty parts are left out);
 * - `tokenSet`: the tokens of that search text;
 * - `citation`: built from filename, manufacturer and page number.
 */
function makeChunk(args: {
  manual: ManualInternal
  pageNumber: number | null
  sectionTitle: string | null
  text: string
  sourceConfidence: number
}) {
  const { manual, pageNumber, sectionTitle, text, sourceConfidence } = args

  const searchableParts = [
    manual.filename,
    manual.manufacturer,
    manual.model,
    sectionTitle,
    text,
  ]
  const searchText = normalizeText(searchableParts.filter(Boolean).join(" "))
  const citation = buildCitation(manual.filename, manual.manufacturer, pageNumber)
  const tokenSet = new Set(tokenize(searchText))

  return {
    manualId: manual.manualId,
    filename: manual.filename,
    manufacturer: manual.manufacturer,
    model: manual.model,
    manualType: manual.manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl: manual.manualUrl,
    thumbnailUrl: manual.thumbnailUrl,
    sourceConfidence,
    matchScore: 0,
    citation,
    searchText,
    tokenSet,
  } satisfies ChunkInternal
}
|
|
|
|
/**
 * Copies the response-facing fields of an internal chunk into a new object.
 * `searchText` and `tokenSet` are not copied.
 */
function stripChunkForResponse(chunk: ChunkInternal): ManualKnowledgeChunk {
  const {
    manualId,
    filename,
    manufacturer,
    model,
    manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl,
    thumbnailUrl,
    sourceConfidence,
    matchScore,
    citation,
  } = chunk

  return {
    manualId,
    filename,
    manufacturer,
    model,
    manualType,
    pageNumber,
    sectionTitle,
    text,
    manualUrl,
    thumbnailUrl,
    sourceConfidence,
    matchScore,
    citation,
  }
}
|
|
|
|
/**
 * Tidies raw chunk text: removes soft hyphens (U+00AD), collapses every run
 * of whitespace into a single space, and trims both ends.
 */
function cleanChunkText(value: string) {
  const withoutSoftHyphens = value.replace(/\u00ad/g, "")
  const singleSpaced = withoutSoftHyphens.replace(/\s+/g, " ")
  return singleSpaced.trim()
}
|
|
|
|
/**
 * Cleans a section title with `cleanChunkText`.
 *
 * @returns the cleaned title, or `null` when the input is missing or cleans
 *   down to an empty string.
 */
function cleanSectionTitle(value?: string) {
  const title = cleanChunkText(value || "")
  return title === "" ? null : title
}
|
|
|
|
/**
 * Decides whether a piece of chunk text is worth indexing.
 *
 * - Empty text is rejected.
 * - Text that is exactly "table of contents" or "contact us" (any case) is
 *   rejected.
 * - Text shorter than 70 characters is kept only when it contains one of the
 *   troubleshooting keywords (error, vend, coin, bill, cool, stuck, motor,
 *   sensor, jam, door, part).
 * - Longer text is kept when it has at least 25 ASCII letters.
 *
 * @param text chunk text to check.
 * @returns `true` when the text should be kept.
 */
function isUsefulChunkText(text: string) {
  if (!text) {
    return false
  }

  // Boilerplate headings are checked first: both are shorter than 70 chars, so
  // placed after the short-text branch this test could never be reached.
  if (/^(table of contents|contact us)$/i.test(text)) {
    return false
  }

  if (text.length < 70) {
    return /error|vend|coin|bill|cool|stuck|motor|sensor|jam|door|part/i.test(
      text
    )
  }

  // Long text made mostly of digits or punctuation carries no usable prose.
  const alphaChars = text.replace(/[^a-z]/gi, "").length
  return alphaChars >= 25
}
|
|
|
|
/**
 * Produces the canonical search form of a string: lowercase ASCII letters and
 * digits, with single spaces between words.
 *
 * Steps:
 * 1. NFKD-decompose, so accented letters become a base letter followed by
 *    combining marks;
 * 2. drop the combining marks (U+0300–U+036F), so "Crème" yields "creme"
 *    instead of being split into "cre me" by the next step;
 * 3. replace every run of non-alphanumeric characters with one space;
 * 4. lowercase and trim.
 *
 * @param value text to normalize.
 * @returns the normalized text; may be empty.
 */
function normalizeText(value: string) {
  return value
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]+/g, "")
    .replace(/[^a-zA-Z0-9]+/g, " ")
    .toLowerCase()
    .trim()
}
|
|
|
|
/**
 * Splits text into search tokens.
 *
 * The value is normalized with `normalizeText` and split on spaces. A token is
 * dropped when it is a single character or empty, is listed in `STOPWORDS`,
 * or is "page" optionally followed by digits.
 */
function tokenize(value: string) {
  const kept: string[] = []

  for (const piece of normalizeText(value).split(" ")) {
    const token = piece.trim()

    if (token.length <= 1) {
      continue
    }
    if (STOPWORDS.has(token)) {
      continue
    }
    if (/^page\d*$/.test(token)) {
      continue
    }

    kept.push(token)
  }

  return kept
}
|
|
|
|
/**
 * Derives an identifier from a filename-like value: the extension is removed
 * with `stripExtension`, the rest is normalized with `normalizeText`, and
 * whitespace runs are replaced with "-".
 */
function normalizeIdentifier(value: string) {
  const stem = stripExtension(value)
  const words = normalizeText(stem)
  return words.replace(/\s+/g, "-")
}
|
|
|
|
/**
 * Removes a trailing ".pdf" and then a trailing ".json" (both
 * case-insensitive) from a name. The two removals run in that order, each at
 * most once.
 */
function stripExtension(value: string) {
  const trailingExtensions = [/\.pdf$/i, /\.json$/i]
  return trailingExtensions.reduce(
    (stem, extension) => stem.replace(extension, ""),
    value
  )
}
|
|
|
|
/**
 * Maps a manufacturer name onto its canonical key in `MANUFACTURER_ALIASES`.
 *
 * The first entry (in object order) wins when its key equals the normalized
 * value, or when the normalized value contains one of its normalized aliases.
 * With no matching entry the normalized value itself is returned.
 */
function canonicalManufacturer(value: string) {
  const normalized = normalizeText(value)

  const matchingEntry = Object.entries(MANUFACTURER_ALIASES).find(
    ([canonical, aliases]) =>
      canonical === normalized ||
      aliases.some((alias) => normalized.includes(normalizeText(alias)))
  )

  return matchingEntry ? matchingEntry[0] : normalized
}
|
|
|
|
/**
 * Looks up the alias list stored in `MANUFACTURER_ALIASES` for the canonical
 * form of `value`. When there is no entry, a one-element list holding the
 * original `value` is returned.
 */
function aliasesForManufacturer(value: string) {
  const knownAliases = MANUFACTURER_ALIASES[canonicalManufacturer(value)]
  return knownAliases || [value]
}
|
|
|
|
/**
 * Guesses a model identifier from a manual's filename: the first model-like
 * token found in the normalized, extension-less filename, or `null` when the
 * filename has none (or is missing).
 */
function guessModelFromManual(manual: Manual) {
  const stem = stripExtension(manual.filename || "")
  const [firstModelToken] = extractModelTokens(normalizeText(stem))
  return firstModelToken || null
}
|
|
|
|
/**
 * Reduces a free-form manual type to a compact label.
 *
 * - missing or blank input gives "manual";
 * - text containing "part" gives "parts", "operator" gives "operator",
 *   "service" gives "service" (checked in that order);
 * - anything else is the normalized text with whitespace replaced by "-".
 */
function normalizeManualType(value?: string | null) {
  const normalized = normalizeText(value || "")
  if (!normalized) {
    return "manual"
  }

  // Order matters: the first marker contained in the text decides the label.
  const knownTypes = [
    { marker: "part", label: "parts" },
    { marker: "operator", label: "operator" },
    { marker: "service", label: "service" },
  ]
  const known = knownTypes.find(({ marker }) => normalized.includes(marker))
  if (known) {
    return known.label
  }

  return normalized.replace(/\s+/g, "-")
}
|
|
|
|
/**
 * Cleans an optional string with `cleanChunkText`.
 *
 * @returns the cleaned text, or `null` when the input is missing or cleans
 *   down to an empty string.
 */
function normalizeNullable(value?: string | null) {
  const cleaned = cleanChunkText(value || "")
  return cleaned.length > 0 ? cleaned : null
}
|
|
|
|
/**
 * Extracts model-like tokens from text: words in the normalized value made of
 * optional leading letters, then at least two digits, then any further
 * letters or digits. Duplicates are removed and first-seen order is kept.
 */
function extractModelTokens(value: string) {
  const found = normalizeText(value).match(/\b[a-z]*\d{2,}[a-z0-9]*\b/g)
  if (!found) {
    return []
  }

  return Array.from(new Set(found))
}
|
|
|
|
/**
 * Reports whether the text contains at least one model-like token, as found
 * by `extractModelTokens`.
 */
function looksLikeModelTokenQuery(value: string) {
  const modelTokens = extractModelTokens(value)
  return modelTokens.length !== 0
}
|
|
|
|
/**
 * Reports whether the normalized query contains any entry of
 * `RISKY_MANUAL_KEYWORDS` (each keyword is normalized before the substring
 * check).
 */
function isRiskyManualQuery(value: string) {
  const normalized = normalizeText(value)

  for (const keyword of RISKY_MANUAL_KEYWORDS) {
    if (normalized.includes(normalizeText(keyword))) {
      return true
    }
  }

  return false
}
|
|
|
|
/**
 * Builds a citation string: the manual label from `buildManualLabel`,
 * followed by ", page N" when `pageNumber` is truthy (so `null` and 0 add
 * nothing).
 */
function buildCitation(
  filename: string,
  manufacturer: string,
  pageNumber: number | null
) {
  const label = buildManualLabel(filename, manufacturer)
  if (!pageNumber) {
    return label
  }

  return `${label}, page ${pageNumber}`
}
|
|
|
|
/**
 * Builds a display label for a manual: the manufacturer (when non-empty), a
 * space, and the filename with its extension stripped and separators
 * humanized. The result is trimmed.
 */
function buildManualLabel(filename: string, manufacturer: string) {
  const stem = humanizeToken(stripExtension(filename))
  const label = manufacturer ? `${manufacturer} ${stem}` : stem
  return label.trim()
}
|
|
|
|
/**
 * Makes an identifier readable: runs of "-" and "_" become a space, runs of
 * whitespace collapse to a single space, and both ends are trimmed.
 */
function humanizeToken(value: string) {
  const withoutSeparators = value.split(/[-_]+/).join(" ")
  const singleSpaced = withoutSeparators.split(/\s+/).join(" ")
  return singleSpaced.trim()
}
|
|
|
|
/**
 * Sort comparator that orders candidates by descending `score`: the result is
 * negative when `left` has the higher score.
 */
function compareByScore(left: ManualCandidate, right: ManualCandidate) {
  const { score: leftScore } = left
  const { score: rightScore } = right
  return rightScore - leftScore
}
|
|
|
|
/**
 * Limits a number to the range [0, 1]: values at or below 0 give 0, values at
 * or above 1 give 1, anything in between is returned unchanged. `NaN` passes
 * through as `NaN`.
 */
function clamp(value: number) {
  if (value <= 0) {
    return 0
  }
  if (value >= 1) {
    return 1
  }
  return value
}
|
|
|
|
/**
 * Shortens text to at most `maxLength` UTF-16 code units, ending a truncated
 * result with "…".
 *
 * @param value text to shorten.
 * @param maxLength maximum length of the result, ellipsis included.
 * @returns `value` unchanged when it already fits; an empty string when it
 *   does not fit and `maxLength` is below 1; otherwise the leading part of
 *   `value` (trailing whitespace removed) followed by "…".
 */
function truncateText(value: string, maxLength: number) {
  if (value.length <= maxLength) {
    return value
  }

  // There is no room even for the ellipsis. Without this guard a zero or
  // negative limit would make `slice` count from the end of the string and
  // return almost all of it.
  if (!(maxLength >= 1)) {
    return ""
  }

  let head = value.slice(0, maxLength - 1)

  // Do not leave half of a surrogate pair dangling in front of the ellipsis.
  const lastUnit = head.charCodeAt(head.length - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1)
  }

  return `${head.trimEnd()}…`
}
|
|
|
|
/**
 * Reads a UTF-8 file and parses its contents as JSON.
 *
 * The parsed value is cast to `T` without any runtime validation. Read and
 * parse errors are not caught here, so they reject the returned promise.
 */
async function readJsonFile<T>(path: string) {
  const raw = await readFile(path, "utf8")
  const parsed = JSON.parse(raw) as T
  return parsed
}
|