// Rocky_Mountain_Vending/scripts/generate-internal-links.js

import fs from "fs"
import path from "path"
import { fileURLToPath } from "url"

// ES modules do not define __filename/__dirname, so derive them from the module URL
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)

// Build a path relative to the directory that contains this script
const fromScriptDir = (relativePath) => path.join(__dirname, relativePath)

// Get the project root (two levels up from scripts/)
const PROJECT_ROOT = fromScriptDir("../..")
// Processed WordPress content that the script reads
const WORDPRESS_DATA_PATH = fromScriptDir(
  "../lib/wordpress-data/processed-content.json"
)
// Internal-linking settings
const CONFIG_PATH = fromScriptDir("../lib/internal-links-config.json")
// Output location; resolves to the same file as WORDPRESS_DATA_PATH
const OUTPUT_PATH = fromScriptDir(
  "../lib/wordpress-data/processed-content.json"
)

/**
 * Read and parse the processed WordPress content file.
 *
 * @returns {Object} The parsed JSON, or `{ pages: [], posts: [] }` when the
 *   file cannot be read or parsed (the error is logged, not thrown).
 */
function loadWordPressData() {
  let parsed
  try {
    const raw = fs.readFileSync(WORDPRESS_DATA_PATH, "utf8")
    parsed = JSON.parse(raw)
  } catch (error) {
    console.error("Error loading WordPress data:", error)
    parsed = { pages: [], posts: [] }
  }
  return parsed
}

/**
 * Read and parse the internal-links configuration file.
 *
 * @returns {Object} The parsed JSON, or `{}` when the file cannot be read or
 *   parsed (the error is logged, not thrown).
 */
function loadConfig() {
  let parsed
  try {
    const raw = fs.readFileSync(CONFIG_PATH, "utf8")
    parsed = JSON.parse(raw)
  } catch (error) {
    console.error("Error loading config:", error)
    parsed = {}
  }
  return parsed
}

/**
 * Resolve the site route for a page slug.
 *
 * @param {string} slug - Page slug to look up.
 * @param {Object} routeMapping - Object whose keys are routes and whose
 *   values are slugs.
 * @returns {string} `/<route>` for the first mapping entry whose value equals
 *   `slug`, otherwise `/<slug>`.
 */
function getRouteForSlug(slug, routeMapping) {
  const mappedRoute = Object.keys(routeMapping).find(
    (candidate) => routeMapping[candidate] === slug
  )
  // No mapping entry points at this slug: the slug itself is the route
  return mappedRoute === undefined ? `/${slug}` : `/${mappedRoute}`
}

/**
 * Extract text content from HTML (removing tags)
 *
 * Drops <script>/<style> elements together with their content, replaces every
 * remaining tag with a space, decodes a fixed set of HTML entities and
 * collapses runs of whitespace.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Plain text; "" when `html` is empty or not a string.
 */
function extractTextFromHTML(html) {
  if (!html || typeof html !== "string") return ""

  const entityText = {
    "&nbsp;": " ",
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&#8217;": "'",
    "&#8211;": "-",
    "&#8212;": "—",
    "&hellip;": "...",
  }

  return (
    html
      // Remove script and style tags
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
      // Remove HTML tags but keep text
      .replace(/<[^>]+>/g, " ")
      // Decode all entities in ONE pass: replacing them one after another
      // would decode "&amp;lt;" twice ("&lt;" and then "<")
      .replace(
        /&(?:nbsp|amp|lt|gt|quot|hellip|#8217|#8211|#8212);/g,
        (entity) => entityText[entity]
      )
      // Clean up whitespace
      .replace(/\s+/g, " ")
      .trim()
  )
}

/**
 * Check if there is a link close to a span of HTML
 *
 * Looks at up to 100 characters before `startIndex` and up to 100 characters
 * after `endIndex` for an opening <a> tag.
 *
 * @param {string} html - HTML being scanned.
 * @param {number} startIndex - Start offset of the span in `html`.
 * @param {number} endIndex - End offset (exclusive) of the span in `html`.
 * @returns {boolean} True when an opening <a> tag is found in that window.
 */
function hasExistingLink(html, startIndex, endIndex) {
  const before = html.substring(Math.max(0, startIndex - 100), startIndex)
  const after = html.substring(endIndex, Math.min(html.length, endIndex + 100))
  // "<a" must be followed by whitespace or ">" so that other elements whose
  // name starts with "a" (<abbr>, <article>, <aside>, ...) are not counted
  const linkRegex = /<a(?:\s[^>]*)?>/i
  return linkRegex.test(before + after)
}

/**
 * Check if position is inside an existing link
 *
 * Counts opening and closing <a> tags in everything before `position`; more
 * opening than closing tags means a link is still open at that point.
 *
 * @param {string} html - HTML being scanned.
 * @param {number} position - Offset into `html`.
 * @returns {boolean} True when `position` lies inside an unclosed <a> element.
 */
function isInsideLink(html, position) {
  const before = html.substring(0, position)
  // "<a" must be followed by whitespace or ">": a looser pattern also counts
  // <abbr>, <article>, <aside>, ... which are never balanced by "</a>"
  const openTags = (before.match(/<a(?:\s[^>]*)?>/gi) || []).length
  const closeTags = (before.match(/<\/a>/gi) || []).length
  return openTags > closeTags
}

/**
 * Find keyword matches in content
 *
 * Each keyword is matched literally (regex metacharacters are escaped) and
 * only between word boundaries.
 *
 * @param {string} content - Text to search.
 * @param {string[]} keywords - Keywords to look for; blank or non-string
 *   entries are ignored.
 * @param {boolean} [caseSensitive=false] - Match case exactly when true.
 * @returns {{keyword: string, index: number, length: number, text: string}[]}
 *   All matches, ordered by their position in `content`.
 */
function findKeywordMatches(content, keywords, caseSensitive = false) {
  const matches = []
  const flags = caseSensitive ? "g" : "gi"

  for (const keyword of keywords ?? []) {
    // A blank keyword would build /\b\b/, a zero-length pattern: it matches
    // at every word boundary and made the former exec() loop spin forever
    if (typeof keyword !== "string" || keyword.trim() === "") continue

    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
    const regex = new RegExp(`\\b${escaped}\\b`, flags)

    for (const match of content.matchAll(regex)) {
      matches.push({
        keyword,
        index: match.index,
        length: match[0].length,
        text: match[0],
      })
    }
  }

  // Sort by index
  matches.sort((a, b) => a.index - b.index)
  return matches
}

/**
 * Replace a span of HTML with an anchor element.
 *
 * @param {string} html - Source HTML.
 * @param {number} startIndex - Start offset of the span to replace.
 * @param {number} endIndex - End offset (exclusive) of the span to replace.
 * @param {string} url - Value placed in the anchor's href attribute.
 * @param {string} anchorText - Text placed inside the anchor.
 * @returns {string} `html` with the span replaced by
 *   `<a href="url">anchorText</a>`; neither value is escaped.
 */
function insertLink(html, startIndex, endIndex, url, anchorText) {
  const head = html.substring(0, startIndex)
  const tail = html.substring(endIndex)
  return `${head}<a href="${url}">${anchorText}</a>${tail}`
}

/**
 * Generate internal links for a page
 *
 * Handles `config.priorityLinks` first and `config.keywordMappings` second.
 * A matched keyword in the page HTML is replaced by an <a> element pointing
 * at the target page, until the per-page budget is used up. Links are kept
 * apart by a minimum distance measured in the page's plain text.
 *
 * @param {Object} page - Page record; `slug` and `content` (HTML) are read.
 * @param {Object[]} allPages - Pages that can be linked to (`slug`, `title`).
 * @param {Object} config - Settings: `excludedPages`, `linkDensity`
 *   (`maxLinksPerPage`, `minWordsBetweenLinks`), `priorityLinks`,
 *   `keywordMappings`.
 * @param {Object} routeMapping - Route -> slug map handed to getRouteForSlug.
 * @returns {Object} Copy of `page` with `linksAdded`, plus the updated
 *   `content` when the page was processed.
 */
function generateLinksForPage(page, allPages, config, routeMapping) {
  if (!page.content || typeof page.content !== "string") {
    return { ...page, linksAdded: 0 }
  }

  // Skip excluded pages
  if (config.excludedPages && config.excludedPages.includes(page.slug)) {
    return { ...page, linksAdded: 0 }
  }

  // `??` rather than `||` so that an explicit 0 in the config is respected
  const maxLinks = config.linkDensity?.maxLinksPerPage ?? 10
  const minWordsBetween = config.linkDensity?.minWordsBetweenLinks ?? 50
  const minCharsBetween = minWordsBetween * 5 // Rough estimate: 5 chars per word

  // Build page index for quick lookup
  const pageIndex = new Map()
  for (const p of allPages) {
    pageIndex.set(p.slug, {
      slug: p.slug,
      title: p.title,
      url: getRouteForSlug(p.slug, routeMapping),
    })
  }

  let content = page.content
  let linksAdded = 0
  // Plain-text offsets of the links added so far. They are kept in the same
  // coordinate space as `match.index`, so the spacing check compares like
  // with like.
  const linkPositions = []

  // Try to turn one plain-text match into a link; returns true on success
  const tryInsertLink = (match, textContent, url, anchorText) => {
    // Check minimum distance from other links
    const tooClose = linkPositions.some(
      (pos) => Math.abs(pos - match.index) < minCharsBetween
    )
    if (tooClose) return false

    // Find the actual position in HTML (accounting for HTML tags)
    const htmlMatch = findKeywordInHTML(
      content,
      match.text,
      match.index,
      textContent
    )
    if (!htmlMatch) return false

    // The link test needs the HTML offset, not the plain-text offset
    if (isInsideLink(content, htmlMatch.start)) return false

    content = insertLink(
      content,
      htmlMatch.start,
      htmlMatch.end,
      url,
      anchorText
    )
    linkPositions.push(match.index)
    linksAdded++
    return true
  }

  // Process priority links first
  if (config.priorityLinks) {
    for (const [targetSlug, linkConfig] of Object.entries(
      config.priorityLinks
    )) {
      if (linksAdded >= maxLinks) break
      if (page.slug === targetSlug) continue // Don't link to self
      const target = pageIndex.get(targetSlug)
      if (!target) continue

      const keywords = linkConfig.keywords || []
      const maxLinksForThis = linkConfig.maxLinks ?? 2

      // Accept a list or a single string; fall back to the target's title
      const configuredAnchors = linkConfig.anchorText
      let anchorTexts = [target.title]
      if (Array.isArray(configuredAnchors) && configuredAnchors.length > 0) {
        anchorTexts = configuredAnchors
      } else if (
        typeof configuredAnchors === "string" &&
        configuredAnchors !== ""
      ) {
        anchorTexts = [configuredAnchors]
      }

      const textContent = extractTextFromHTML(content)
      const matches = findKeywordMatches(textContent, keywords)

      let linksAddedForThis = 0
      for (const match of matches) {
        if (linksAdded >= maxLinks || linksAddedForThis >= maxLinksForThis)
          break

        const anchorText = anchorTexts[linksAddedForThis % anchorTexts.length]
        if (tryInsertLink(match, textContent, target.url, anchorText)) {
          linksAddedForThis++
        }
      }
    }
  }

  // Process keyword mappings
  if (config.keywordMappings && linksAdded < maxLinks) {
    for (const [keyword, targetSlug] of Object.entries(
      config.keywordMappings
    )) {
      if (linksAdded >= maxLinks) break
      if (page.slug === targetSlug) continue
      const target = pageIndex.get(targetSlug)
      if (!target) continue

      // Extract again for every keyword: earlier insertions changed `content`
      const textContent = extractTextFromHTML(content)
      const anchorText = target.title || keyword

      for (const match of findKeywordMatches(textContent, [keyword])) {
        // Only link first occurrence per keyword
        if (tryInsertLink(match, textContent, target.url, anchorText)) break
      }
    }
  }

  return {
    ...page,
    content,
    linksAdded,
  }
}

/**
 * Find keyword position in HTML accounting for tags
 *
 * `textIndex` is an offset into the tag-free `textContent`. The HTML is
 * searched for whole-word, case-insensitive occurrences of `keyword`;
 * occurrences inside a tag or inside an open <a> element are skipped. The
 * first remaining occurrence whose preceding word count is within 10 words
 * of the word count before `textIndex` is returned.
 *
 * @param {string} html - HTML to search.
 * @param {string} keyword - Text to locate.
 * @param {number} textIndex - Offset of the match inside `textContent`.
 * @param {string} textContent - Plain text extracted from `html`.
 * @returns {{start: number, end: number}|null} Offsets of the occurrence in
 *   `html` (`end` exclusive), or null when no occurrence qualifies.
 */
function findKeywordInHTML(html, keyword, textIndex, textContent) {
  // An empty keyword would build a zero-length pattern that never advances
  if (!keyword) return null

  // Create a regex to find the keyword as a whole word, case-insensitive
  const keywordEscaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
  const regex = new RegExp(`\\b${keywordEscaped}\\b`, "gi")

  const textBeforeMatch = textContent.substring(0, textIndex)
  const wordCountBefore = textBeforeMatch.split(/\s+/).length

  let match
  let matchCount = 0

  while ((match = regex.exec(html)) !== null) {
    const matchStart = match.index
    const matchEnd = matchStart + match[0].length
    const htmlBeforeMatch = html.substring(0, matchStart)

    // Skip if inside an HTML tag: the nearest "<" before the match has not
    // been closed by a ">" yet. The whole prefix is inspected because a
    // fixed-size window misses long tags and is fooled by a ">" that belongs
    // to an earlier tag.
    if (htmlBeforeMatch.lastIndexOf("<") > htmlBeforeMatch.lastIndexOf(">")) {
      continue
    }

    // Skip if inside an existing link ("<a" followed by whitespace or ">",
    // so <abbr>/<article>/... are not counted)
    const openLinks = (htmlBeforeMatch.match(/<a(?:\s[^>]*)?>/gi) || []).length
    const closeLinks = (htmlBeforeMatch.match(/<\/a\s*>/gi) || []).length
    if (openLinks > closeLinks) {
      continue
    }

    // Count words before this match in HTML
    const textBefore = extractTextFromHTML(htmlBeforeMatch)
    const wordCount = textBefore.split(/\s+/).length

    // If this match is close to our target word count, use it
    if (Math.abs(wordCount - wordCountBefore) < 10) {
      return {
        start: matchStart,
        end: matchEnd,
      }
    }

    matchCount++
    // Limit search to first 20 matches
    if (matchCount > 20) break
  }

  return null
}

/**
 * Main function
 *
 * Loads the WordPress data and the link configuration, runs
 * generateLinksForPage once for every page, backs up the existing output
 * file and writes the updated data to OUTPUT_PATH. Nothing is written when
 * the loaded data contains no pages.
 */
function main() {
  console.log("Loading WordPress data...")
  const data = loadWordPressData()

  console.log("Loading configuration...")
  const config = loadConfig()

  // Load route mapping from the page.tsx file (simplified version)
  const routeMapping = {
    "services/repairs": "vending-machine-repairs",
    "services/moving": "vending-machine-repairs",
    "services/parts": "parts-and-support",
    services: "vending-machine-repairs",
    "vending-machines": "vending-machines",
    "vending-machines/machines-we-use": "vending-machines",
    "vending-machines/machines-for-sale": "vending-machines-for-sale-in-utah",
    warehouses:
      "streamlining-snack-and-beverage-access-in-warehouse-environments",
    "auto-repair":
      "enhancing-auto-repair-facilities-with-convenient-vending-solutions",
    gyms: "vending-machine-for-your-gym",
    "community-centers": "vending-for-your-community-centers",
    "dance-studios": "vending-machine-for-your-dance-studio",
    "car-washes": "vending-machines-for-your-car-wash",
    "food-and-beverage/healthy-options": "healthy-vending",
    "food-and-beverage/traditional-options": "traditional-vending",
    "food-and-beverage/suppliers":
      "diverse-vending-options-with-rocky-mountain-vendings-exclusive-wholesale-accounts",
    "about-us": "about-us",
    "about/faqs": "faqs",
  }

  // Tolerate data without a `pages` array instead of crashing on `.length`
  const pages = Array.isArray(data?.pages) ? data.pages : []
  console.log(`Processing ${pages.length} pages...`)

  if (pages.length === 0) {
    // loadWordPressData falls back to an empty structure when the file is
    // missing or unreadable; writing that back would replace the data file
    // with an empty one
    console.log("\nNo pages to process; output file left untouched")
    return
  }

  // Process every page exactly once and tally the links on the way
  let totalLinks = 0
  const updatedPages = []
  for (const page of pages) {
    // linksAdded is bookkeeping only and must not end up in the output
    const { linksAdded, ...pageWithoutLinksAdded } = generateLinksForPage(
      page,
      pages,
      config,
      routeMapping
    )
    if (linksAdded > 0) {
      console.log(`  ✓ ${page.slug}: Added ${linksAdded} link(s)`)
      totalLinks += linksAdded
    }
    updatedPages.push(pageWithoutLinksAdded)
  }

  console.log(`\nTotal links added: ${totalLinks}`)

  // Write updated data
  const updatedData = {
    ...data,
    pages: updatedPages,
  }

  // Create backup if file exists
  if (fs.existsSync(OUTPUT_PATH)) {
    const backupPath = `${OUTPUT_PATH}.backup.${Date.now()}`
    fs.copyFileSync(OUTPUT_PATH, backupPath)
    console.log(`\nBackup created: ${backupPath}`)
  } else {
    console.log("\nNo existing file to backup (creating new file)")
  }

  // Write updated file
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(updatedData, null, 2))
  console.log(`\nUpdated file written: ${OUTPUT_PATH}`)
  console.log("\nDone!")
}

// Run main() only when this file is the script Node was started with:
// either the module URL equals the script path, or the script path ends
// with this file's name
const invokedScript = process.argv[1]
const isDirectRun =
  import.meta.url === `file://${invokedScript}` ||
  Boolean(invokedScript?.endsWith("generate-internal-links.js"))

if (isDirectRun) {
  main()
}

export { main, generateLinksForPage, loadWordPressData, loadConfig }