// Rocky_Mountain_Vending/scripts/generate-internal-links.js

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Get the project root (two levels up from scripts/)
const PROJECT_ROOT = path.join(__dirname, '../..');
const WORDPRESS_DATA_PATH = path.join(__dirname, '../lib/wordpress-data/processed-content.json');
const CONFIG_PATH = path.join(__dirname, '../lib/internal-links-config.json');
const OUTPUT_PATH = path.join(__dirname, '../lib/wordpress-data/processed-content.json');

/**
 * Read and parse the processed WordPress content file.
 *
 * @returns {object} The parsed JSON, or `{ pages: [], posts: [] }` when the file
 *   cannot be read or parsed (the error is logged, not rethrown).
 */
function loadWordPressData() {
  let parsed;
  try {
    const raw = fs.readFileSync(WORDPRESS_DATA_PATH, 'utf8');
    parsed = JSON.parse(raw);
  } catch (error) {
    console.error('Error loading WordPress data:', error);
    parsed = { pages: [], posts: [] };
  }
  return parsed;
}

/**
 * Read and parse the internal-links configuration file.
 *
 * @returns {object} The parsed JSON, or an empty object when the file cannot be
 *   read or parsed (the error is logged, not rethrown).
 */
function loadConfig() {
  let parsed;
  try {
    const raw = fs.readFileSync(CONFIG_PATH, 'utf8');
    parsed = JSON.parse(raw);
  } catch (error) {
    console.error('Error loading config:', error);
    parsed = {};
  }
  return parsed;
}

/**
 * Resolve the site route for a page slug.
 *
 * @param {string} slug - Page slug to look up.
 * @param {Object<string, string>} routeMapping - Map of route path -> page slug.
 * @returns {string} `/<route>` for the first mapping entry whose value equals the
 *   slug, otherwise `/<slug>`.
 */
function getRouteForSlug(slug, routeMapping) {
  const mappedEntry = Object.entries(routeMapping).find(
    ([, candidateSlug]) => candidateSlug === slug
  );
  // Fall back to a slug-based route when no mapping points at this slug.
  const routePath = mappedEntry ? mappedEntry[0] : slug;
  return `/${routePath}`;
}

/**
 * Extract plain text from an HTML string.
 *
 * Removes <script>/<style> elements with their contents, replaces every other tag
 * with a space, decodes a fixed set of HTML entities, and collapses whitespace.
 *
 * @param {string} html - HTML source.
 * @returns {string} The text content, or '' for empty / non-string input.
 */
function extractTextFromHTML(html) {
  if (!html || typeof html !== 'string') return '';

  const ENTITY_TEXT = {
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#8217;': "'",
    '&#8211;': '-',
    '&#8212;': '—',
    '&hellip;': '...',
  };

  return html
    // Drop script and style elements together with their contents.
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    // Replace remaining tags with a space so adjacent text nodes stay separated.
    .replace(/<[^>]+>/g, ' ')
    // Decode all entities in ONE pass. Decoding them one after another would
    // double-decode escaped entities (e.g. "&amp;lt;" -> "&lt;" -> "<").
    .replace(
      /&(?:nbsp|amp|lt|gt|quot|hellip|#8217|#8211|#8212);/g,
      (entity) => ENTITY_TEXT[entity]
    )
    // Clean up whitespace.
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Check whether an opening <a> tag appears near a span of HTML.
 *
 * Looks at up to 100 characters before `startIndex` and up to 100 characters
 * after `endIndex`.
 *
 * @param {string} html - HTML source.
 * @param {number} startIndex - Start offset of the span in `html`.
 * @param {number} endIndex - End offset (exclusive) of the span in `html`.
 * @returns {boolean} True when an opening anchor tag is found in that window.
 */
function hasExistingLink(html, startIndex, endIndex) {
  const before = html.substring(Math.max(0, startIndex - 100), startIndex);
  const after = html.substring(endIndex, Math.min(html.length, endIndex + 100));
  // Require ">" or whitespace right after "<a" so that other elements whose names
  // start with "a" (<abbr>, <article>, <aside>, ...) are not mistaken for links.
  const linkRegex = /<a(?:\s[^>]*)?>/i;
  return linkRegex.test(before + after);
}

/**
 * Check whether an offset in an HTML string lies inside an open <a> element.
 *
 * Counts opening and closing anchor tags in the text before `position`; the
 * position is considered inside a link when more anchors were opened than closed.
 *
 * @param {string} html - HTML source.
 * @param {number} position - Offset into `html`.
 * @returns {boolean} True when `position` is inside an unclosed anchor.
 */
function isInsideLink(html, position) {
  const before = html.substring(0, position);
  // Require ">" or whitespace right after "<a" so that <abbr>, <article>, <aside>,
  // etc. are not counted as anchors (they would never be balanced by "</a>").
  const openTags = (before.match(/<a(?:\s[^>]*)?>/gi) || []).length;
  const closeTags = (before.match(/<\/a\s*>/gi) || []).length;
  return openTags > closeTags;
}

/**
 * Find whole-word occurrences of keywords in a text.
 *
 * @param {string} content - Text to search.
 * @param {string[]} keywords - Keywords to look for; empty or non-string entries are skipped.
 * @param {boolean} [caseSensitive=false] - Match case-sensitively when true.
 * @returns {Array<{keyword: string, index: number, length: number, text: string}>}
 *   All matches, sorted by their index in `content`.
 */
function findKeywordMatches(content, keywords, caseSensitive = false) {
  const REGEX_SPECIAL_CHARS = /[.*+?^${}()|[\]\\]/g;
  const matches = [];
  const flags = caseSensitive ? 'g' : 'gi';
  const haystack = String(content ?? '');

  for (const keyword of keywords) {
    // An empty keyword would compile to /\b\b/, which matches the empty string at
    // every word boundary and yields no usable link target.
    if (typeof keyword !== 'string' || keyword === '') continue;

    const escaped = keyword.replace(REGEX_SPECIAL_CHARS, '\\$&');
    const regex = new RegExp(`\\b${escaped}\\b`, flags);

    // matchAll advances past zero-length matches itself, unlike a manual exec loop.
    for (const match of haystack.matchAll(regex)) {
      matches.push({
        keyword,
        index: match.index,
        length: match[0].length,
        text: match[0]
      });
    }
  }

  // Sort by index
  matches.sort((a, b) => a.index - b.index);
  return matches;
}

/**
 * Replace a span of HTML with an anchor element.
 *
 * The characters in [startIndex, endIndex) are removed and an
 * `<a href="url">anchorText</a>` element is put in their place. Neither `url`
 * nor `anchorText` is escaped.
 *
 * @param {string} html - HTML source.
 * @param {number} startIndex - Start offset of the span to replace.
 * @param {number} endIndex - End offset (exclusive) of the span to replace.
 * @param {string} url - Link target.
 * @param {string} anchorText - Text placed inside the anchor.
 * @returns {string} The HTML with the anchor spliced in.
 */
function insertLink(html, startIndex, endIndex, url, anchorText) {
  const head = html.substring(0, startIndex);
  const tail = html.substring(endIndex);
  return `${head}<a href="${url}">${anchorText}</a>${tail}`;
}

/**
 * Generate internal links for a page.
 *
 * Keyword occurrences in the page's HTML are replaced with anchors pointing at
 * other pages. Targets from `config.priorityLinks` are processed first, then
 * `config.keywordMappings` (at most one link per keyword). The matched text is
 * replaced by the chosen anchor text.
 *
 * Limits (from `config.linkDensity`):
 *  - `maxLinksPerPage` (default 10): upper bound on links added to the page.
 *  - `minWordsBetweenLinks` (default 50): minimum spacing between added links,
 *    approximated as 5 characters per word and measured in HTML offsets.
 * An explicit 0 for either value is honored.
 *
 * @param {object} page - Page with `slug` and HTML `content`.
 * @param {object[]} allPages - All pages (each with `slug` and `title`), used as link targets.
 * @param {object} config - Linking configuration (`excludedPages`, `linkDensity`,
 *   `priorityLinks`, `keywordMappings`).
 * @param {Object<string, string>} routeMapping - Map of route path -> page slug.
 * @returns {object} A copy of the page with updated `content` and a `linksAdded` count.
 *   Pages without string content, and excluded pages, are returned with `linksAdded: 0`.
 */
function generateLinksForPage(page, allPages, config, routeMapping) {
  if (!page.content || typeof page.content !== 'string') {
    return { ...page, linksAdded: 0 };
  }

  // Skip excluded pages
  if (config.excludedPages && config.excludedPages.includes(page.slug)) {
    return { ...page, linksAdded: 0 };
  }

  // `??` rather than `||` so that an explicitly configured 0 is respected.
  const maxLinks = config.linkDensity?.maxLinksPerPage ?? 10;
  const minWordsBetween = config.linkDensity?.minWordsBetweenLinks ?? 50;
  const minCharsBetween = minWordsBetween * 5; // Rough estimate: 5 chars per word

  // Build page index for quick lookup. A Map avoids accidental hits on
  // Object.prototype members for slugs such as "constructor".
  const pageIndex = new Map();
  for (const candidate of allPages) {
    pageIndex.set(candidate.slug, {
      slug: candidate.slug,
      title: candidate.title,
      url: getRouteForSlug(candidate.slug, routeMapping)
    });
  }

  let content = page.content;
  let linksAdded = 0;
  // HTML offsets (in the CURRENT content) of the links inserted so far.
  let linkPositions = [];

  // Try to turn one plain-text match into a link. Returns true when a link was inserted.
  const tryInsertLink = (match, textContent, url, anchorText) => {
    // Map the plain-text match to an offset in the HTML first; every later check
    // must use HTML offsets because `content` and `linkPositions` are HTML-based.
    const htmlMatch = findKeywordInHTML(content, match.text, match.index, textContent);
    if (!htmlMatch) return false;

    // Check if already inside a link
    if (isInsideLink(content, htmlMatch.start)) return false;

    // Check minimum distance from other links
    const tooClose = linkPositions.some(
      pos => Math.abs(pos - htmlMatch.start) < minCharsBetween
    );
    if (tooClose) return false;

    const updated = insertLink(content, htmlMatch.start, htmlMatch.end, url, anchorText);
    // Links recorded after the insertion point have moved by the size difference.
    const delta = updated.length - content.length;
    linkPositions = linkPositions.map(pos => (pos > htmlMatch.start ? pos + delta : pos));
    linkPositions.push(htmlMatch.start);
    content = updated;
    linksAdded++;
    return true;
  };

  // Process priority links first
  if (config.priorityLinks) {
    for (const [targetSlug, linkConfig] of Object.entries(config.priorityLinks)) {
      if (linksAdded >= maxLinks) break;
      if (page.slug === targetSlug) continue; // Don't link to self
      const target = pageIndex.get(targetSlug);
      if (!target) continue;

      const keywords = linkConfig.keywords || [];
      const maxLinksForThis = linkConfig.maxLinks ?? 2;

      // Accept an array or a single string; fall back to the target title when
      // nothing usable is configured (an empty array would yield `undefined`).
      const configuredAnchors = linkConfig.anchorText;
      let anchorTexts = [];
      if (Array.isArray(configuredAnchors)) {
        anchorTexts = configuredAnchors;
      } else if (configuredAnchors) {
        anchorTexts = [configuredAnchors];
      }
      if (anchorTexts.length === 0) {
        anchorTexts = [target.title];
      }

      const textContent = extractTextFromHTML(content);
      const matches = findKeywordMatches(textContent, keywords);

      let linksAddedForThis = 0;
      for (const match of matches) {
        if (linksAdded >= maxLinks || linksAddedForThis >= maxLinksForThis) break;

        const anchorText = anchorTexts[linksAddedForThis % anchorTexts.length];
        if (tryInsertLink(match, textContent, target.url, anchorText)) {
          linksAddedForThis++;
        }
      }
    }
  }

  // Process keyword mappings
  if (config.keywordMappings) {
    for (const [keyword, targetSlug] of Object.entries(config.keywordMappings)) {
      if (linksAdded >= maxLinks) break;
      if (page.slug === targetSlug) continue;
      const target = pageIndex.get(targetSlug);
      if (!target) continue;

      // Re-extract the text for every keyword: earlier insertions replaced matched
      // text with anchor text, so a snapshot taken before the loop would drift.
      const textContent = extractTextFromHTML(content);
      const matches = findKeywordMatches(textContent, [keyword]);
      const anchorText = target.title || keyword;

      for (const match of matches) {
        // Only link first occurrence per keyword
        if (tryInsertLink(match, textContent, target.url, anchorText)) break;
      }
    }
  }

  return {
    ...page,
    content,
    linksAdded
  };
}

/**
 * Find keyword position in HTML accounting for tags.
 *
 * The keyword was located in the plain-text rendering of the page at `textIndex`.
 * This searches the HTML for whole-word, case-insensitive occurrences of the
 * keyword, skips occurrences inside a tag or inside an existing link, and returns
 * the first occurrence whose preceding word count is within 10 words of the word
 * count preceding the plain-text match. At most 21 eligible-but-rejected
 * occurrences are examined before giving up.
 *
 * @param {string} html - HTML source to search.
 * @param {string} keyword - Keyword text to locate.
 * @param {number} textIndex - Offset of the match within `textContent`.
 * @param {string} textContent - Plain-text rendering in which the match was found.
 * @returns {{start: number, end: number} | null} Offsets of the occurrence in
 *   `html` (end exclusive), or null when no suitable occurrence exists.
 */
function findKeywordInHTML(html, keyword, textIndex, textContent) {
  // An empty keyword would compile to /\b\b/ and only produce zero-length matches.
  if (typeof keyword !== 'string' || keyword === '') return null;

  // Number of whitespace-separated words; '' and whitespace-only text count as 0.
  // (A bare split(/\s+/).length counts an extra "word" for text ending in whitespace,
  // which skews the comparison between the two word counts below.)
  const countWords = (text) => {
    const trimmed = text.trim();
    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
  };

  // Create a regex to find the keyword as a whole word, case-insensitive
  const keywordEscaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const regex = new RegExp(`\\b${keywordEscaped}\\b`, 'gi');

  const wordCountBefore = countWords(textContent.substring(0, textIndex));

  let matchCount = 0;
  for (const match of html.matchAll(regex)) {
    const matchStart = match.index;
    const matchEnd = matchStart + match[0].length;
    const htmlBeforeMatch = html.substring(0, matchStart);

    // Skip if inside an HTML tag: the nearest angle bracket before the match is an
    // unclosed "<". Looking at the whole prefix (not a fixed window) also catches
    // attribute values that follow an earlier, already closed tag.
    if (htmlBeforeMatch.lastIndexOf('<') > htmlBeforeMatch.lastIndexOf('>')) {
      continue;
    }

    // Skip if inside an existing link. Counting over the whole prefix catches anchors
    // whose opening tag starts far before the match; requiring ">" or whitespace
    // after "<a" keeps <abbr>/<article>/<aside> from being counted as anchors.
    const openLinks = (htmlBeforeMatch.match(/<a(?:\s[^>]*)?>/gi) || []).length;
    const closeLinks = (htmlBeforeMatch.match(/<\/a\s*>/gi) || []).length;
    if (openLinks > closeLinks) {
      continue;
    }

    // Count words before this match in HTML
    const wordCount = countWords(extractTextFromHTML(htmlBeforeMatch));

    // If this match is close to our target word count, use it
    if (Math.abs(wordCount - wordCountBefore) < 10) {
      return {
        start: matchStart,
        end: matchEnd
      };
    }

    matchCount++;
    // Limit search to first 20 matches
    if (matchCount > 20) break;
  }

  return null;
}

/**
 * Main function.
 *
 * Loads the WordPress data and the linking configuration, generates internal
 * links for every page, backs up the existing output file (if any) and writes the
 * updated data to OUTPUT_PATH. Only `pages` are processed; all other top-level
 * fields of the data are written back unchanged.
 *
 * When the loaded data contains no pages (including the fallback returned after a
 * failed load), nothing is written, so the existing data file is never replaced
 * by an empty structure.
 */
function main() {
  console.log('Loading WordPress data...');
  const data = loadWordPressData();

  console.log('Loading configuration...');
  const config = loadConfig();

  // Load route mapping from the page.tsx file (simplified version)
  const routeMapping = {
    'services/repairs': 'vending-machine-repairs',
    'services/moving': 'vending-machine-repairs',
    'services/parts': 'parts-and-support',
    'services': 'vending-machine-repairs',
    'vending-machines': 'vending-machines',
    'vending-machines/machines-we-use': 'vending-machines',
    'vending-machines/machines-for-sale': 'vending-machines-for-sale-in-utah',
    'warehouses': 'streamlining-snack-and-beverage-access-in-warehouse-environments',
    'auto-repair': 'enhancing-auto-repair-facilities-with-convenient-vending-solutions',
    'gyms': 'vending-machine-for-your-gym',
    'community-centers': 'vending-for-your-community-centers',
    'dance-studios': 'vending-machine-for-your-dance-studio',
    'car-washes': 'vending-machines-for-your-car-wash',
    'food-and-beverage/healthy-options': 'healthy-vending',
    'food-and-beverage/traditional-options': 'traditional-vending',
    'food-and-beverage/suppliers': 'diverse-vending-options-with-rocky-mountain-vendings-exclusive-wholesale-accounts',
    'about-us': 'about-us',
    'about/faqs': 'faqs',
  };

  const pages = Array.isArray(data?.pages) ? data.pages : [];
  if (pages.length === 0) {
    // The output path is the same file the data was read from; writing now would
    // replace it with an empty structure.
    console.error('No pages found in WordPress data; nothing to process, output left untouched.');
    return;
  }

  console.log(`Processing ${pages.length} pages...`);

  // Each page is processed exactly once; the per-page count is accumulated here
  // instead of re-running the link generation just to compute the total.
  let totalLinks = 0;
  const updatedPages = [];
  for (const page of pages) {
    // Remove linksAdded from final output
    const { linksAdded, ...pageWithoutLinksAdded } = generateLinksForPage(
      page,
      pages,
      config,
      routeMapping
    );
    if (linksAdded > 0) {
      console.log(`  ✓ ${page.slug}: Added ${linksAdded} link(s)`);
      totalLinks += linksAdded;
    }
    updatedPages.push(pageWithoutLinksAdded);
  }

  console.log(`\nTotal links added: ${totalLinks}`);

  // Write updated data
  const updatedData = {
    ...data,
    pages: updatedPages
  };

  // Create backup if file exists
  if (fs.existsSync(OUTPUT_PATH)) {
    const backupPath = OUTPUT_PATH + '.backup.' + Date.now();
    fs.copyFileSync(OUTPUT_PATH, backupPath);
    console.log(`\nBackup created: ${backupPath}`);
  } else {
    console.log('\nNo existing file to backup (creating new file)');
  }

  // Write updated file
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(updatedData, null, 2));
  console.log(`\nUpdated file written: ${OUTPUT_PATH}`);
  console.log('\nDone!');
}

// Run if called directly (not when imported as a module).
// The entry script is compared as a filesystem path rather than as a hand-built
// `file://` string: import.meta.url is URL-encoded (spaces, non-ASCII characters)
// and uses `file:///C:/...` on Windows, so plain string concatenation does not
// match in those cases. The filename check is kept as a fallback.
if (
  process.argv[1] &&
  (path.resolve(process.argv[1]) === fileURLToPath(import.meta.url) ||
    process.argv[1].endsWith('generate-internal-links.js'))
) {
  main();
}

export { main, generateLinksForPage, loadWordPressData, loadConfig };