import fs from 'fs';
import path from 'path';
import { fileURLToPath, pathToFileURL } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Get the project root (two levels up from scripts/)
const PROJECT_ROOT = path.join(__dirname, '../..');
const WORDPRESS_DATA_PATH = path.join(__dirname, '../lib/wordpress-data/processed-content.json');
const CONFIG_PATH = path.join(__dirname, '../lib/internal-links-config.json');
// Same file as the input: main() copies it to a timestamped backup before overwriting.
const OUTPUT_PATH = path.join(__dirname, '../lib/wordpress-data/processed-content.json');

// Entity decoding table for extractTextFromHTML. `&amp;` is decoded last so
// that an escaped entity such as `&amp;lt;` yields the literal text `&lt;`
// instead of being decoded twice into `<`.
const HTML_ENTITY_REPLACEMENTS = [
  [/&nbsp;/g, ' '],
  [/&lt;/g, '<'],
  [/&gt;/g, '>'],
  [/&quot;/g, '"'],
  [/&#8217;|\u2019/g, "'"],
  [/&#8211;|\u2013/g, '-'],
  [/&#8212;/g, '\u2014'],
  [/&#8230;|\u2026/g, '...'],
  [/&amp;/g, '&'],
];

/**
 * Load WordPress data.
 *
 * @returns {object} Parsed JSON from WORDPRESS_DATA_PATH, or
 *   `{ pages: [], posts: [] }` when the file cannot be read or parsed
 *   (the error is logged).
 */
function loadWordPressData() {
  try {
    return JSON.parse(fs.readFileSync(WORDPRESS_DATA_PATH, 'utf8'));
  } catch (error) {
    console.error('Error loading WordPress data:', error);
    return { pages: [], posts: [] };
  }
}

/**
 * Load configuration.
 *
 * @returns {object} Parsed JSON from CONFIG_PATH, or `{}` when the file
 *   cannot be read or parsed (the error is logged).
 */
function loadConfig() {
  try {
    return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
  } catch (error) {
    console.error('Error loading config:', error);
    return {};
  }
}

/**
 * Get the site route for a page slug.
 *
 * @param {string} slug - Page slug to resolve.
 * @param {Object<string, string>} [routeMapping] - Map of route -> slug.
 * @returns {string} `/<route>` for the first mapping entry whose value equals
 *   `slug`, otherwise the slug-based default `/<slug>`.
 */
function getRouteForSlug(slug, routeMapping = {}) {
  const entry = Object.entries(routeMapping ?? {}).find(
    ([, mappedSlug]) => mappedSlug === slug,
  );
  return entry ? `/${entry[0]}` : `/${slug}`;
}

/**
 * Escape a string so it can be embedded literally in a RegExp source.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Count whitespace-separated words; empty or blank text counts as 0.
 *
 * @param {string} text
 * @returns {number}
 */
function countWords(text) {
  return text.split(/\s+/).filter(Boolean).length;
}

/**
 * Extract text content from HTML (removing tags).
 *
 * @param {string} html
 * @returns {string} Plain text with script/style blocks dropped, tags replaced
 *   by spaces, the entities in HTML_ENTITY_REPLACEMENTS decoded and whitespace
 *   collapsed. Returns '' for empty or non-string input.
 */
function extractTextFromHTML(html) {
  if (!html || typeof html !== 'string') return '';

  // Drop script and style blocks together with their contents.
  let text = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '');
  text = text.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '');

  // Remove remaining tags but keep the text between them. This happens before
  // entity decoding so that a decoded `&lt;` is never mistaken for a tag.
  text = text.replace(/<[^>]+>/g, ' ');

  for (const [pattern, replacement] of HTML_ENTITY_REPLACEMENTS) {
    text = text.replace(pattern, replacement);
  }

  // Clean up whitespace
  return text.replace(/\s+/g, ' ').trim();
}

/**
 * Check whether an opening <a> tag appears within 100 characters before
 * `startIndex` or after `endIndex`.
 *
 * Not referenced by the other functions in this module.
 *
 * @param {string} html
 * @param {number} startIndex
 * @param {number} endIndex
 * @returns {boolean}
 */
function hasExistingLink(html, startIndex, endIndex) {
  const before = html.substring(Math.max(0, startIndex - 100), startIndex);
  const after = html.substring(endIndex, Math.min(html.length, endIndex + 100));
  // `\b` keeps <abbr>, <article>, <aside>, ... from being treated as anchors.
  return /<a\b[^>]*>/i.test(before + after);
}

/**
 * Check if an HTML offset lies inside an existing link, i.e. more <a> tags
 * have been opened than closed before it.
 *
 * @param {string} html
 * @param {number} position - Offset into `html` (not into extracted text).
 * @returns {boolean}
 */
function isInsideLink(html, position) {
  const before = html.substring(0, position);
  const openTags = (before.match(/<a\b[^>]*>/gi) || []).length;
  const closeTags = (before.match(/<\/a\s*>/gi) || []).length;
  return openTags > closeTags;
}

/**
 * Check if an HTML offset lies inside a tag (between `<` and its `>`), for
 * example inside an attribute value.
 *
 * @param {string} html
 * @param {number} position - Offset into `html`.
 * @returns {boolean}
 */
function isInsideTag(html, position) {
  const before = html.substring(0, position);
  return before.lastIndexOf('<') > before.lastIndexOf('>');
}

/**
 * Find whole-word keyword matches in content.
 *
 * @param {string} content - Text to search.
 * @param {string[]} keywords - Keywords to look for; blank or non-string
 *   entries are ignored (an empty keyword would only yield zero-length matches).
 * @param {boolean} [caseSensitive=false]
 * @returns {{keyword: string, index: number, length: number, text: string}[]}
 *   Matches for all keywords, sorted by ascending index.
 */
function findKeywordMatches(content, keywords, caseSensitive = false) {
  if (typeof content !== 'string' || !Array.isArray(keywords)) return [];

  const flags = caseSensitive ? 'g' : 'gi';
  const matches = [];

  for (const keyword of keywords) {
    if (typeof keyword !== 'string' || keyword.trim() === '') continue;

    const regex = new RegExp(`\\b${escapeRegExp(keyword)}\\b`, flags);
    for (const match of content.matchAll(regex)) {
      matches.push({
        keyword,
        index: match.index,
        length: match[0].length,
        text: match[0],
      });
    }
  }

  // Sort by index
  matches.sort((a, b) => a.index - b.index);
  return matches;
}

/**
 * Insert link into HTML content, replacing the span [startIndex, endIndex)
 * with an anchor element.
 *
 * @param {string} html
 * @param {number} startIndex
 * @param {number} endIndex
 * @param {string} url - Value for the href attribute (inserted as-is).
 * @param {string} anchorText - Link text (inserted as-is, not escaped).
 * @returns {string} The HTML with the link inserted.
 */
function insertLink(html, startIndex, endIndex, url, anchorText) {
  const before = html.substring(0, startIndex);
  const after = html.substring(endIndex);
  return `${before}<a href="${url}">${anchorText}</a>${after}`;
}

/**
 * Generate internal links for a page.
 *
 * Priority links (`config.priorityLinks`) are processed first, then keyword
 * mappings (`config.keywordMappings`, first linkable occurrence per keyword).
 * A page never links to itself or to a slug missing from `allPages`.
 *
 * @param {object} page - Page with `slug` and HTML `content`.
 * @param {object[]} allPages - All pages (`slug`, `title`) that may be link targets.
 * @param {object} [config] - Link configuration: `excludedPages`,
 *   `linkDensity.maxLinksPerPage` (default 10),
 *   `linkDensity.minWordsBetweenLinks` (default 50), `priorityLinks`,
 *   `keywordMappings`.
 * @param {Object<string, string>} [routeMapping] - Map of route -> slug.
 * @returns {object} Copy of `page` with updated `content` (when links were
 *   added) and a `linksAdded` count.
 */
function generateLinksForPage(page, allPages, config = {}, routeMapping = {}) {
  if (!page?.content || typeof page.content !== 'string') {
    return { ...page, linksAdded: 0 };
  }

  // Skip excluded pages
  if (Array.isArray(config.excludedPages) && config.excludedPages.includes(page.slug)) {
    return { ...page, linksAdded: 0 };
  }

  // `??` rather than `||` so an explicit 0 in the config is honoured.
  const maxLinks = config.linkDensity?.maxLinksPerPage ?? 10;
  const minWordsBetween = config.linkDensity?.minWordsBetweenLinks ?? 50;

  // Build page index for quick lookup
  const pageIndex = new Map(
    allPages.map((p) => [
      p.slug,
      { slug: p.slug, title: p.title, url: getRouteForSlug(p.slug, routeMapping) },
    ]),
  );

  let content = page.content;
  let linksAdded = 0;
  // Word offsets (in extracted text) of the links inserted so far. Spacing is
  // measured in words so that text offsets and HTML offsets are never mixed.
  const linkWordPositions = [];

  // Try to turn one text match into a link; returns true when a link was inserted.
  const tryLink = (match, textContent, url, anchorText) => {
    const wordPosition = countWords(textContent.substring(0, match.index));
    const tooClose = linkWordPositions.some(
      (pos) => Math.abs(pos - wordPosition) < minWordsBetween,
    );
    if (tooClose) return false;

    // Map the text match back to an HTML offset; this also rejects
    // occurrences inside tags or inside existing links.
    const htmlMatch = findKeywordInHTML(content, match.text, match.index, textContent);
    if (!htmlMatch) return false;

    content = insertLink(content, htmlMatch.start, htmlMatch.end, url, anchorText);
    linkWordPositions.push(wordPosition);
    linksAdded++;
    return true;
  };

  // Process priority links first
  for (const [targetSlug, linkConfig] of Object.entries(config.priorityLinks ?? {})) {
    if (linksAdded >= maxLinks) break;
    if (page.slug === targetSlug) continue; // Don't link to self
    const target = pageIndex.get(targetSlug);
    if (!target) continue;

    const keywords = linkConfig?.keywords || [];
    const maxLinksForThis = linkConfig?.maxLinks ?? 2;
    // Accept a single string or an array; fall back to the target's title.
    const configuredAnchors = [].concat(linkConfig?.anchorText ?? []).filter(Boolean);
    const anchorTexts = configuredAnchors.length > 0 ? configuredAnchors : [target.title];

    const textContent = extractTextFromHTML(content);
    const matches = findKeywordMatches(textContent, keywords);

    let linksAddedForThis = 0;
    for (const match of matches) {
      if (linksAdded >= maxLinks || linksAddedForThis >= maxLinksForThis) break;

      // Rotate through the configured anchor texts.
      const anchorText = anchorTexts[linksAddedForThis % anchorTexts.length] ?? match.text;
      if (tryLink(match, textContent, target.url, anchorText)) {
        linksAddedForThis++;
      }
    }
  }

  // Process keyword mappings
  for (const [keyword, targetSlug] of Object.entries(config.keywordMappings ?? {})) {
    if (linksAdded >= maxLinks) break;
    if (page.slug === targetSlug) continue;
    const target = pageIndex.get(targetSlug);
    if (!target) continue;

    // Re-extract for every keyword: earlier insertions changed the content.
    const textContent = extractTextFromHTML(content);
    for (const match of findKeywordMatches(textContent, [keyword])) {
      // Only link first occurrence per keyword
      if (tryLink(match, textContent, target.url, target.title || keyword)) break;
    }
  }

  return { ...page, content, linksAdded };
}

/**
 * Find keyword position in HTML accounting for tags.
 *
 * Searches the raw HTML for whole-word, case-insensitive occurrences of
 * `keyword`, skips those inside a tag or inside an existing link, and returns
 * the first one whose preceding word count is within 10 words of the word
 * count preceding `textIndex` in `textContent`.
 *
 * @param {string} html - HTML to search.
 * @param {string} keyword - Text to locate.
 * @param {number} textIndex - Offset of the match within `textContent`.
 * @param {string} textContent - Plain text extracted from the HTML.
 * @returns {{start: number, end: number} | null} HTML offsets of the match,
 *   or null when no suitable occurrence is found.
 */
function findKeywordInHTML(html, keyword, textIndex, textContent) {
  if (typeof html !== 'string' || typeof keyword !== 'string' || keyword === '') {
    return null;
  }

  const regex = new RegExp(`\\b${escapeRegExp(keyword)}\\b`, 'gi');
  const targetWordCount = countWords(String(textContent ?? '').substring(0, textIndex));

  let rejectedCandidates = 0;
  for (const match of html.matchAll(regex)) {
    const start = match.index;
    const end = start + match[0].length;

    // Skip occurrences inside a tag (e.g. attribute values) or an existing link.
    if (isInsideTag(html, start) || isInsideLink(html, start)) continue;

    // Count words before this match in HTML
    const wordCount = countWords(extractTextFromHTML(html.substring(0, start)));

    // If this match is close to our target word count, use it
    if (Math.abs(wordCount - targetWordCount) < 10) {
      return { start, end };
    }

    // Limit search to first 20 matches
    rejectedCandidates++;
    if (rejectedCandidates > 20) break;
  }

  return null;
}

/**
 * Main function: load the data and configuration, add internal links to every
 * page, back up the existing output file and write the updated data.
 *
 * Nothing is written when there are no pages to process, so a failed load
 * (which yields the empty fallback) cannot overwrite the data file.
 */
function main() {
  console.log('Loading WordPress data...');
  const data = loadWordPressData();

  console.log('Loading configuration...');
  const config = loadConfig();

  // Route mapping, route -> slug (simplified version of the one in page.tsx)
  const routeMapping = {
    'services/repairs': 'vending-machine-repairs',
    'services/moving': 'vending-machine-repairs',
    'services/parts': 'parts-and-support',
    'services': 'vending-machine-repairs',
    'vending-machines': 'vending-machines',
    'vending-machines/machines-we-use': 'vending-machines',
    'vending-machines/machines-for-sale': 'vending-machines-for-sale-in-utah',
    'warehouses': 'streamlining-snack-and-beverage-access-in-warehouse-environments',
    'auto-repair': 'enhancing-auto-repair-facilities-with-convenient-vending-solutions',
    'gyms': 'vending-machine-for-your-gym',
    'community-centers': 'vending-for-your-community-centers',
    'dance-studios': 'vending-machine-for-your-dance-studio',
    'car-washes': 'vending-machines-for-your-car-wash',
    'food-and-beverage/healthy-options': 'healthy-vending',
    'food-and-beverage/traditional-options': 'traditional-vending',
    'food-and-beverage/suppliers': 'diverse-vending-options-with-rocky-mountain-vendings-exclusive-wholesale-accounts',
    'about-us': 'about-us',
    'about/faqs': 'faqs',
  };

  const pages = Array.isArray(data?.pages) ? data.pages : [];
  if (pages.length === 0) {
    console.log('No pages to process; leaving existing data untouched.');
    return;
  }

  console.log(`Processing ${pages.length} pages...`);

  // Single pass: collect the updated pages and the link total together.
  let totalLinks = 0;
  const updatedPages = [];
  for (const page of pages) {
    // Remove linksAdded from final output
    const { linksAdded, ...pageWithoutLinksAdded } = generateLinksForPage(
      page,
      pages,
      config,
      routeMapping,
    );
    if (linksAdded > 0) {
      console.log(` ✓ ${page.slug}: Added ${linksAdded} link(s)`);
    }
    totalLinks += linksAdded;
    updatedPages.push(pageWithoutLinksAdded);
  }

  console.log(`\nTotal links added: ${totalLinks}`);

  // Write updated data
  const updatedData = { ...data, pages: updatedPages };

  // Create backup if file exists
  if (fs.existsSync(OUTPUT_PATH)) {
    const backupPath = `${OUTPUT_PATH}.backup.${Date.now()}`;
    fs.copyFileSync(OUTPUT_PATH, backupPath);
    console.log(`\nBackup created: ${backupPath}`);
  } else {
    console.log('\nNo existing file to backup (creating new file)');
  }

  // Write updated file
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(updatedData, null, 2));
  console.log(`\nUpdated file written: ${OUTPUT_PATH}`);
  console.log('\nDone!');
}

// Run if called directly. pathToFileURL handles Windows paths and characters
// that need URL-encoding, which plain `file://` string concatenation does not.
const entryPath = process.argv[1];
if (
  entryPath &&
  (import.meta.url === pathToFileURL(entryPath).href ||
    entryPath.endsWith('generate-internal-links.js'))
) {
  main();
}

export { main, generateLinksForPage, loadWordPressData, loadConfig };