Rocky_Mountain_Vending/scripts/generate-internal-links.js
DMleadgen 46d973904b
Initial commit: Rocky Mountain Vending website
Next.js website for Rocky Mountain Vending company featuring:
- Product catalog with Stripe integration
- Service areas and parts pages
- Admin dashboard with Clerk authentication
- SEO optimized pages with JSON-LD structured data

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-12 16:22:15 -07:00

395 lines
12 KiB
JavaScript

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
// ES modules have no built-in __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Project root is the parent of scripts/ — the directory that contains lib/.
const PROJECT_ROOT = path.resolve(__dirname, '..');
// Processed WordPress content that this script reads.
const WORDPRESS_DATA_PATH = path.join(PROJECT_ROOT, 'lib/wordpress-data/processed-content.json');
// Linking rules (priority links, keyword mappings, link density, exclusions).
const CONFIG_PATH = path.join(PROJECT_ROOT, 'lib/internal-links-config.json');
// The output overwrites the input file in place.
const OUTPUT_PATH = WORDPRESS_DATA_PATH;
/**
 * Read and parse the processed WordPress content file.
 *
 * Best-effort: any read or parse failure is logged and an empty data set is
 * returned instead of throwing.
 *
 * @returns {object} Parsed JSON, or `{ pages: [], posts: [] }` on failure.
 */
function loadWordPressData() {
  let parsed;
  try {
    const raw = fs.readFileSync(WORDPRESS_DATA_PATH, 'utf8');
    parsed = JSON.parse(raw);
  } catch (error) {
    console.error('Error loading WordPress data:', error);
    parsed = { pages: [], posts: [] };
  }
  return parsed;
}
/**
 * Read and parse the internal-links configuration file.
 *
 * Best-effort: any read or parse failure is logged and an empty config is
 * returned instead of throwing.
 *
 * @returns {object} Parsed JSON, or `{}` on failure.
 */
function loadConfig() {
  let parsed;
  try {
    const raw = fs.readFileSync(CONFIG_PATH, 'utf8');
    parsed = JSON.parse(raw);
  } catch (error) {
    console.error('Error loading config:', error);
    parsed = {};
  }
  return parsed;
}
/**
 * Resolve the site route for a page slug.
 *
 * @param {string} slug - Page slug to look up.
 * @param {Object<string, string>} routeMapping - Map of route path -> page slug.
 * @returns {string} `/<route>` for the first route whose mapped slug equals
 *   `slug`, otherwise `/<slug>`.
 */
function getRouteForSlug(slug, routeMapping) {
  // Reverse lookup: the mapping is keyed by route, so search its values.
  const entry = Object.entries(routeMapping).find(([, mappedSlug]) => mappedSlug === slug);
  const routePath = entry ? entry[0] : slug;
  return `/${routePath}`;
}
/**
 * Extract plain text from an HTML string.
 *
 * Drops <script>/<style> elements with their contents, replaces every other
 * tag with a space, decodes a fixed set of HTML entities and collapses runs of
 * whitespace.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Plain text; '' when `html` is empty or not a string.
 */
function extractTextFromHTML(html) {
  if (!html || typeof html !== 'string') return '';

  // Entities this script knows how to decode, with their text replacements.
  const entityText = {
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#8217;': "'",
    '&#8211;': '-',
    '&#8212;': '—',
    '&hellip;': '...',
  };

  return html
    // Remove script and style elements including their contents
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    // Remove remaining tags but keep the text around them separated
    .replace(/<[^>]+>/g, ' ')
    // Decode all entities in ONE pass so the output of one replacement is never
    // decoded again (e.g. "&amp;lt;" must become "&lt;", not "<").
    .replace(/&(?:nbsp|amp|lt|gt|quot|hellip|#8217|#8211|#8212);/g, (entity) => entityText[entity])
    // Clean up whitespace
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Check whether an opening <a> tag appears near a span of HTML.
 *
 * Looks at up to 100 characters before `startIndex` and up to 100 characters
 * after `endIndex`.
 *
 * @param {string} html - HTML to inspect.
 * @param {number} startIndex - Start offset of the span in `html`.
 * @param {number} endIndex - End offset (exclusive) of the span in `html`.
 * @returns {boolean} True when an opening anchor tag is found in that window.
 */
function hasExistingLink(html, startIndex, endIndex) {
  const before = html.substring(Math.max(0, startIndex - 100), startIndex);
  const after = html.substring(endIndex, Math.min(html.length, endIndex + 100));
  // Require whitespace or '>' right after "<a" so tags that merely start with
  // the letter a (<abbr>, <article>, <aside>, ...) are not mistaken for links.
  const anchorOpenTag = /<a(?:\s[^>]*)?>/i;
  return anchorOpenTag.test(before + after);
}
/**
 * Check whether an HTML offset lies inside an unclosed <a> element.
 *
 * Counts opening and closing anchor tags in the HTML preceding `position`; the
 * position is inside a link when more anchors were opened than closed.
 *
 * @param {string} html - HTML to inspect.
 * @param {number} position - Offset into `html` (not into extracted text).
 * @returns {boolean} True when `position` is inside a link.
 */
function isInsideLink(html, position) {
  const before = html.substring(0, position);
  // Require whitespace or '>' right after "<a" so <abbr>, <article>, <aside>,
  // <address>, <audio> ... are not counted as opening links. Counting them
  // would leave every later position permanently "inside a link".
  const openTags = (before.match(/<a(?:\s[^>]*)?>/gi) || []).length;
  const closeTags = (before.match(/<\/a\s*>/gi) || []).length;
  return openTags > closeTags;
}
/**
 * Find whole-word occurrences of keywords in a piece of text.
 *
 * @param {string} content - Text to search.
 * @param {string[]} keywords - Keywords to look for; each is matched literally
 *   (regex metacharacters are escaped) between word boundaries.
 * @param {boolean} [caseSensitive=false] - Match case exactly when true.
 * @returns {{keyword: string, index: number, length: number, text: string}[]}
 *   All matches sorted by position in `content`. `text` is the matched text as
 *   it appears in `content` (its case may differ from `keyword`).
 */
function findKeywordMatches(content, keywords, caseSensitive = false) {
  const matches = [];
  if (typeof content !== 'string' || content === '' || !Array.isArray(keywords)) {
    return matches;
  }

  const flags = caseSensitive ? 'g' : 'gi';
  for (const keyword of keywords) {
    // A blank keyword would compile to /\b\b/, a zero-length pattern that
    // matches at every word boundary and never yields a linkable span.
    if (typeof keyword !== 'string' || keyword.trim() === '') continue;

    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(`\\b${escaped}\\b`, flags);
    // matchAll always advances past each match, so the scan cannot stall.
    for (const match of content.matchAll(regex)) {
      matches.push({
        keyword,
        index: match.index,
        length: match[0].length,
        text: match[0],
      });
    }
  }

  // Sort by index
  matches.sort((a, b) => a.index - b.index);
  return matches;
}
/**
 * Replace a span of HTML with an anchor element.
 *
 * The text between `startIndex` and `endIndex` is removed and
 * `<a href="url">anchorText</a>` is put in its place. `url` and `anchorText`
 * are inserted verbatim (no escaping).
 *
 * @param {string} html - HTML to modify.
 * @param {number} startIndex - Start offset of the span to replace.
 * @param {number} endIndex - End offset (exclusive) of the span to replace.
 * @param {string} url - Link target.
 * @param {string} anchorText - Visible link text.
 * @returns {string} New HTML string; `html` itself is not modified.
 */
function insertLink(html, startIndex, endIndex, url, anchorText) {
  const head = html.substring(0, startIndex);
  const tail = html.substring(endIndex);
  return `${head}<a href="${url}">${anchorText}</a>${tail}`;
}
/**
 * Generate internal links for a page.
 *
 * Keywords are located in the page's extracted text, mapped back to a position
 * in the HTML, and replaced by a link to the target page. Priority links are
 * processed first, then simple keyword mappings (first linkable occurrence per
 * keyword only).
 *
 * @param {object} page - Page to process; reads `slug` and `content`.
 * @param {object[]} allPages - All pages; used to resolve target slugs, titles
 *   and routes.
 * @param {object} config - Linking rules. Reads `excludedPages`,
 *   `linkDensity.maxLinksPerPage` (default 10),
 *   `linkDensity.minWordsBetweenLinks` (default 50), `priorityLinks`
 *   (`{ [targetSlug]: { keywords, maxLinks, anchorText } }`) and
 *   `keywordMappings` (`{ [keyword]: targetSlug }`).
 * @param {Object<string, string>} routeMapping - Map of route path -> slug.
 * @returns {object} Copy of `page` with updated `content` and a `linksAdded`
 *   count. Pages without string content and excluded pages are returned with
 *   `linksAdded: 0` and unchanged content.
 */
function generateLinksForPage(page, allPages, config, routeMapping) {
  if (!page.content || typeof page.content !== 'string') {
    return { ...page, linksAdded: 0 };
  }
  // Skip excluded pages
  if (config.excludedPages && config.excludedPages.includes(page.slug)) {
    return { ...page, linksAdded: 0 };
  }

  const maxLinks = config.linkDensity?.maxLinksPerPage || 10;
  const minWordsBetween = config.linkDensity?.minWordsBetweenLinks || 50;
  // Rough estimate: 5 chars per word
  const minCharsBetween = minWordsBetween * 5;

  // Build page index for quick lookup. A Map avoids false hits on inherited
  // object keys such as "constructor".
  const pageIndex = new Map(
    allPages.map((p) => [
      p.slug,
      { slug: p.slug, title: p.title, url: getRouteForSlug(p.slug, routeMapping) },
    ])
  );

  let content = page.content;
  let linksAdded = 0;
  // HTML offsets (in the CURRENT content) of the links inserted so far.
  const linkPositions = [];

  // Try to turn one text match into a link; returns true when a link was added.
  // All checks run against the resolved HTML offset: the match index refers to
  // the extracted text and is not comparable with HTML offsets.
  const tryInsertLink = (match, textContent, url, anchorText) => {
    const htmlMatch = findKeywordInHTML(content, match.text, match.index, textContent);
    if (!htmlMatch) return false;
    // Never nest a link inside an existing one
    if (isInsideLink(content, htmlMatch.start)) return false;
    // Check minimum distance from other links
    const tooClose = linkPositions.some(
      (pos) => Math.abs(pos - htmlMatch.start) < minCharsBetween
    );
    if (tooClose) return false;

    const lengthBefore = content.length;
    content = insertLink(content, htmlMatch.start, htmlMatch.end, url, anchorText);
    // Links recorded after the insertion point have moved; keep them accurate.
    const shift = content.length - lengthBefore;
    for (let i = 0; i < linkPositions.length; i++) {
      if (linkPositions[i] > htmlMatch.start) linkPositions[i] += shift;
    }
    linkPositions.push(htmlMatch.start);
    linksAdded++;
    return true;
  };

  // Process priority links first
  if (config.priorityLinks) {
    for (const [targetSlug, linkConfig] of Object.entries(config.priorityLinks)) {
      if (linksAdded >= maxLinks) break;
      if (page.slug === targetSlug) continue; // Don't link to self
      const target = pageIndex.get(targetSlug);
      if (!target) continue;

      const keywords = linkConfig.keywords || [];
      const maxLinksForThis = linkConfig.maxLinks || 2;
      // Accept a single string as well as an array; fall back to the target
      // page title when no usable anchor text is configured.
      const configuredAnchors = [].concat(linkConfig.anchorText ?? []).filter(Boolean);
      const anchorTexts = configuredAnchors.length > 0 ? configuredAnchors : [target.title];

      let linksAddedForThis = 0;
      while (linksAdded < maxLinks && linksAddedForThis < maxLinksForThis) {
        // Re-extract after every insertion so matches always describe the
        // current content rather than a stale snapshot.
        const textContent = extractTextFromHTML(content);
        const anchorText = anchorTexts[linksAddedForThis % anchorTexts.length];
        let inserted = false;
        for (const match of findKeywordMatches(textContent, keywords)) {
          if (tryInsertLink(match, textContent, target.url, anchorText)) {
            inserted = true;
            break;
          }
        }
        if (!inserted) break; // No linkable occurrence left for this target
        linksAddedForThis++;
      }
    }
  }

  // Process keyword mappings
  if (config.keywordMappings) {
    for (const [keyword, targetSlug] of Object.entries(config.keywordMappings)) {
      if (linksAdded >= maxLinks) break;
      if (page.slug === targetSlug) continue; // Don't link to self
      const target = pageIndex.get(targetSlug);
      if (!target) continue;

      // Extracted per keyword because earlier insertions changed the content.
      const textContent = extractTextFromHTML(content);
      const anchorText = target.title || keyword;
      for (const match of findKeywordMatches(textContent, [keyword])) {
        // Only link first occurrence per keyword
        if (tryInsertLink(match, textContent, target.url, anchorText)) break;
      }
    }
  }

  return {
    ...page,
    content,
    linksAdded,
  };
}
/**
 * Map a keyword occurrence found in extracted text back to its span in HTML.
 *
 * Scans the HTML for whole-word, case-insensitive occurrences of `keyword`,
 * skips occurrences inside a tag (e.g. an attribute value) or inside an
 * existing link, and returns the first remaining occurrence whose preceding
 * word count is within 10 words of the word count preceding `textIndex` in
 * `textContent`. Gives up after more than 20 rejected candidates.
 *
 * Note: only tags and links are excluded; text inside other elements (for
 * example <script>) is not treated specially here.
 *
 * @param {string} html - HTML to search.
 * @param {string} keyword - Text to find (matched literally).
 * @param {number} textIndex - Offset of the occurrence within `textContent`.
 * @param {string} textContent - Plain text extracted from `html`.
 * @returns {{start: number, end: number}|null} HTML offsets of the occurrence
 *   (`end` exclusive), or null when none qualifies.
 */
function findKeywordInHTML(html, keyword, textIndex, textContent) {
  // An empty keyword would produce a zero-length pattern with nothing to link.
  if (!keyword) return null;

  // Create a regex to find the keyword as a whole word, case-insensitive
  const keywordEscaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const regex = new RegExp(`\\b${keywordEscaped}\\b`, 'gi');

  const wordCountBefore = textContent.substring(0, textIndex).split(/\s+/).length;
  let rejectedCandidates = 0;

  for (const match of html.matchAll(regex)) {
    const matchStart = match.index;
    const matchEnd = matchStart + match[0].length;
    const htmlBeforeMatch = html.substring(0, matchStart);

    // Skip if inside an HTML tag: the nearest preceding angle bracket is an
    // unclosed '<'. Checked on the whole prefix so an attribute that follows a
    // closed tag (e.g. `</p><img alt="keyword">`) is still recognised.
    if (htmlBeforeMatch.lastIndexOf('<') > htmlBeforeMatch.lastIndexOf('>')) {
      continue;
    }

    // Skip if inside an existing link. Counted over the whole prefix so long
    // opening tags are not missed; "<a" must be followed by whitespace or '>'
    // so <abbr>, <article>, ... are not counted as links.
    const openLinks = (htmlBeforeMatch.match(/<a(?:\s[^>]*)?>/gi) || []).length;
    const closeLinks = (htmlBeforeMatch.match(/<\/a\s*>/gi) || []).length;
    if (openLinks > closeLinks) {
      continue;
    }

    // Count words before this match in HTML
    const wordCount = extractTextFromHTML(htmlBeforeMatch).split(/\s+/).length;
    // If this match is close to our target word count, use it
    if (Math.abs(wordCount - wordCountBefore) < 10) {
      return {
        start: matchStart,
        end: matchEnd,
      };
    }

    rejectedCandidates++;
    // Limit search to first 20 candidates
    if (rejectedCandidates > 20) break;
  }
  return null;
}
/**
 * Main function.
 *
 * Loads the WordPress data and the linking configuration, adds internal links
 * to every page, logs a summary, backs up the existing output file (if any)
 * and writes the updated data to OUTPUT_PATH. When there are no pages to
 * process, nothing is written.
 */
function main() {
  console.log('Loading WordPress data...');
  const data = loadWordPressData();
  console.log('Loading configuration...');
  const config = loadConfig();
  // Route -> slug table (described in the original as a simplified version of
  // the mapping in the page.tsx file).
  const routeMapping = {
    'services/repairs': 'vending-machine-repairs',
    'services/moving': 'vending-machine-repairs',
    'services/parts': 'parts-and-support',
    'services': 'vending-machine-repairs',
    'vending-machines': 'vending-machines',
    'vending-machines/machines-we-use': 'vending-machines',
    'vending-machines/machines-for-sale': 'vending-machines-for-sale-in-utah',
    'warehouses': 'streamlining-snack-and-beverage-access-in-warehouse-environments',
    'auto-repair': 'enhancing-auto-repair-facilities-with-convenient-vending-solutions',
    'gyms': 'vending-machine-for-your-gym',
    'community-centers': 'vending-for-your-community-centers',
    'dance-studios': 'vending-machine-for-your-dance-studio',
    'car-washes': 'vending-machines-for-your-car-wash',
    'food-and-beverage/healthy-options': 'healthy-vending',
    'food-and-beverage/traditional-options': 'traditional-vending',
    'food-and-beverage/suppliers': 'diverse-vending-options-with-rocky-mountain-vendings-exclusive-wholesale-accounts',
    'about-us': 'about-us',
    'about/faqs': 'faqs',
  };

  const pages = Array.isArray(data?.pages) ? data.pages : [];
  console.log(`Processing ${pages.length} pages...`);
  if (pages.length === 0) {
    // The loader returns an empty data set when the file is missing or
    // unreadable; writing that back would replace the data file with an empty
    // skeleton, so stop here and leave the file untouched.
    console.error('No pages to process; output file left unchanged.');
    return;
  }

  // Process every page exactly once, tallying links as we go.
  let totalLinks = 0;
  const updatedPages = [];
  for (const page of pages) {
    // linksAdded is bookkeeping only; it is not written to the output.
    const { linksAdded, ...pageWithoutLinksAdded } = generateLinksForPage(
      page,
      pages,
      config,
      routeMapping
    );
    if (linksAdded > 0) {
      console.log(`${page.slug}: Added ${linksAdded} link(s)`);
      totalLinks += linksAdded;
    }
    updatedPages.push(pageWithoutLinksAdded);
  }
  console.log(`\nTotal links added: ${totalLinks}`);

  // Write updated data
  const updatedData = {
    ...data,
    pages: updatedPages,
  };
  // Create backup if file exists
  if (fs.existsSync(OUTPUT_PATH)) {
    const backupPath = OUTPUT_PATH + '.backup.' + Date.now();
    fs.copyFileSync(OUTPUT_PATH, backupPath);
    console.log(`\nBackup created: ${backupPath}`);
  } else {
    console.log('\nNo existing file to backup (creating new file)');
  }
  // Write updated file
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(updatedData, null, 2));
  console.log(`\nUpdated file written: ${OUTPUT_PATH}`);
  console.log('\nDone!');
}
// Run if called directly
if (import.meta.url === `file://${process.argv[1]}` || process.argv[1]?.endsWith('generate-internal-links.js')) {
main();
}
export { main, generateLinksForPage, loadWordPressData, loadConfig };