import fs from "fs"
import path from "path"
import { fileURLToPath, pathToFileURL } from "url"

const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)

// Get the project root (two levels up from scripts/)
const PROJECT_ROOT = path.join(__dirname, "../..")
const WORDPRESS_DATA_PATH = path.join(
  __dirname,
  "../lib/wordpress-data/processed-content.json"
)
const CONFIG_PATH = path.join(__dirname, "../lib/internal-links-config.json")
// NOTE: the output path is the same file as the input; main() rewrites it in place.
const OUTPUT_PATH = path.join(
  __dirname,
  "../lib/wordpress-data/processed-content.json"
)

// Rough estimate used to turn "minimum words between links" into characters.
const CHARS_PER_WORD_ESTIMATE = 5
// A text match and an HTML match are considered the same occurrence when the
// number of words before them differs by less than this.
const WORD_COUNT_TOLERANCE = 10
// Upper bound on HTML candidates examined per lookup (each one re-extracts text).
const MAX_HTML_CANDIDATES = 20

// Entity decoding table. "&amp;" is deliberately last so that an escaped
// entity such as "&amp;lt;" decodes to "&lt;" instead of being decoded twice.
const HTML_ENTITY_REPLACEMENTS = [
  [/&nbsp;/g, " "],
  [/&lt;/g, "<"],
  [/&gt;/g, ">"],
  [/&quot;/g, '"'],
  [/&#8217;|&rsquo;/g, "'"],
  [/&#8211;|&ndash;/g, "-"],
  [/&#8212;|&mdash;/g, "—"],
  [/&#8230;|&hellip;/g, "..."],
  [/&amp;/g, "&"],
]

/**
 * Load WordPress data
 * @returns {object} Parsed JSON, or `{ pages: [], posts: [] }` when the file
 *   cannot be read or parsed (the error is logged).
 */
function loadWordPressData() {
  try {
    return JSON.parse(fs.readFileSync(WORDPRESS_DATA_PATH, "utf8"))
  } catch (error) {
    console.error("Error loading WordPress data:", error)
    return { pages: [], posts: [] }
  }
}

/**
 * Load configuration
 * @returns {object} Parsed JSON, or `{}` when the file cannot be read or
 *   parsed (the error is logged).
 */
function loadConfig() {
  try {
    return JSON.parse(fs.readFileSync(CONFIG_PATH, "utf8"))
  } catch (error) {
    console.error("Error loading config:", error)
    return {}
  }
}

/**
 * Get route for a page slug
 * @param {string} slug - Page slug to resolve.
 * @param {Object<string, string>} routeMapping - Map of route -> slug.
 * @returns {string} `/${route}` for the first route mapped to `slug`,
 *   otherwise `/${slug}`.
 */
function getRouteForSlug(slug, routeMapping) {
  for (const [route, mappedSlug] of Object.entries(routeMapping ?? {})) {
    if (mappedSlug === slug) {
      return `/${route}`
    }
  }
  // Default to slug-based route
  return `/${slug}`
}

/**
 * Escape a string so it can be embedded literally in a RegExp source.
 * @param {string} value
 * @returns {string}
 */
function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
}

/**
 * Count whitespace-separated words; empty or blank text counts as zero.
 * @param {string} text
 * @returns {number}
 */
function countWords(text) {
  const trimmed = text.trim()
  return trimmed === "" ? 0 : trimmed.split(/\s+/).length
}

/**
 * Extract text content from HTML (removing tags)
 * @param {string} html
 * @returns {string} Text with script/style blocks removed, tags replaced by
 *   spaces, a fixed set of entities decoded and whitespace collapsed. Returns
 *   "" for empty or non-string input.
 */
function extractTextFromHTML(html) {
  if (!html || typeof html !== "string") return ""

  // Remove script and style blocks entirely (their content is not page text)
  let text = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "")
  text = text.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "")

  // Remove HTML tags but keep text
  text = text.replace(/<[^>]+>/g, " ")

  // Decode HTML entities
  for (const [pattern, replacement] of HTML_ENTITY_REPLACEMENTS) {
    text = text.replace(pattern, replacement)
  }

  // Clean up whitespace
  return text.replace(/\s+/g, " ").trim()
}

/**
 * Check if text already contains a link
 * Looks for an opening <a> tag within 100 characters on either side of the
 * given span. Not referenced elsewhere in this module.
 * @param {string} html
 * @param {number} startIndex
 * @param {number} endIndex
 * @returns {boolean}
 */
function hasExistingLink(html, startIndex, endIndex) {
  const before = html.substring(Math.max(0, startIndex - 100), startIndex)
  const after = html.substring(endIndex, Math.min(html.length, endIndex + 100))
  // \b keeps <abbr>, <article>, ... from being mistaken for anchors
  return /<a\b[^>]*>/i.test(before + after)
}

/**
 * Check if position is inside an existing link
 * @param {string} html
 * @param {number} position - Index into `html` (not into extracted text).
 * @returns {boolean} True when more <a> tags were opened than closed before
 *   `position`.
 */
function isInsideLink(html, position) {
  const before = html.substring(0, position)
  const openTags = (before.match(/<a\b[^>]*>/gi) || []).length
  const closeTags = (before.match(/<\/a\s*>/gi) || []).length
  return openTags > closeTags
}

/**
 * Find keyword matches in content
 * @param {string} content - Text to search.
 * @param {string[]} keywords - Keywords matched as whole words.
 * @param {boolean} [caseSensitive=false]
 * @returns {{keyword: string, index: number, length: number, text: string}[]}
 *   Matches sorted by ascending index.
 */
function findKeywordMatches(content, keywords, caseSensitive = false) {
  const flags = caseSensitive ? "g" : "gi"
  const matches = []

  for (const keyword of keywords) {
    // An empty keyword would yield a zero-length pattern (\b\b); skip it.
    if (typeof keyword !== "string" || keyword === "") continue

    const regex = new RegExp(`\\b${escapeRegExp(keyword)}\\b`, flags)
    for (const match of content.matchAll(regex)) {
      matches.push({
        keyword,
        index: match.index,
        length: match[0].length,
        text: match[0],
      })
    }
  }

  // Sort by index
  matches.sort((a, b) => a.index - b.index)
  return matches
}

/**
 * Insert link into HTML content
 * Replaces html[startIndex, endIndex) with an anchor element. `url` and
 * `anchorText` are inserted verbatim (no escaping is applied).
 * @param {string} html
 * @param {number} startIndex
 * @param {number} endIndex
 * @param {string} url
 * @param {string} anchorText
 * @returns {string}
 */
function insertLink(html, startIndex, endIndex, url, anchorText) {
  const before = html.substring(0, startIndex)
  const after = html.substring(endIndex)
  return `${before}<a href="${url}">${anchorText}</a>${after}`
}

/**
 * Normalize a configured anchor text (string or array) to a non-empty array.
 * @param {string|string[]|undefined} anchorText
 * @param {*} fallback - Used as the only entry when nothing usable is configured.
 * @returns {Array}
 */
function normalizeAnchorTexts(anchorText, fallback) {
  const candidates = Array.isArray(anchorText) ? anchorText : [anchorText]
  const usable = candidates.filter(
    (text) => typeof text === "string" && text !== ""
  )
  return usable.length > 0 ? usable : [fallback]
}

/**
 * Generate internal links for a page
 * @param {object} page - Page with `slug` and HTML `content`.
 * @param {object[]} allPages - All pages (`slug`, `title`) that may be link targets.
 * @param {object} [config] - Reads `excludedPages`, `linkDensity`,
 *   `priorityLinks` and `keywordMappings`.
 * @param {Object<string, string>} [routeMapping] - Map of route -> slug.
 * @returns {object} Copy of `page` with updated `content` and a `linksAdded`
 *   count; the input page is not mutated.
 */
function generateLinksForPage(page, allPages, config, routeMapping) {
  const settings = config ?? {}

  if (!page.content || typeof page.content !== "string") {
    return { ...page, linksAdded: 0 }
  }

  // Skip excluded pages
  if (settings.excludedPages?.includes(page.slug)) {
    return { ...page, linksAdded: 0 }
  }

  // `??` so that an explicit 0 in the config is honoured
  const maxLinks = settings.linkDensity?.maxLinksPerPage ?? 10
  const minWordsBetween = settings.linkDensity?.minWordsBetweenLinks ?? 50
  const minCharsBetween = minWordsBetween * CHARS_PER_WORD_ESTIMATE

  // Build page index for quick lookup (Map: slugs are arbitrary strings)
  const pageIndex = new Map()
  for (const p of allPages) {
    pageIndex.set(p.slug, {
      slug: p.slug,
      title: p.title,
      url: getRouteForSlug(p.slug, routeMapping),
    })
  }

  let content = page.content
  let linksAdded = 0
  // Positions of links added so far, in extracted-text coordinates
  const linkPositions = []

  // Adds up to `limit` links to `url`, scanning matches in ascending order.
  // Text and matches are recomputed after every insertion so that all indices
  // refer to the current content.
  const linkTarget = (keywords, limit, pickAnchorText, url) => {
    let addedForTarget = 0
    let searchFrom = 0

    while (linksAdded < maxLinks && addedForTarget < limit) {
      const textContent = extractTextFromHTML(content)
      const candidates = findKeywordMatches(textContent, keywords)
      let inserted = false

      for (const match of candidates) {
        // Do not revisit text at or before the link inserted last round
        if (match.index < searchFrom) continue

        // Check minimum distance from other links
        const tooClose = linkPositions.some(
          (pos) => Math.abs(pos - match.index) < minCharsBetween
        )
        if (tooClose) continue

        // Find the actual position in HTML; occurrences inside tags or
        // existing links are rejected there, in HTML coordinates
        const htmlMatch = findKeywordInHTML(
          content,
          match.text,
          match.index,
          textContent
        )
        if (!htmlMatch) continue

        const anchorText = String(pickAnchorText(addedForTarget) ?? match.text)
        content = insertLink(
          content,
          htmlMatch.start,
          htmlMatch.end,
          url,
          anchorText
        )

        // The visible text changed length; shift later recorded positions.
        // (Approximate when the anchor text itself contains entities.)
        const delta = anchorText.length - match.length
        linkPositions.forEach((pos, i) => {
          if (pos > match.index) linkPositions[i] = pos + delta
        })
        linkPositions.push(match.index)

        searchFrom = match.index + anchorText.length
        linksAdded++
        addedForTarget++
        inserted = true
        break
      }

      if (!inserted) break
    }
  }

  // Process priority links first
  for (const [targetSlug, linkConfig] of Object.entries(
    settings.priorityLinks ?? {}
  )) {
    if (linksAdded >= maxLinks) break
    if (page.slug === targetSlug) continue // Don't link to self
    const target = pageIndex.get(targetSlug)
    if (!target) continue

    const keywords = Array.isArray(linkConfig?.keywords)
      ? linkConfig.keywords
      : []
    const maxLinksForThis = linkConfig?.maxLinks ?? 2
    const anchorTexts = normalizeAnchorTexts(
      linkConfig?.anchorText,
      target.title
    )

    linkTarget(
      keywords,
      maxLinksForThis,
      (n) => anchorTexts[n % anchorTexts.length],
      target.url
    )
  }

  // Process keyword mappings
  for (const [keyword, targetSlug] of Object.entries(
    settings.keywordMappings ?? {}
  )) {
    if (linksAdded >= maxLinks) break
    if (page.slug === targetSlug) continue
    const target = pageIndex.get(targetSlug)
    if (!target) continue

    // Only link first occurrence per keyword
    linkTarget([keyword], 1, () => target.title || keyword, target.url)
  }

  return {
    ...page,
    content,
    linksAdded,
  }
}

/**
 * Find keyword position in HTML accounting for tags
 * Searches the HTML for whole-word, case-insensitive occurrences of `keyword`
 * that are neither inside a tag nor inside an <a> element, and picks the one
 * whose preceding word count is closest to that of the text match.
 * @param {string} html - HTML to search.
 * @param {string} keyword - Matched text to locate.
 * @param {number} textIndex - Index of the match within `textContent`.
 * @param {string} textContent - Text extracted from `html`.
 * @returns {{start: number, end: number}|null} HTML span of the occurrence,
 *   or null when none lies within the word-count tolerance.
 */
function findKeywordInHTML(html, keyword, textIndex, textContent) {
  if (!keyword) return null

  const regex = new RegExp(`\\b${escapeRegExp(keyword)}\\b`, "gi")
  const targetWordCount = countWords(textContent.substring(0, textIndex))

  let best = null
  let bestDistance = WORD_COUNT_TOLERANCE
  let evaluated = 0

  for (const match of html.matchAll(regex)) {
    const start = match.index
    const end = start + match[0].length
    const htmlBefore = html.substring(0, start)

    // Skip if inside an HTML tag (e.g. an attribute value)
    if (htmlBefore.lastIndexOf("<") > htmlBefore.lastIndexOf(">")) continue

    // Skip if inside an existing link (checked against the whole prefix)
    if (isInsideLink(html, start)) continue

    // Compare words before this occurrence with words before the text match
    const wordCount = countWords(extractTextFromHTML(htmlBefore))
    const distance = Math.abs(wordCount - targetWordCount)
    if (distance === 0) {
      return { start, end }
    }
    if (distance < bestDistance) {
      best = { start, end }
      bestDistance = distance
    }

    // Limit search to the first candidates
    evaluated++
    if (evaluated > MAX_HTML_CANDIDATES) break
  }

  return best
}

/**
 * Main function
 * Loads data and config, adds links to every page, backs up the existing
 * output file and writes the updated data. Stops before writing when there
 * are no pages, because a failed load yields an empty page list and the
 * output path is the input file.
 */
function main() {
  console.log("Loading WordPress data...")
  const data = loadWordPressData()

  console.log("Loading configuration...")
  const config = loadConfig()

  const pages = Array.isArray(data?.pages) ? data.pages : []
  if (pages.length === 0) {
    console.error("No pages to process; leaving the existing output untouched.")
    return
  }

  // Load route mapping from the page.tsx file (simplified version)
  const routeMapping = {
    "services/repairs": "vending-machine-repairs",
    "services/moving": "vending-machine-repairs",
    "services/parts": "parts-and-support",
    services: "vending-machine-repairs",
    "vending-machines": "vending-machines",
    "vending-machines/machines-we-use": "vending-machines",
    "vending-machines/machines-for-sale": "vending-machines-for-sale-in-utah",
    warehouses:
      "streamlining-snack-and-beverage-access-in-warehouse-environments",
    "auto-repair":
      "enhancing-auto-repair-facilities-with-convenient-vending-solutions",
    gyms: "vending-machine-for-your-gym",
    "community-centers": "vending-for-your-community-centers",
    "dance-studios": "vending-machine-for-your-dance-studio",
    "car-washes": "vending-machines-for-your-car-wash",
    "food-and-beverage/healthy-options": "healthy-vending",
    "food-and-beverage/traditional-options": "traditional-vending",
    "food-and-beverage/suppliers":
      "diverse-vending-options-with-rocky-mountain-vendings-exclusive-wholesale-accounts",
    "about-us": "about-us",
    "about/faqs": "faqs",
  }

  console.log(`Processing ${pages.length} pages...`)

  // Single pass: collect the updated pages and the link total together
  const updatedPages = []
  let totalLinks = 0
  for (const page of pages) {
    // Remove linksAdded from final output
    const { linksAdded, ...pageWithoutLinksAdded } = generateLinksForPage(
      page,
      pages,
      config,
      routeMapping
    )
    if (linksAdded > 0) {
      console.log(`  ✓ ${page.slug}: Added ${linksAdded} link(s)`)
    }
    totalLinks += linksAdded
    updatedPages.push(pageWithoutLinksAdded)
  }

  console.log(`\nTotal links added: ${totalLinks}`)

  // Write updated data
  const updatedData = {
    ...data,
    pages: updatedPages,
  }

  // Create backup if file exists
  if (fs.existsSync(OUTPUT_PATH)) {
    const backupPath = `${OUTPUT_PATH}.backup.${Date.now()}`
    fs.copyFileSync(OUTPUT_PATH, backupPath)
    console.log(`\nBackup created: ${backupPath}`)
  } else {
    console.log("\nNo existing file to backup (creating new file)")
  }

  // Write updated file
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(updatedData, null, 2))
  console.log(`\nUpdated file written: ${OUTPUT_PATH}`)
  console.log("\nDone!")
}

// Run if called directly. pathToFileURL handles Windows paths and characters
// that need percent-encoding, which a hand-built "file://" string does not.
const invokedPath = process.argv[1]
if (
  invokedPath &&
  (import.meta.url === pathToFileURL(invokedPath).href ||
    invokedPath.endsWith("generate-internal-links.js"))
) {
  main()
}

export { main, generateLinksForPage, loadWordPressData, loadConfig }