Rocky_Mountain_Vending/scripts/generate-internal-links.js

444 lines
12 KiB
JavaScript

import fs from "fs"
import path from "path"
import { fileURLToPath } from "url"
// Absolute file path and directory of this module, derived from import.meta.url
// (ES modules do not provide the CommonJS __filename/__dirname globals).
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)
// Directory two levels above the directory that contains this script.
// NOTE(review): the paths below reach lib/ with a single "../", which suggests
// the project root is only one level up — confirm. This constant is not
// referenced anywhere in the visible code.
const PROJECT_ROOT = path.join(__dirname, "../..")
// Input: processed WordPress content (JSON) that loadWordPressData() reads.
const WORDPRESS_DATA_PATH = path.join(
__dirname,
"../lib/wordpress-data/processed-content.json"
)
// Internal-linking rules (JSON) that loadConfig() reads.
const CONFIG_PATH = path.join(__dirname, "../lib/internal-links-config.json")
// Output: resolves to the same file as WORDPRESS_DATA_PATH, so the script
// rewrites its input in place (main() copies it to a timestamped backup first).
const OUTPUT_PATH = path.join(
__dirname,
"../lib/wordpress-data/processed-content.json"
)
/**
 * Read and parse the processed WordPress content file.
 *
 * @returns {object} The parsed JSON document. If the file cannot be read or
 *   parsed, the error is logged and `{ pages: [], posts: [] }` is returned
 *   instead of throwing.
 */
function loadWordPressData() {
  let parsed
  try {
    const raw = fs.readFileSync(WORDPRESS_DATA_PATH, "utf8")
    parsed = JSON.parse(raw)
  } catch (error) {
    console.error("Error loading WordPress data:", error)
    parsed = { pages: [], posts: [] }
  }
  return parsed
}
/**
 * Read and parse the internal-links configuration file.
 *
 * @returns {object} The parsed configuration. If the file cannot be read or
 *   parsed, the error is logged and an empty object is returned instead of
 *   throwing.
 */
function loadConfig() {
  let parsed
  try {
    const raw = fs.readFileSync(CONFIG_PATH, "utf8")
    parsed = JSON.parse(raw)
  } catch (error) {
    console.error("Error loading config:", error)
    parsed = {}
  }
  return parsed
}
/**
 * Resolve the site route for a page slug.
 *
 * @param {string} slug - Page slug to look up.
 * @param {Object<string, string>} routeMapping - Map of route (without the
 *   leading slash) to page slug.
 * @returns {string} "/" + the first route whose mapped slug equals `slug`
 *   (in the mapping's own key order), or "/" + slug when no route maps to it.
 */
function getRouteForSlug(slug, routeMapping) {
  const firstHit = Object.entries(routeMapping).find(
    ([, mappedSlug]) => mappedSlug === slug
  )
  // No mapped route: fall back to a slug-based route
  return firstHit ? `/${firstHit[0]}` : `/${slug}`
}
/**
 * Extract the visible text from an HTML fragment.
 *
 * Drops <script> and <style> elements together with their contents, replaces
 * every remaining tag with a space, decodes a small fixed set of HTML entities
 * and collapses each run of whitespace into a single space.
 *
 * @param {string} html - HTML markup to convert.
 * @returns {string} Trimmed plain text; "" for empty or non-string input.
 */
function extractTextFromHTML(html) {
  if (!html || typeof html !== "string") return ""

  // Supported entities, keyed by the text between "&" and ";"
  const entityReplacements = {
    nbsp: " ",
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    "#8217": "'",
    "#8211": "-",
    "#8212": "—",
    hellip: "...",
  }

  const withoutTags = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ")

  // Decode in ONE pass. Chained replaces decode twice: "&amp;lt;" would first
  // become "&lt;" and then "<", although the source text means a literal "&lt;".
  const decoded = withoutTags.replace(
    /&(nbsp|amp|lt|gt|quot|#8217|#8211|#8212|hellip);/g,
    (_entity, name) => entityReplacements[name]
  )

  return decoded.replace(/\s+/g, " ").trim()
}
/**
 * Check whether an opening <a> tag lies within 100 characters before
 * `startIndex` or within 100 characters after `endIndex`.
 *
 * @param {string} html - HTML being inspected.
 * @param {number} startIndex - Offset where the span of interest starts.
 * @param {number} endIndex - Offset just past the span of interest.
 * @returns {boolean} True when a complete opening anchor tag is nearby.
 */
function hasExistingLink(html, startIndex, endIndex) {
  const windowSize = 100
  const before = html.substring(Math.max(0, startIndex - windowSize), startIndex)
  const after = html.substring(
    endIndex,
    Math.min(html.length, endIndex + windowSize)
  )
  // Require whitespace or ">" right after "<a" so that <abbr>, <article>,
  // <aside>, <address>, ... are not mistaken for anchors.
  const anchorOpenTag = /<a(?:\s[^>]*)?>/i
  // Test each side separately: joining them could fabricate a tag that spans
  // the seam between the two windows.
  return anchorOpenTag.test(before) || anchorOpenTag.test(after)
}
/**
 * Check whether an offset in an HTML string falls inside an <a>…</a> element.
 *
 * Counts opening and closing anchor tags in everything before `position`; the
 * position is inside a link when more anchors were opened than closed.
 *
 * @param {string} html - HTML being inspected.
 * @param {number} position - Offset into `html`.
 * @returns {boolean} True when an anchor is still open at `position`.
 */
function isInsideLink(html, position) {
  const before = html.substring(0, position)
  // "<a" must be followed by whitespace or ">" so that tags which merely start
  // with "a" (<article>, <abbr>, <aside>, ...) are not counted as links — they
  // never have a matching </a> and would make every later position look linked.
  const openTags = (before.match(/<a(?:\s[^>]*)?>/gi) || []).length
  const closeTags = (before.match(/<\/a\s*>/gi) || []).length
  return openTags > closeTags
}
/**
 * Find every whole-word occurrence of the given keywords in a text.
 *
 * @param {string} content - Text to search.
 * @param {string[]|string} keywords - Keywords to look for; a single string is
 *   treated as a one-element list. Empty, whitespace-only and non-string
 *   entries are ignored.
 * @param {boolean} [caseSensitive=false] - Match case exactly when true.
 * @returns {{keyword: string, index: number, length: number, text: string}[]}
 *   One entry per occurrence, ordered by position in `content`. `text` is the
 *   matched substring as it appears in `content`.
 */
function findKeywordMatches(content, keywords, caseSensitive = false) {
  if (typeof content !== "string") return []

  let keywordList = []
  if (Array.isArray(keywords)) {
    keywordList = keywords
  } else if (typeof keywords === "string") {
    keywordList = [keywords]
  }

  const flags = caseSensitive ? "g" : "gi"
  const matches = []

  for (const keyword of keywordList) {
    // An empty keyword would compile to /\b\b/, which matches zero characters
    // at every word boundary and can never describe a linkable span.
    if (typeof keyword !== "string" || keyword.trim() === "") continue

    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
    const regex = new RegExp(`\\b${escaped}\\b`, flags)

    for (const match of content.matchAll(regex)) {
      matches.push({
        keyword,
        index: match.index,
        length: match[0].length,
        text: match[0],
      })
    }
  }

  // Order by position in the text
  matches.sort((a, b) => a.index - b.index)
  return matches
}
/**
 * Replace a span of an HTML string with an anchor element.
 *
 * @param {string} html - Source HTML.
 * @param {number} startIndex - Offset of the first character to replace.
 * @param {number} endIndex - Offset just past the last character to replace.
 * @param {string} url - Value for the anchor's href attribute (inserted as-is).
 * @param {string} anchorText - Anchor content (inserted as-is).
 * @returns {string} `html` with [startIndex, endIndex) replaced by the link.
 */
function insertLink(html, startIndex, endIndex, url, anchorText) {
  const head = html.substring(0, startIndex)
  const tail = html.substring(endIndex)
  return `${head}<a href="${url}">${anchorText}</a>${tail}`
}
/**
 * Add internal links to one page's HTML content.
 *
 * Works in two phases, both limited by the per-page link cap:
 *   1. `config.priorityLinks` — per target slug: keywords, a per-target link
 *      limit (`maxLinks`, default 2) and anchor texts that are cycled through.
 *   2. `config.keywordMappings` — keyword → target slug; only the first
 *      linkable occurrence of each keyword is linked, using the target page's
 *      title (or the keyword itself) as anchor text.
 *
 * The matched keyword text is replaced by the anchor element, so the visible
 * copy becomes the anchor text. Pages never link to themselves, targets that
 * are not in `allPages` are skipped, and pages listed in
 * `config.excludedPages` or without string content are returned untouched.
 *
 * @param {object} page - Page with at least `slug` and `content`.
 * @param {object[]} allPages - All pages (each with `slug` and `title`); the
 *   set of possible link targets.
 * @param {object} config - Link configuration (`excludedPages`, `linkDensity`,
 *   `priorityLinks`, `keywordMappings`; all optional).
 * @param {Object<string, string>} routeMapping - Route → slug map used to
 *   build target URLs.
 * @returns {object} Copy of `page` with updated `content` (when links were
 *   added) plus `linksAdded`, the number of links inserted.
 */
function generateLinksForPage(page, allPages, config, routeMapping) {
  if (!page.content || typeof page.content !== "string") {
    return { ...page, linksAdded: 0 }
  }
  // Skip excluded pages
  if (config.excludedPages && config.excludedPages.includes(page.slug)) {
    return { ...page, linksAdded: 0 }
  }

  // `??` rather than `||` so an explicit 0 in the config is honoured
  const maxLinks = config.linkDensity?.maxLinksPerPage ?? 10
  const minWordsBetween = config.linkDensity?.minWordsBetweenLinks ?? 50
  // Rough estimate: 5 characters per word
  const minCharsBetween = minWordsBetween * 5

  // Slug → target info. A Map (not a plain object) so that slugs such as
  // "constructor" cannot collide with inherited object properties.
  const pageIndex = new Map()
  for (const candidate of allPages) {
    pageIndex.set(candidate.slug, {
      slug: candidate.slug,
      title: candidate.title,
      url: getRouteForSlug(candidate.slug, routeMapping),
    })
  }

  let content = page.content
  let linksAdded = 0
  // Offsets of inserted links, measured in the extracted plain text (the same
  // coordinate system as `match.index`) so the spacing check compares like
  // with like. Approximate: later insertions shift the text slightly whenever
  // an anchor text is longer or shorter than the keyword it replaced.
  const linkPositions = []

  // Try to turn one plain-text keyword match into a link; true on success.
  const tryInsertLink = (match, textContent, url, anchorText) => {
    const tooClose = linkPositions.some(
      (pos) => Math.abs(pos - match.index) < minCharsBetween
    )
    if (tooClose) return false

    // Translate the plain-text match into an offset range in the HTML
    const htmlMatch = findKeywordInHTML(
      content,
      match.text,
      match.index,
      textContent
    )
    if (!htmlMatch) return false

    // The existing-link check must use the HTML offset, not the text offset
    if (isInsideLink(content, htmlMatch.start)) return false

    content = insertLink(
      content,
      htmlMatch.start,
      htmlMatch.end,
      url,
      anchorText
    )
    linkPositions.push(match.index)
    linksAdded++
    return true
  }

  // Phase 1: priority links
  if (config.priorityLinks) {
    for (const [targetSlug, linkConfig] of Object.entries(
      config.priorityLinks
    )) {
      if (linksAdded >= maxLinks) break
      if (page.slug === targetSlug) continue // Don't link to self
      const target = pageIndex.get(targetSlug)
      if (!target) continue

      const keywords = linkConfig.keywords || []
      const maxLinksForThis = linkConfig.maxLinks ?? 2

      // Accept an array or a single string; fall back to the target's title
      // when nothing usable is configured (an empty list would otherwise
      // produce the literal anchor text "undefined").
      const configuredAnchors = linkConfig.anchorText
      let anchorTexts = []
      if (Array.isArray(configuredAnchors)) {
        anchorTexts = configuredAnchors.filter(Boolean)
      } else if (configuredAnchors) {
        anchorTexts = [configuredAnchors]
      }
      if (anchorTexts.length === 0) {
        anchorTexts = [target.title]
      }

      const textContent = extractTextFromHTML(content)
      const matches = findKeywordMatches(textContent, keywords)
      let linksAddedForThis = 0
      for (const match of matches) {
        if (linksAdded >= maxLinks || linksAddedForThis >= maxLinksForThis) {
          break
        }
        const anchorText = anchorTexts[linksAddedForThis % anchorTexts.length]
        if (tryInsertLink(match, textContent, target.url, anchorText)) {
          linksAddedForThis++
        }
      }
    }
  }

  // Phase 2: keyword mappings
  if (config.keywordMappings && linksAdded < maxLinks) {
    let textContent = extractTextFromHTML(content)
    for (const [keyword, targetSlug] of Object.entries(
      config.keywordMappings
    )) {
      if (linksAdded >= maxLinks) break
      if (page.slug === targetSlug) continue
      const target = pageIndex.get(targetSlug)
      if (!target) continue

      const anchorText = target.title || keyword
      const matches = findKeywordMatches(textContent, [keyword])
      for (const match of matches) {
        if (tryInsertLink(match, textContent, target.url, anchorText)) {
          // Keep the text snapshot in sync with the modified HTML so matches
          // for the following keywords are located against current content.
          textContent = extractTextFromHTML(content)
          break // Only link first occurrence per keyword
        }
      }
    }
  }

  return {
    ...page,
    content,
    linksAdded,
  }
}
/**
 * Locate, in raw HTML, the occurrence of a keyword that corresponds to a match
 * found in the extracted plain text.
 *
 * Scans whole-word, case-insensitive occurrences of `keyword` in `html`, skips
 * those inside a tag or inside an existing link, and returns the first one
 * whose preceding word count (after stripping markup) is within 10 words of
 * the plain-text match's preceding word count.
 *
 * @param {string} html - HTML to search.
 * @param {string} keyword - Text to find (matched literally).
 * @param {number} textIndex - Offset of the match within `textContent`.
 * @param {string} textContent - Plain text in which the match was found.
 * @returns {{start: number, end: number}|null} Offsets of the occurrence in
 *   `html` (`end` exclusive), or null when no suitable occurrence is found
 *   before 21 candidates have been rejected on word count.
 */
function findKeywordInHTML(html, keyword, textIndex, textContent) {
  // Nothing to locate; an empty keyword would also only yield zero-length matches
  if (typeof html !== "string" || typeof keyword !== "string" || !keyword) {
    return null
  }

  // Whole-word, case-insensitive search with the keyword taken literally
  const keywordEscaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
  const regex = new RegExp(`\\b${keywordEscaped}\\b`, "gi")

  const wordCountBefore = textContent.substring(0, textIndex).split(/\s+/)
    .length

  let rejectedCandidates = 0
  for (const match of html.matchAll(regex)) {
    const matchStart = match.index
    const matchEnd = matchStart + match[0].length
    const htmlBeforeMatch = html.substring(0, matchStart)

    // Skip if inside an HTML tag (e.g. an attribute value): the nearest "<"
    // to the left has not been closed by a ">" yet. The whole prefix is
    // examined — a fixed-size look-behind misses tags with long attributes.
    if (htmlBeforeMatch.lastIndexOf("<") > htmlBeforeMatch.lastIndexOf(">")) {
      continue
    }

    // Skip if inside an existing link, again judged over the whole prefix so
    // anchors with long href values are not missed.
    if (isInsideLink(html, matchStart)) {
      continue
    }

    // Count words before this match in the HTML, ignoring markup
    const wordCount = extractTextFromHTML(htmlBeforeMatch).split(/\s+/).length

    // If this match is close to our target word count, use it
    if (Math.abs(wordCount - wordCountBefore) < 10) {
      return {
        start: matchStart,
        end: matchEnd,
      }
    }

    // Give up once more than 20 candidates have been rejected
    rejectedCandidates++
    if (rejectedCandidates > 20) break
  }

  return null
}
/**
 * Script entry point: load the WordPress data and the link configuration, add
 * internal links to every page, back up the existing output file and write the
 * updated data.
 *
 * When no pages could be loaded the function returns without writing, so a
 * failed load never replaces the data file with an empty page list.
 */
function main() {
  console.log("Loading WordPress data...")
  const data = loadWordPressData()
  console.log("Loading configuration...")
  const config = loadConfig()

  // Hard-coded route → page-slug mapping (described in the original comment as
  // a simplified version of the mapping in page.tsx).
  const routeMapping = {
    "services/repairs": "vending-machine-repairs",
    "services/moving": "vending-machine-repairs",
    "services/parts": "parts-and-support",
    services: "vending-machine-repairs",
    "vending-machines": "vending-machines",
    "vending-machines/machines-we-use": "vending-machines",
    "vending-machines/machines-for-sale": "vending-machines-for-sale-in-utah",
    warehouses:
      "streamlining-snack-and-beverage-access-in-warehouse-environments",
    "auto-repair":
      "enhancing-auto-repair-facilities-with-convenient-vending-solutions",
    gyms: "vending-machine-for-your-gym",
    "community-centers": "vending-for-your-community-centers",
    "dance-studios": "vending-machine-for-your-dance-studio",
    "car-washes": "vending-machines-for-your-car-wash",
    "food-and-beverage/healthy-options": "healthy-vending",
    "food-and-beverage/traditional-options": "traditional-vending",
    "food-and-beverage/suppliers":
      "diverse-vending-options-with-rocky-mountain-vendings-exclusive-wholesale-accounts",
    "about-us": "about-us",
    "about/faqs": "faqs",
  }

  const pages = Array.isArray(data.pages) ? data.pages : []
  if (pages.length === 0) {
    // Either the load failed (already logged by the loader) or there is
    // nothing to link. Writing now would only risk clobbering the data file.
    console.warn("\nNo pages to process; output file left unchanged.")
    return
  }

  console.log(`Processing ${pages.length} pages...`)
  // Process each page once, accumulating the total as we go
  const updatedPages = []
  let totalLinks = 0
  for (const page of pages) {
    // Split off linksAdded so it is not persisted in the final output
    const { linksAdded, ...pageWithoutLinksAdded } = generateLinksForPage(
      page,
      pages,
      config,
      routeMapping
    )
    if (linksAdded > 0) {
      console.log(`${page.slug}: Added ${linksAdded} link(s)`)
      totalLinks += linksAdded
    }
    updatedPages.push(pageWithoutLinksAdded)
  }
  console.log(`\nTotal links added: ${totalLinks}`)

  const updatedData = {
    ...data,
    pages: updatedPages,
  }

  // Create backup if file exists
  if (fs.existsSync(OUTPUT_PATH)) {
    const backupPath = OUTPUT_PATH + ".backup." + Date.now()
    fs.copyFileSync(OUTPUT_PATH, backupPath)
    console.log(`\nBackup created: ${backupPath}`)
  } else {
    console.log("\nNo existing file to backup (creating new file)")
  }

  // Write updated file
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(updatedData, null, 2))
  console.log(`\nUpdated file written: ${OUTPUT_PATH}`)
  console.log("\nDone!")
}
// Execute main() only when this file is the script Node was started with
// (either the module URL equals the entry script's file URL, or the entry
// script's path ends with this file's name), not when it is merely imported.
const entryScriptPath = process.argv[1]
const isInvokedDirectly =
  import.meta.url === `file://${entryScriptPath}` ||
  entryScriptPath?.endsWith("generate-internal-links.js")
if (isInvokedDirectly) {
  main()
}
export { main, generateLinksForPage, loadWordPressData, loadConfig }