444 lines
12 KiB
JavaScript
444 lines
12 KiB
JavaScript
import fs from "fs"
|
|
import path from "path"
|
|
import { fileURLToPath } from "url"
|
|
|
|
// ES modules have no built-in __filename/__dirname, so both are derived
// from this module's own URL.
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)

// Project root: two directory levels above the directory containing this file
const PROJECT_ROOT = path.join(__dirname, "../..")

// Processed WordPress content that the script reads
const WORDPRESS_DATA_PATH = path.join(
  __dirname,
  "../lib/wordpress-data/processed-content.json"
)

// Internal-linking configuration
const CONFIG_PATH = path.join(__dirname, "../lib/internal-links-config.json")

// The result is written to the very same file the content was read from
const OUTPUT_PATH = WORDPRESS_DATA_PATH
|
|
|
|
/**
 * Read and parse the WordPress data file at WORDPRESS_DATA_PATH.
 *
 * @returns {object} The parsed JSON, or `{ pages: [], posts: [] }` when the
 *   file cannot be read or parsed (the error is logged, not rethrown).
 */
function loadWordPressData() {
  let parsed
  try {
    const raw = fs.readFileSync(WORDPRESS_DATA_PATH, "utf8")
    parsed = JSON.parse(raw)
  } catch (error) {
    console.error("Error loading WordPress data:", error)
    // Fall back to an empty data set so callers always get the same shape
    parsed = { pages: [], posts: [] }
  }
  return parsed
}
|
|
|
|
/**
 * Read and parse the internal-links configuration file at CONFIG_PATH.
 *
 * @returns {object} The parsed JSON, or an empty object when the file cannot
 *   be read or parsed (the error is logged, not rethrown).
 */
function loadConfig() {
  let parsed
  try {
    const raw = fs.readFileSync(CONFIG_PATH, "utf8")
    parsed = JSON.parse(raw)
  } catch (error) {
    console.error("Error loading config:", error)
    // Fall back to an empty configuration
    parsed = {}
  }
  return parsed
}
|
|
|
|
/**
 * Resolve the site route for a page slug.
 *
 * @param {string} slug - Page slug to look up.
 * @param {Object<string, string>} routeMapping - Map of route -> slug.
 * @returns {string} `/<route>` for the first mapping entry whose value equals
 *   the slug, otherwise `/<slug>`.
 */
function getRouteForSlug(slug, routeMapping) {
  // Reverse lookup: the mapping is keyed by route, so search its values
  const hit = Object.entries(routeMapping).find(
    ([, mappedSlug]) => mappedSlug === slug
  )
  if (hit) {
    const [route] = hit
    return `/${route}`
  }

  // No explicit route: the slug itself is the route
  return `/${slug}`
}
|
|
|
|
/**
 * Extract text content from HTML (removing tags)
 *
 * Drops `<script>`/`<style>` elements with their contents, replaces every
 * remaining tag with a space, decodes a fixed set of HTML entities and
 * collapses whitespace.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Plain text; "" when `html` is empty or not a string.
 */
function extractTextFromHTML(html) {
  if (!html || typeof html !== "string") return ""

  // Entity source form -> replacement text. The keys must be the *encoded*
  // forms: a rule such as "<" -> "<" would be a no-op. The literal
  // typographic characters are normalised the same way as their entities.
  const entityText = {
    "&nbsp;": " ",
    "&#160;": " ",
    "&amp;": "&",
    "&#038;": "&",
    "&#38;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&#8217;": "'",
    "&rsquo;": "'",
    "\u2019": "'",
    "&#8211;": "-",
    "&ndash;": "-",
    "\u2013": "-",
    "&#8212;": "\u2014",
    "&mdash;": "\u2014",
    "&#8230;": "...",
    "&hellip;": "...",
    "\u2026": "...",
  }
  // None of the keys contains a regex metacharacter, so they can be joined
  // into an alternation as-is.
  const entityPattern = new RegExp(Object.keys(entityText).join("|"), "g")

  return (
    html
      // Remove script and style elements including their contents
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
      // Remove the remaining tags but keep the text between them
      .replace(/<[^>]+>/g, " ")
      // Decode entities in a single pass so the output of one replacement is
      // never decoded again ("&amp;lt;" becomes "&lt;", not "<")
      .replace(entityPattern, (entity) => entityText[entity])
      // Clean up whitespace
      .replace(/\s+/g, " ")
      .trim()
  )
}
|
|
|
|
/**
 * Check if text already contains a link
 *
 * Looks for an opening `<a>` tag within 100 characters before `startIndex`
 * or within 100 characters after `endIndex`.
 *
 * @param {string} html - HTML being inspected.
 * @param {number} startIndex - Start offset of the span of interest.
 * @param {number} endIndex - End offset (exclusive) of the span of interest.
 * @returns {boolean} True when an opening anchor tag is found nearby.
 */
function hasExistingLink(html, startIndex, endIndex) {
  const before = html.substring(Math.max(0, startIndex - 100), startIndex)
  const after = html.substring(endIndex, Math.min(html.length, endIndex + 100))

  // \b keeps other tags that merely start with "a" (<article>, <aside>,
  // <abbr>, <audio>, ...) from being counted as links.
  const linkRegex = /<a\b[^>]*>/i
  return linkRegex.test(before + after)
}
|
|
|
|
/**
 * Check if position is inside an existing link
 *
 * Counts the opening and closing anchor tags that occur before `position`;
 * more openings than closings means an `<a>` element is still open there.
 *
 * @param {string} html - HTML being inspected.
 * @param {number} position - Offset into `html`.
 * @returns {boolean} True when `position` falls inside an unclosed `<a>`.
 */
function isInsideLink(html, position) {
  const before = html.substring(0, position)

  // \b is required: without it "<article>", "<aside>", "<abbr>" etc. count
  // as link openings that never get a matching "</a>", which would flag
  // everything after them as being inside a link.
  const openTags = (before.match(/<a\b[^>]*>/gi) || []).length
  const closeTags = (before.match(/<\/a>/gi) || []).length
  return openTags > closeTags
}
|
|
|
|
/**
 * Find keyword matches in content
 *
 * Each keyword is matched literally (regex metacharacters are escaped) and
 * as a whole word (`\b` on both sides).
 *
 * @param {string} content - Text to search.
 * @param {string[]} keywords - Keywords to look for. Entries that are not
 *   strings or are blank are ignored.
 * @param {boolean} [caseSensitive=false] - Match case-sensitively when true.
 * @returns {{keyword: string, index: number, length: number, text: string}[]}
 *   All matches, ordered by their offset in `content`.
 */
function findKeywordMatches(content, keywords, caseSensitive = false) {
  if (typeof content !== "string" || !Array.isArray(keywords)) return []

  const flags = caseSensitive ? "g" : "gi"
  const matches = []

  for (const keyword of keywords) {
    // A blank keyword would compile to /\b\b/, which matches the empty
    // string at every word boundary instead of a real keyword.
    if (typeof keyword !== "string" || keyword.trim() === "") continue

    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
    const regex = new RegExp(`\\b${escaped}\\b`, flags)

    // matchAll always advances past a match, unlike a manual exec() loop
    for (const match of content.matchAll(regex)) {
      matches.push({
        keyword,
        index: match.index,
        length: match[0].length,
        text: match[0],
      })
    }
  }

  // Sort by index
  matches.sort((a, b) => a.index - b.index)
  return matches
}
|
|
|
|
/**
 * Replace a span of HTML with an anchor element.
 *
 * @param {string} html - Source HTML.
 * @param {number} startIndex - Offset where the replaced span begins.
 * @param {number} endIndex - Offset where the replaced span ends (exclusive).
 * @param {string} url - Value written into the `href` attribute.
 * @param {string} anchorText - Text placed inside the anchor; it replaces
 *   whatever occupied `startIndex..endIndex`.
 * @returns {string} The HTML with the span swapped for the new link.
 */
function insertLink(html, startIndex, endIndex, url, anchorText) {
  const head = html.substring(0, startIndex)
  const tail = html.substring(endIndex)
  return [head, `<a href="${url}">${anchorText}</a>`, tail].join("")
}
|
|
|
|
/**
 * Generate internal links for a page
 *
 * Links are added in two passes: first the targets listed in
 * `config.priorityLinks`, then the single keywords in
 * `config.keywordMappings` (at most one link per keyword). A match is only
 * linked when it can be located in the HTML, is far enough away from every
 * link added so far and is not inside an existing `<a>` element. The matched
 * text is replaced by the anchor text.
 *
 * @param {object} page - Page to process; reads `slug` and `content`.
 * @param {object[]} allPages - All pages (`slug`, `title`) that may be linked to.
 * @param {object} config - Link configuration. Reads `excludedPages`,
 *   `linkDensity.maxLinksPerPage` (default 10),
 *   `linkDensity.minWordsBetweenLinks` (default 50), `priorityLinks` and
 *   `keywordMappings`.
 * @param {Object<string, string>} routeMapping - Map of route -> slug used to
 *   build link URLs.
 * @returns {object} A copy of `page` with the (possibly) updated `content`
 *   and a `linksAdded` count.
 */
function generateLinksForPage(page, allPages, config, routeMapping) {
  if (!page.content || typeof page.content !== "string") {
    return { ...page, linksAdded: 0 }
  }

  // Skip excluded pages
  if (config.excludedPages && config.excludedPages.includes(page.slug)) {
    return { ...page, linksAdded: 0 }
  }

  let content = page.content
  let linksAdded = 0
  // HTML offsets (into the current `content`) of every link added so far
  const linkPositions = []
  // `??` rather than `||` so that an explicitly configured 0 is honoured
  const maxLinks = config.linkDensity?.maxLinksPerPage ?? 10
  const minWordsBetween = config.linkDensity?.minWordsBetweenLinks ?? 50
  const minCharsBetween = minWordsBetween * 5 // Rough estimate: 5 chars per word

  // Build page index for quick lookup. A Map avoids false hits on inherited
  // object keys such as "constructor".
  const pageIndex = new Map()
  for (const p of allPages) {
    pageIndex.set(p.slug, {
      slug: p.slug,
      title: p.title,
      url: getRouteForSlug(p.slug, routeMapping),
    })
  }

  // Try to turn one plain-text match into a link; returns true on success.
  const tryInsertLink = (match, textContent, url, anchorText) => {
    // Locate the match in the HTML first: the spacing and inside-link checks
    // below must use HTML offsets, which differ from plain-text offsets.
    const htmlMatch = findKeywordInHTML(
      content,
      match.text,
      match.index,
      textContent
    )
    if (!htmlMatch) return false

    // Check minimum distance from other links
    const tooClose = linkPositions.some(
      (pos) => Math.abs(pos - htmlMatch.start) < minCharsBetween
    )
    if (tooClose) return false

    // Check if already inside a link
    if (isInsideLink(content, htmlMatch.start)) return false

    const lengthBefore = content.length
    content = insertLink(
      content,
      htmlMatch.start,
      htmlMatch.end,
      url,
      anchorText
    )

    // Links located after the insertion point have moved; keep their
    // recorded offsets in step with the new content.
    const shift = content.length - lengthBefore
    for (let i = 0; i < linkPositions.length; i++) {
      if (linkPositions[i] > htmlMatch.start) linkPositions[i] += shift
    }
    linkPositions.push(htmlMatch.start)
    linksAdded++
    return true
  }

  // Process priority links first
  if (config.priorityLinks) {
    for (const [targetSlug, linkConfig] of Object.entries(
      config.priorityLinks
    )) {
      if (linksAdded >= maxLinks) break
      if (page.slug === targetSlug) continue // Don't link to self
      const target = pageIndex.get(targetSlug)
      if (!target) continue

      const keywords = linkConfig?.keywords || []
      const maxLinksForThis = linkConfig?.maxLinks ?? 2
      // Accept a single string as well as an array, and fall back to the
      // target title when no usable anchor text is configured.
      const configuredAnchors = [linkConfig?.anchorText ?? []]
        .flat()
        .filter(Boolean)
      const anchorTexts =
        configuredAnchors.length > 0 ? configuredAnchors : [target.title]

      const textContent = extractTextFromHTML(content)
      const matches = findKeywordMatches(textContent, keywords)

      let linksAddedForThis = 0
      for (const match of matches) {
        if (linksAdded >= maxLinks || linksAddedForThis >= maxLinksForThis)
          break

        // Rotate through the anchor texts; keep the matched text itself if
        // the target has no title.
        const anchorText =
          anchorTexts[linksAddedForThis % anchorTexts.length] ?? match.text
        if (tryInsertLink(match, textContent, target.url, anchorText)) {
          linksAddedForThis++
        }
      }
    }
  }

  // Process keyword mappings
  if (config.keywordMappings && linksAdded < maxLinks) {
    // Plain text of the current content; null means "needs (re)extraction"
    let textContent = null

    for (const [keyword, targetSlug] of Object.entries(
      config.keywordMappings
    )) {
      if (linksAdded >= maxLinks) break
      if (page.slug === targetSlug) continue
      const target = pageIndex.get(targetSlug)
      if (!target) continue

      if (textContent === null) textContent = extractTextFromHTML(content)
      const matches = findKeywordMatches(textContent, [keyword])

      for (const match of matches) {
        const anchorText = target.title || keyword
        if (tryInsertLink(match, textContent, target.url, anchorText)) {
          // The content changed, so the extracted text is stale
          textContent = null
          break // Only link first occurrence per keyword
        }
      }
    }
  }

  return {
    ...page,
    content,
    linksAdded,
  }
}
|
|
|
|
/**
 * Find keyword position in HTML accounting for tags
 *
 * The keyword was found in the plain text at `textIndex`; this maps that
 * occurrence back to the HTML by looking for a whole-word, case-insensitive
 * occurrence in `html` that has roughly the same number of words in front of
 * it (difference below 10). Occurrences inside a tag (e.g. an attribute
 * value) or inside an existing `<a>` element are ignored.
 *
 * @param {string} html - HTML to search.
 * @param {string} keyword - Text to locate.
 * @param {number} textIndex - Offset of the match within `textContent`.
 * @param {string} textContent - Plain text in which the match was found.
 * @returns {{start: number, end: number}|null} HTML offsets of the
 *   occurrence (end exclusive), or null when none qualifies.
 */
function findKeywordInHTML(html, keyword, textIndex, textContent) {
  if (
    typeof html !== "string" ||
    typeof textContent !== "string" ||
    typeof keyword !== "string" ||
    // A blank keyword would compile to /\b\b/ and match empty strings
    keyword.trim() === ""
  ) {
    return null
  }

  // Create a regex to find the keyword as a whole word, case-insensitive
  const keywordEscaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
  const regex = new RegExp(`\\b${keywordEscaped}\\b`, "gi")

  // Number of words in front of the match in the plain text
  const wordCountBefore = textContent.substring(0, textIndex).split(/\s+/)
    .length

  let candidatesChecked = 0

  for (const match of html.matchAll(regex)) {
    const matchStart = match.index
    const matchEnd = matchStart + match[0].length
    const htmlBeforeMatch = html.substring(0, matchStart)

    // Skip if inside an HTML tag: the nearest "<" before the match has not
    // been closed by a ">" yet. The whole prefix is inspected because a
    // fixed-size look-behind window may still contain the ">" of an earlier
    // tag.
    if (htmlBeforeMatch.lastIndexOf("<") > htmlBeforeMatch.lastIndexOf(">")) {
      continue
    }

    // Skip if inside an existing link. Counting over the whole prefix also
    // catches links whose opening tag is long; \b stops <article>, <aside>
    // etc. from being counted as anchors.
    const openLinks = (htmlBeforeMatch.match(/<a\b[^>]*>/gi) || []).length
    const closeLinks = (htmlBeforeMatch.match(/<\/a>/gi) || []).length
    if (openLinks > closeLinks) {
      continue
    }

    // Count words before this match in HTML
    const textBefore = extractTextFromHTML(htmlBeforeMatch)
    const wordCount = textBefore.split(/\s+/).length

    // If this match is close to our target word count, use it
    if (Math.abs(wordCount - wordCountBefore) < 10) {
      return {
        start: matchStart,
        end: matchEnd,
      }
    }

    // Limit search to first 20 matches
    candidatesChecked++
    if (candidatesChecked > 20) break
  }

  return null
}
|
|
|
|
/**
 * Main function
 *
 * Loads the WordPress data and the link configuration, adds internal links
 * to every page, backs up the existing output file (if any) and writes the
 * updated data to OUTPUT_PATH. Nothing is written when no pages were loaded.
 */
function main() {
  console.log("Loading WordPress data...")
  const data = loadWordPressData()

  console.log("Loading configuration...")
  const config = loadConfig()

  // loadWordPressData() falls back to an empty data set when the file cannot
  // be read or parsed. OUTPUT_PATH is the same file, so writing that
  // fallback back would wipe the existing content.
  if (!Array.isArray(data?.pages) || data.pages.length === 0) {
    console.error("No pages found in WordPress data; nothing written.")
    return
  }

  // Load route mapping from the page.tsx file (simplified version)
  const routeMapping = {
    "services/repairs": "vending-machine-repairs",
    "services/moving": "vending-machine-repairs",
    "services/parts": "parts-and-support",
    services: "vending-machine-repairs",
    "vending-machines": "vending-machines",
    "vending-machines/machines-we-use": "vending-machines",
    "vending-machines/machines-for-sale": "vending-machines-for-sale-in-utah",
    warehouses:
      "streamlining-snack-and-beverage-access-in-warehouse-environments",
    "auto-repair":
      "enhancing-auto-repair-facilities-with-convenient-vending-solutions",
    gyms: "vending-machine-for-your-gym",
    "community-centers": "vending-for-your-community-centers",
    "dance-studios": "vending-machine-for-your-dance-studio",
    "car-washes": "vending-machines-for-your-car-wash",
    "food-and-beverage/healthy-options": "healthy-vending",
    "food-and-beverage/traditional-options": "traditional-vending",
    "food-and-beverage/suppliers":
      "diverse-vending-options-with-rocky-mountain-vendings-exclusive-wholesale-accounts",
    "about-us": "about-us",
    "about/faqs": "faqs",
  }

  console.log(`Processing ${data.pages.length} pages...`)

  // Process every page exactly once, collecting the total as we go
  const updatedPages = []
  let totalLinks = 0
  for (const page of data.pages) {
    // linksAdded is reporting-only and must not end up in the output file
    const { linksAdded, ...pageWithoutLinksAdded } = generateLinksForPage(
      page,
      data.pages,
      config,
      routeMapping
    )
    if (linksAdded > 0) {
      console.log(`  ✓ ${page.slug}: Added ${linksAdded} link(s)`)
      totalLinks += linksAdded
    }
    updatedPages.push(pageWithoutLinksAdded)
  }

  console.log(`\nTotal links added: ${totalLinks}`)

  // Write updated data
  const updatedData = {
    ...data,
    pages: updatedPages,
  }

  // Create backup if file exists
  if (fs.existsSync(OUTPUT_PATH)) {
    const backupPath = OUTPUT_PATH + ".backup." + Date.now()
    fs.copyFileSync(OUTPUT_PATH, backupPath)
    console.log(`\nBackup created: ${backupPath}`)
  } else {
    console.log("\nNo existing file to backup (creating new file)")
  }

  // Write updated file
  fs.writeFileSync(OUTPUT_PATH, JSON.stringify(updatedData, null, 2))
  console.log(`\nUpdated file written: ${OUTPUT_PATH}`)
  console.log("\nDone!")
}
|
|
|
|
// Run main() only when this file is the script Node was started with:
// either the module URL equals the script path given on the command line,
// or that path ends with this script's file name.
{
  const scriptPath = process.argv[1]
  const startedDirectly =
    import.meta.url === `file://${scriptPath}` ||
    scriptPath?.endsWith("generate-internal-links.js")

  if (startedDirectly) {
    main()
  }
}
|
|
|
|
export { main, generateLinksForPage, loadWordPressData, loadConfig }
|