You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

244 lines
8.6 KiB

/**
* Extracts the table of contents from AsciiDoc HTML output
* Returns the TOC HTML and the content HTML without the TOC
*/
export function extractTOC(html: string): { toc: string; contentWithoutTOC: string } {
// AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
let tocContent = '';
let contentWithoutTOC = html;
// Find the start of the TOC div - try multiple patterns
const tocStartPatterns = [
/<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>/i,
/<div\s+id=["']toc["'][^>]*>/i,
/<div\s+class=["']toc["'][^>]*>/i,
/<nav\s+id=["']toc["'][^>]*>/i,
];
let tocStartIdx = -1;
let tocStartTag = '';
for (const pattern of tocStartPatterns) {
const match = html.match(pattern);
if (match && match.index !== undefined) {
tocStartIdx = match.index;
tocStartTag = match[0];
break;
}
}
if (tocStartIdx === -1) {
// No TOC found
return { toc: '', contentWithoutTOC: html };
}
// Find the matching closing tag by counting div/nav tags
const searchStart = tocStartIdx + tocStartTag.length;
let depth = 1;
let i = searchStart;
while (i < html.length && depth > 0) {
// Look for opening or closing div/nav tags
if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<div') {
// Check if it's a closing tag
if (i + 5 < html.length && html[i + 4] === '/') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
i = closeIdx + 1;
} else {
// Opening tag - find the end (handle attributes and self-closing)
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
// Check if it's self-closing (look for /> before the >)
const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++;
}
i = closeIdx + 1;
}
} else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</div') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
i = closeIdx + 1;
} else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</nav') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
i = closeIdx + 1;
} else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<nav') {
// Handle opening nav tags
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++;
}
i = closeIdx + 1;
} else {
i++;
}
}
if (depth === 0) {
// Found the matching closing tag
const tocEndIdx = i;
// Extract the TOC content (inner HTML)
const tocFullHTML = html.substring(tocStartIdx, tocEndIdx);
// Extract just the inner content (without the outer div tags)
let innerStart = tocStartTag.length;
let innerEnd = tocFullHTML.length;
// Find the last </div> or </nav>
if (tocFullHTML.endsWith('</div>')) {
innerEnd -= 6;
} else if (tocFullHTML.endsWith('</nav>')) {
innerEnd -= 7;
}
tocContent = tocFullHTML.substring(innerStart, innerEnd).trim();
// Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
tocContent = tocContent.replace(/<div\s+id=["']toctitle["'][^>]*>.*?<\/div>\s*/gis, '');
tocContent = tocContent.trim();
// Remove the TOC from the content
contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx);
}
// Extract just the body content if the HTML includes full document structure
// AsciiDoctor might return full HTML with <html>, <head>, <body> tags
// Check if this is a full HTML document
const isFullDocument = /^\s*<!DOCTYPE|^\s*<html/i.test(contentWithoutTOC);
if (isFullDocument) {
// Extract body content using a more robust approach
// Find the opening <body> tag
const bodyStartMatch = contentWithoutTOC.match(/<body[^>]*>/i);
if (bodyStartMatch && bodyStartMatch.index !== undefined) {
const bodyStart = bodyStartMatch.index + bodyStartMatch[0].length;
// Find the closing </body> tag by searching backwards from the end
// This is more reliable than regex for nested content
const bodyEndMatch = contentWithoutTOC.lastIndexOf('</body>');
if (bodyEndMatch !== -1 && bodyEndMatch > bodyStart) {
contentWithoutTOC = contentWithoutTOC.substring(bodyStart, bodyEndMatch).trim();
}
}
}
// Remove any remaining document structure tags that might have slipped through
contentWithoutTOC = contentWithoutTOC
.replace(/<html[^>]*>/gi, '')
.replace(/<\/html>/gi, '')
.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '')
.replace(/<body[^>]*>/gi, '')
.replace(/<\/body>/gi, '');
// Clean up any extra whitespace
contentWithoutTOC = contentWithoutTOC.trim();
return { toc: tocContent, contentWithoutTOC };
}
/**
* Performs basic HTML sanitization to prevent XSS
*/
export function sanitizeHTML(html: string): string {
// Remove script tags and their content
html = html.replace(/<script[^>]*>.*?<\/script>/gis, '');
// Remove event handlers (onclick, onerror, etc.)
html = html.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, '');
// Remove javascript: protocol in links
html = html.replace(/javascript:/gi, '');
// Remove data: URLs that could be dangerous
html = html.replace(/data:\s*text\/html/gi, '');
return html;
}
/**
* Processes HTML links to add target="_blank" to external links
* This function is available for use but not currently called automatically.
* It can be used in post-processing if needed.
*/
export function processLinks(html: string, linkBaseURL: string): string {
// Extract domain from linkBaseURL for comparison
let linkBaseDomain = '';
if (linkBaseURL) {
try {
// Use URL constructor if available (Node.js 10+)
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const URLConstructor = (globalThis as any).URL;
if (URLConstructor) {
const url = new URLConstructor(linkBaseURL);
linkBaseDomain = url.hostname;
} else {
throw new Error('URL not available');
}
} catch {
// Fallback to simple string parsing if URL constructor fails
const url = linkBaseURL.replace(/^https?:\/\//, '');
const parts = url.split('/');
if (parts.length > 0) {
linkBaseDomain = parts[0];
}
}
}
// Regex to match <a> tags with href attributes
const linkRegex = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g;
return html.replace(linkRegex, (match, before, href, after) => {
// Check if it's an external link (starts with http:// or https://)
const isExternal = href.startsWith('http://') || href.startsWith('https://');
if (isExternal) {
// Check if it's pointing to our own domain
if (linkBaseDomain) {
try {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const URLConstructor = (globalThis as any).URL;
if (URLConstructor) {
const hrefUrl = new URLConstructor(href);
if (hrefUrl.hostname === linkBaseDomain) {
// Same domain - open in same tab (remove any existing target attribute)
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
} else {
throw new Error('URL not available');
}
} catch {
// If URL parsing fails, use simple string check
if (href.includes(linkBaseDomain)) {
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
}
}
// External link - add target="_blank" and rel="noopener noreferrer" if not already present
if (!match.includes('target=')) {
if (!match.includes('rel=')) {
return match.replace('>', ' target="_blank" rel="noopener noreferrer">');
} else {
// Update existing rel attribute to include noopener if not present
const updatedMatch = match.replace(/rel\s*=\s*["']([^"']*)["']/gi, (relMatch, relValue) => {
if (!relValue.includes('noopener')) {
return `rel="${relValue} noopener noreferrer"`;
}
return relMatch;
});
return updatedMatch.replace('>', ' target="_blank">');
}
}
} else {
// Local/relative link - ensure it opens in same tab (remove target if present)
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
return match;
});
}