/**
 * Extracts the table of contents from AsciiDoc HTML output.
 *
 * The TOC container is looked up as `<div id="toc">`, a `<div>` whose class
 * list contains a `toc` / `tocN` token, or `<nav id="toc">` (tried in that
 * order). Its matching closing tag is found by counting nested tags of the
 * same name, so nested `<div>`s inside the TOC do not cut it short.
 *
 * @param html - HTML produced by the AsciiDoc converter; either a fragment or
 *   a full document with `<html>`, `<head>` and `<body>`.
 * @returns `toc`: the inner HTML of the TOC container with the `toctitle`
 *   div removed (empty string when no balanced TOC container is found).
 *   `contentWithoutTOC`: the input with the whole TOC container removed,
 *   reduced to the `<body>` contents when the input is a full document, with
 *   any remaining `<html>`/`<head>`/`<body>` tags stripped and trimmed.
 */
export function extractTOC(html: string): { toc: string; contentWithoutTOC: string } {
  // AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
  let tocContent = '';
  let contentWithoutTOC = html;

  // Find the start of the TOC container - try multiple patterns, most specific first
  const tocStartPatterns: readonly RegExp[] = [
    /<div\b[^>]*\sid=["']toc["'][^>]*>/i,
    // Match "toc" / "toc2" only as a whole class token, not as a substring of another class
    /<div\b[^>]*\sclass=["'](?:[^"']*\s)?toc\d*(?:\s[^"']*)?["'][^>]*>/i,
    /<nav\b[^>]*\sid=["']toc["'][^>]*>/i,
  ];

  let tocStartIdx = -1;
  let tocStartTag = '';
  for (const pattern of tocStartPatterns) {
    const match = pattern.exec(html);
    if (match !== null) {
      tocStartIdx = match.index;
      tocStartTag = match[0];
      break;
    }
  }

  if (tocStartIdx !== -1) {
    const innerStart = tocStartIdx + tocStartTag.length;
    const tagName = tocStartTag.toLowerCase().startsWith('<nav') ? 'nav' : 'div';

    // Walk every opening/closing tag of the container's element type after the
    // start tag, tracking nesting depth until the container itself is closed.
    // `\b` keeps e.g. <divider> from being counted as a <div>.
    const tagScanner = new RegExp(`<(/?)${tagName}\\b[^>]*>`, 'gi');
    tagScanner.lastIndex = innerStart;

    let depth = 1;
    let innerEnd = -1;
    let tocEndIdx = -1;
    let tag: RegExpExecArray | null;
    while ((tag = tagScanner.exec(html)) !== null) {
      if (tag[1] === '/') {
        depth -= 1;
        if (depth === 0) {
          innerEnd = tag.index;
          tocEndIdx = tagScanner.lastIndex;
          break;
        }
      } else if (!tag[0].endsWith('/>')) {
        // Self-closing tags open nothing, so they do not change the depth
        depth += 1;
      }
    }

    // Only extract when the container is balanced; otherwise leave the input untouched
    if (tocEndIdx !== -1) {
      // Slice between the real tag offsets instead of subtracting fixed
      // closing-tag lengths, so no content character can be trimmed by mistake
      tocContent = html.substring(innerStart, innerEnd).trim();

      // Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
      tocContent = tocContent.replace(/<div\b[^>]*\sid=["']toctitle["'][^>]*>.*?<\/div>\s*/gis, '');
      tocContent = tocContent.trim();

      // Remove the TOC from the content
      contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx);
    }
  }

  // Extract just the body content if the HTML includes full document structure
  // AsciiDoctor might return full HTML with <html>, <head>, <body> tags
  const isFullDocument = /^\s*(?:<!DOCTYPE|<html\b)/i.test(contentWithoutTOC);
  if (isFullDocument) {
    const bodyStartMatch = /<body\b[^>]*>/i.exec(contentWithoutTOC);
    if (bodyStartMatch !== null) {
      const bodyStart = bodyStartMatch.index + bodyStartMatch[0].length;
      // Find the closing </body> tag by searching backwards from the end
      // This is more reliable than regex for nested content
      const bodyEnd = contentWithoutTOC.lastIndexOf('</body>');
      if (bodyEnd !== -1 && bodyEnd > bodyStart) {
        contentWithoutTOC = contentWithoutTOC.substring(bodyStart, bodyEnd).trim();
      }
    }
  }

  // Remove any remaining document structure tags that might have slipped through
  // (`\b` prevents <header> from being treated as <head>)
  contentWithoutTOC = contentWithoutTOC
    .replace(/<html\b[^>]*>/gi, '')
    .replace(/<\/html>/gi, '')
    .replace(/<head\b[^>]*>[\s\S]*?<\/head>/gi, '')
    .replace(/<body\b[^>]*>/gi, '')
    .replace(/<\/body>/gi, '');

  // Clean up any extra whitespace
  contentWithoutTOC = contentWithoutTOC.trim();

  return { toc: tocContent, contentWithoutTOC };
}
/**
* Performs basic HTML sanitization to prevent XSS
*/
export function sanitizeHTML(html: string): string {
// Remove script tags and their content
html = html.replace(/