/**
 * Opening tags that may hold the table of contents, in order of preference:
 * a `<div>`/`<nav>` with `id="toc"`, then one whose `class` list contains the
 * token `toc`.
 */
const TOC_START_PATTERNS: readonly RegExp[] = [
  /<(?:div|nav)(?=[\s>])[^>]*?(?<![\w-])id\s*=\s*["']toc["'][^>]*>/i,
  /<(?:div|nav)(?=[\s>])[^>]*?(?<![\w-])class\s*=\s*["'](?:[^"']*\s)?toc(?:\s[^"']*)?["'][^>]*>/i,
];

/** The `<div id="toctitle">...</div>` heading AsciiDoc puts inside the TOC. */
const TOC_TITLE_PATTERN =
  /<div(?=[\s>])[^>]*?(?<![\w-])id\s*=\s*["']toctitle["'][^>]*>.*?<\/div\s*>\s*/gis;

/**
 * Finds the closing tag that balances a `<div>`/`<nav>` container whose
 * content starts at `from`, counting nested div/nav elements on the way.
 *
 * @returns the offset where the closing tag starts and the offset just past
 *   it, or `undefined` when the markup is unbalanced.
 */
function findContainerEnd(
  html: string,
  from: number,
): { closeStart: number; end: number } | undefined {
  // The lookahead keeps names that merely start with "div"/"nav"
  // (e.g. <divider>) out of the depth count.
  const tagPattern = /<(\/?)(?:div|nav)(?=[\s\/>])[^>]*>/gi;
  tagPattern.lastIndex = from;

  let depth = 1;
  for (let tag = tagPattern.exec(html); tag !== null; tag = tagPattern.exec(html)) {
    if (tag[1] === '/') {
      depth--;
      if (depth === 0) {
        return { closeStart: tag.index, end: tag.index + tag[0].length };
      }
    } else if (!tag[0].endsWith('/>')) {
      // Self-closing tags do not open a new nesting level.
      depth++;
    }
  }
  return undefined;
}

/**
 * Reduces a full HTML document to the inner content of its `<body>` and drops
 * any leftover `<html>`, `<head>...</head>` and `<body>` tags. Fragments
 * without document structure are returned trimmed but otherwise unchanged.
 */
function stripDocumentWrapper(html: string): string {
  let content = html;

  if (/^\s*(?:<!DOCTYPE|<html)/i.test(content)) {
    const bodyOpen = /<body(?=[\s>])[^>]*>/i.exec(content);
    if (bodyOpen) {
      const bodyStart = bodyOpen.index + bodyOpen[0].length;

      // Use the LAST closing tag so nested/quoted "</body>" text earlier in
      // the document cannot truncate the content; match case-insensitively.
      const bodyClose = /<\/body\s*>/gi;
      let bodyEnd = -1;
      for (let m = bodyClose.exec(content); m !== null; m = bodyClose.exec(content)) {
        bodyEnd = m.index;
      }

      if (bodyEnd >= bodyStart) {
        content = content.substring(bodyStart, bodyEnd).trim();
      }
    }
  }

  // Remove structure tags that slipped through. The (?=[\s>]) guards stop
  // <head ...> from also matching <header ...>.
  return content
    .replace(/<html(?=[\s>])[^>]*>/gi, '')
    .replace(/<\/html\s*>/gi, '')
    .replace(/<head(?=[\s>])[^>]*>[\s\S]*?<\/head\s*>/gi, '')
    .replace(/<body(?=[\s>])[^>]*>/gi, '')
    .replace(/<\/body\s*>/gi, '')
    .trim();
}

/**
 * Extracts the table of contents from AsciiDoc HTML output.
 *
 * The TOC container is the first `<div>`/`<nav>` carrying `id="toc"` (or,
 * failing that, the class token `toc`). Its inner HTML, minus the
 * "toctitle" heading, is returned as `toc`; the input with the whole
 * container removed is returned as `contentWithoutTOC`.
 *
 * When a TOC container is located, `contentWithoutTOC` is additionally
 * reduced to the document body (full `<html>` documents are unwrapped).
 * When no TOC container exists the input is returned untouched.
 *
 * @param html - HTML produced by the AsciiDoc renderer.
 * @returns the TOC inner HTML (empty string if none was found or the
 *   container is unbalanced) and the remaining content.
 */
export function extractTOC(html: string): { toc: string; contentWithoutTOC: string } {
  let tocStartIdx = -1;
  let tocStartTag = '';
  for (const pattern of TOC_START_PATTERNS) {
    const match = pattern.exec(html);
    if (match) {
      tocStartIdx = match.index;
      tocStartTag = match[0];
      break;
    }
  }

  if (tocStartIdx === -1) {
    // No TOC found: hand the input back exactly as received.
    return { toc: '', contentWithoutTOC: html };
  }

  const innerStart = tocStartIdx + tocStartTag.length;
  const bounds = findContainerEnd(html, innerStart);

  let toc = '';
  let contentWithoutTOC = html;
  if (bounds) {
    // Slice up to where the closing tag starts, so the result is correct for
    // both </div> and </nav> regardless of the closing tag's exact length.
    toc = html
      .substring(innerStart, bounds.closeStart)
      .trim()
      .replace(TOC_TITLE_PATTERN, '')
      .trim();
    contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(bounds.end);
  }

  return { toc, contentWithoutTOC: stripDocumentWrapper(contentWithoutTOC) };
}

/** A complete `<script>...</script>` element, including its content. */
const SCRIPT_ELEMENT = /<script(?=[\s\/>])[^>]*>[\s\S]*?<\/script(?=[\s\/>])[^>]*>/gi;

/** An opening tag; quoted attribute values may contain `>`. */
const OPENING_TAG = /<[a-z][^\s\/>]*(?:"[^"]*"|'[^']*'|[^"'>])*>/gi;

/**
 * One attribute inside an opening tag: leading whitespace, the name
 * (group 1) and, when present, `=value` (group 2). Values are consumed whole,
 * so text inside another attribute's value is never mistaken for a name.
 */
const TAG_ATTRIBUTE =
  /\s*([^\s"'<>\/=]+)(\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>](?:"[^"]*"|'[^']*'|[^\s"'>])*))?/g;

/**
 * A quoted `on...="..."` handler anywhere in the text. The lookbehind stops
 * it from firing in the middle of a name such as `content` or `data-onfoo`.
 */
const QUOTED_EVENT_HANDLER = /\s*(?<![\w-])on\w+\s*=\s*(?:"[^"]*"|'[^']*')/gi;

/** Runs every sanitization rule exactly once. Each rule only deletes text. */
function sanitizeOnce(html: string): string {
  return html
    .replace(SCRIPT_ELEMENT, '')
    .replace(OPENING_TAG, (tag: string) =>
      tag.replace(TAG_ATTRIBUTE, (attr: string, name: string, value?: string) =>
        value !== undefined && /^on\w/i.test(name) ? '' : attr,
      ),
    )
    .replace(QUOTED_EVENT_HANDLER, '')
    .replace(/javascript:/gi, '')
    .replace(/data:\s*text\/html/gi, '');
}

/**
 * Performs basic HTML sanitization to reduce XSS risk.
 *
 * Removes `<script>` elements, inline `on*` event-handler attributes (quoted
 * or unquoted), `javascript:` and `data:text/html`. The rules are re-applied
 * until the text stops changing, so removing one fragment cannot splice a new
 * dangerous one together (e.g. `<scr<script></script>ipt>`).
 *
 * NOTE(security): this is a best-effort regex filter, not a complete
 * sanitizer. Entity-encoded or whitespace-obfuscated payloads and other
 * vectors (`<iframe>`, `srcdoc`, CSS, ...) are not covered. Input that is
 * truly untrusted should go through a vetted, parser-based sanitizer.
 *
 * @param html - HTML to clean.
 * @returns the cleaned HTML.
 */
export function sanitizeHTML(html: string): string {
  let current = html;
  // Every rule only deletes characters, so each changed pass is strictly
  // shorter than the last and the loop always terminates.
  for (;;) {
    const next = sanitizeOnce(current);
    if (next === current) {
      return current;
    }
    current = next;
  }
}

/** Minimal shape of the WHATWG `URL` class used by this module. */
type UrlConstructor = new (input: string) => { hostname: string };

/** A quoted attribute value, used to blank values before attribute checks. */
const QUOTED_VALUE = /"[^"]*"|'[^']*'/g;

/** A quoted `target` attribute (but not `data-target` and similar). */
const TARGET_ATTR = /\s*(?<![\w-])target\s*=\s*(?:"[^"]*"|'[^']*')/gi;

/** A quoted `rel` attribute (but not `data-rel`); group 1 is its value. */
const REL_ATTR = /(?<![\w-])rel\s*=\s*["']([^"']*)["']/gi;

/** Returns the global `URL` constructor when the runtime provides one. */
function getUrlConstructor(): UrlConstructor | undefined {
  // `URL` is read through globalThis so the file compiles without DOM or
  // Node typings.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const candidate: unknown = (globalThis as any).URL;
  return typeof candidate === 'function' ? (candidate as UrlConstructor) : undefined;
}

/**
 * Returns the lower-cased host name of `url` without port or credentials.
 * Falls back to plain string parsing when `URL` is unavailable, throws, or
 * yields an empty host (as it does for scheme-less input like
 * `localhost:3000`).
 */
function hostnameOf(url: string): string {
  const URLConstructor = getUrlConstructor();
  if (URLConstructor) {
    try {
      const hostname = new URLConstructor(url).hostname;
      if (hostname) {
        return hostname.toLowerCase();
      }
    } catch {
      // Not parseable as an absolute URL; use the string fallback below.
    }
  }

  const withoutScheme = url.replace(/^[a-z][a-z0-9+.-]*:\/\//i, '');
  const authority = withoutScheme.split(/[\/?#]/, 1)[0] || '';
  const host = authority.substring(authority.lastIndexOf('@') + 1);
  return host.replace(/:\d*$/, '').toLowerCase();
}

/** Inserts `attributes` just before the closing `>` of an opening tag. */
function appendAttributes(tag: string, attributes: string): string {
  return tag.slice(0, -1) + attributes + '>';
}

/**
 * Processes HTML links to add target="_blank" to external links.
 * This function is available for use but not currently called automatically.
 * It can be used in post-processing if needed.
 *
 * - Links whose href starts with `http://`/`https://` and whose host differs
 *   from the host of `linkBaseURL` get `target="_blank"` plus
 *   `rel="noopener noreferrer"`, unless they already carry a `target`.
 * - Same-host and relative/local links have any `target` attribute removed so
 *   they open in the current tab. Protocol-relative (`//host/...`) hrefs are
 *   treated as local.
 *
 * @param html - HTML containing `<a href="...">` tags with quoted hrefs.
 * @param linkBaseURL - URL (or bare host) of the site itself; pass an empty
 *   string to treat every absolute http(s) link as external.
 * @returns the HTML with link attributes adjusted.
 */
export function processLinks(html: string, linkBaseURL: string): string {
  const baseHost = linkBaseURL ? hostnameOf(linkBaseURL) : '';

  // Regex to match <a> tags with a quoted href attribute (group 1: the href).
  const linkPattern = /<a\s+[^>]*?(?<![\w-])href\s*=\s*["']([^"']+)["'][^>]*>/gi;

  return html.replace(linkPattern, (tag: string, href: string) => {
    const isAbsolute = /^https?:\/\//i.test(href);
    const isSameSite = isAbsolute && baseHost !== '' && hostnameOf(href) === baseHost;

    if (!isAbsolute || isSameSite) {
      // Local or same-site link: make sure it opens in the same tab.
      return tag.replace(TARGET_ATTR, '');
    }

    // Blank out attribute values first so text inside href (for example
    // "?target=1") is not mistaken for an attribute.
    const skeleton = tag.replace(QUOTED_VALUE, '""');
    if (/(?<![\w-])target\s*=/i.test(skeleton)) {
      // Respect an explicit target chosen by the author.
      return tag;
    }

    if (!/(?<![\w-])rel\s*=/i.test(skeleton)) {
      return appendAttributes(tag, ' target="_blank" rel="noopener noreferrer"');
    }

    // Extend an existing rel attribute without duplicating tokens.
    const withRel = tag.replace(REL_ATTR, (relAttr: string, relValue: string) => {
      const tokens = relValue.split(/\s+/).filter((token) => token !== '');
      const lowered = tokens.map((token) => token.toLowerCase());
      if (lowered.includes('noopener')) {
        return relAttr;
      }
      tokens.push('noopener');
      if (!lowered.includes('noreferrer')) {
        tokens.push('noreferrer');
      }
      return `rel="${tokens.join(' ')}"`;
    });
    return appendAttributes(withRel, ' target="_blank"');
  });
}