gc-parser/src/processors/html-postprocess.js

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.postProcessHtml = postProcessHtml;
const music_1 = require("./music");
/** SVG "external link" icon appended to every anchor that postProcessHtml generates itself. */
const PPH_EXTERNAL_LINK_ICON = '<svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg>';
/** Escapes both quote characters so a value can be placed inside a quoted HTML attribute. */
function pphEscapeQuotes(value) {
    return value.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
}
/** Escapes `&` and both quote characters of a raw (already unescaped) URL for an attribute. */
function pphEscapeUrlAttr(value) {
    return pphEscapeQuotes(value.replace(/&/g, '&amp;'));
}
/** Escapes `&`, `<`, `>` and (unless disabled) both quote characters of raw text content. */
function pphEscapeText(value, escapeQuotes = true) {
    const escaped = value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
    return escapeQuotes ? pphEscapeQuotes(escaped) : escaped;
}
/**
 * Reverses exactly one level of HTML escaping.
 * `&amp;` is decoded LAST: decoding it first would turn `&amp;lt;` into `&lt;`
 * and then into `<`, silently unescaping the text twice.
 */
function pphUnescape(value, { angleBrackets = true, quotes = true } = {}) {
    let result = value;
    if (angleBrackets) {
        result = result.replace(/&lt;/g, '<').replace(/&gt;/g, '>');
    }
    if (quotes) {
        result = result.replace(/&quot;/g, '"').replace(/&#39;/g, "'");
    }
    return result.replace(/&amp;/g, '&');
}
/** Renders an external anchor (new tab, trailing icon) from already-escaped parts. */
function pphExternalAnchor(escapedUrl, escapedText) {
    return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} ${PPH_EXTERNAL_LINK_ICON}</a>`;
}
/**
 * Post-processes HTML output from AsciiDoctor.
 * Converts placeholder markers and leftover AsciiDoc macros (BOOKSTR:, hashtag:,
 * WIKILINK:, link:, link:nostr:, MEDIA:) to HTML with data attributes and CSS
 * classes, then runs the OpenGraph, image, optional music, markdown-cleanup,
 * styling and ToC passes in that order.
 *
 * @param {string} html - HTML produced by AsciiDoctor.
 * @param {object} [options]
 * @param {string|function(string): string} [options.hashtagUrl] - URL template
 *   containing `{topic}`, or a function mapping a topic to a URL. When absent,
 *   hashtags are rendered as non-clickable spans.
 * @param {string|function(string): string} [options.wikilinkUrl] - URL template
 *   containing `{dtag}`, or a function mapping a d-tag to a URL. Defaults to
 *   `/events?d=<dtag>`.
 * @param {string} [options.linkBaseURL] - passed to processOpenGraphLinks.
 * @param {boolean} [options.enableMusicalNotation] - run the musical notation pass.
 * @returns {string} The post-processed HTML.
 */
function postProcessHtml(html, options = {}) {
    let processed = html;
    // Convert bookstr markers to HTML placeholders
    processed = processed.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => {
        return `<span data-bookstr="${pphEscapeQuotes(bookContent)}" class="bookstr-placeholder"></span>`;
    });
    // Convert hashtag links to HTML
    processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
        const escapedDisplay = pphEscapeText(displayText);
        if (!options.hashtagUrl) {
            // Default: a span instead of an <a> tag - styled like a link but not clickable
            return `<span class="hashtag-link">${escapedDisplay}</span>`;
        }
        // A function replacer keeps `$&`, `$1`, ... inside the topic literal
        const url = typeof options.hashtagUrl === 'function'
            ? options.hashtagUrl(normalizedHashtag)
            : options.hashtagUrl.replace(/{topic}/g, () => normalizedHashtag);
        const escapedUrl = pphEscapeQuotes(url);
        return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${normalizedHashtag.replace(/"/g, '&quot;')}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
    });
    // Convert WIKILINK:dtag|display placeholders to HTML.
    // The character classes exclude < and > so a match never spans HTML tags.
    processed = processed.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => {
        const trimmedDtag = dTag.trim();
        const escapedDtag = trimmedDtag.replace(/"/g, '&quot;');
        const escapedDisplay = pphEscapeText(displayText.trim());
        let url;
        if (!options.wikilinkUrl) {
            // Default format
            url = `/events?d=${escapedDtag}`;
        }
        else if (typeof options.wikilinkUrl === 'function') {
            url = options.wikilinkUrl(trimmedDtag);
        }
        else {
            // String template with {dtag} placeholder (function replacer: no `$` patterns)
            url = options.wikilinkUrl.replace(/{dtag}/g, () => trimmedDtag);
        }
        const escapedUrl = pphEscapeQuotes(url);
        return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
    });
    // Convert any leftover link:url[text] macros that AsciiDoctor left as plain text.
    // This MUST run before processOpenGraphLinks, which removes "link:" prefixes.
    processed = processed.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => {
        // The URL is only unescaped when it visibly carries &amp; / &quot; / &#39;
        const urlLooksEscaped = url.includes('&amp;') || url.includes('&quot;') || url.includes('&#39;');
        const rawUrl = urlLooksEscaped ? pphUnescape(url) : url;
        const rawText = pphUnescape(text);
        // Relay (ws:// / wss://) link texts get the same markup as any other link here;
        // processOpenGraphLinks is the pass that skips them for previews.
        return pphExternalAnchor(pphEscapeUrlAttr(rawUrl), pphEscapeText(rawText));
    });
    // Convert nostr: links to HTML
    processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
        const nostrType = getNostrType(bech32Id);
        const escapedId = bech32Id.replace(/"/g, '&quot;');
        if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
            // Render as embedded event placeholder
            return `<div data-embedded-note="${escapedId}" class="embedded-note-container">Loading embedded event...</div>`;
        }
        if (nostrType === 'npub' || nostrType === 'nprofile') {
            // Render as user handle
            return `<span class="user-handle" data-pubkey="${escapedId}">@${displayText}</span>`;
        }
        // Fallback to a regular link; href uses the escaped id so a quote cannot end the attribute
        return `<a href="nostr:${escapedId}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escapedId}">${displayText}</a>`;
    });
    // Process media URLs (YouTube, Spotify, video, audio)
    processed = processMedia(processed);
    // Fix double-escaped quotes in href attributes before the OpenGraph pass:
    // href="&quot;url&quot;" -> href="url"
    processed = processed.replace(/href\s*=\s*["']&quot;(https?:\/\/[^"']+)&quot;["']/gi, (_match, url) => {
        return `href="${pphEscapeQuotes(url)}"`;
    });
    // Process OpenGraph links (external links that should have rich previews)
    processed = processOpenGraphLinks(processed, options.linkBaseURL);
    // Process images: add max-width styling and data attributes
    processed = processImages(processed);
    // Process musical notation if enabled
    if (options.enableMusicalNotation) {
        processed = (0, music_1.processMusicalNotation)(processed);
    }
    // Turn escaped anchors that appear as text (&lt;a href="url"&gt;text&lt;/a&gt;)
    // back into real anchors. Quotes in the text are neither unescaped nor re-escaped.
    processed = processed.replace(/&lt;a\s+href=["'](https?:\/\/[^"']+)["']\s*&gt;([^<]+)&lt;\/a&gt;/gi, (_match, url, text) => {
        const rawUrl = pphUnescape(url, { angleBrackets: false });
        const rawText = pphUnescape(text, { quotes: false });
        return pphExternalAnchor(pphEscapeUrlAttr(rawUrl), pphEscapeText(rawText, false));
    });
    // Clean up any leftover markdown syntax
    processed = cleanupMarkdown(processed);
    // Add styling classes
    processed = addStylingClasses(processed);
    // Hide raw ToC text
    processed = hideRawTocText(processed);
    return processed;
}
/**
 * Classify a Nostr bech32 identifier by its leading prefix.
 *
 * @param {string} id - identifier such as "npub1...", "nevent1...", "note1...".
 * @returns {string|null} 'npub', 'nprofile', 'nevent', 'naddr' or 'note';
 *   null when the identifier starts with none of them.
 */
function getNostrType(id) {
    // Checked in this order; no entry is a prefix of another, so at most one can match.
    const knownPrefixes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'];
    const matchedPrefix = knownPrefixes.find((prefix) => id.startsWith(prefix));
    return matchedPrefix === undefined ? null : matchedPrefix;
}
/**
 * Replace MEDIA: placeholders with embed / player markup.
 *
 * Handles, in this order: MEDIA:youtube:<id>, MEDIA:spotify:<type>:<id>,
 * MEDIA:video:<url> and MEDIA:audio:<url>. Text that contains none of these
 * placeholders is returned unchanged.
 *
 * @param {string} html - HTML that may contain MEDIA: placeholders.
 * @returns {string} HTML with each placeholder replaced by its embed markup.
 */
function processMedia(html) {
    // HTML-escape a media URL before it is written into a src attribute
    const toSrcAttribute = (rawUrl) => rawUrl
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    // The id / type patterns below only admit [a-zA-Z0-9_-], so these values
    // can never contain a quote and are interpolated directly.
    const renderYouTube = (_whole, videoId) => `<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">
      <iframe
        style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
        src="https://www.youtube.com/embed/${videoId}"
        frameborder="0"
        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
        allowfullscreen
        loading="lazy">
      </iframe>
    </div>`;
    const renderSpotify = (_whole, kind, spotifyId) => `<div class="media-embed spotify-embed" style="margin: 1rem 0;">
      <iframe
        style="border-radius: 12px; width: 100%; max-width: 100%;"
        src="https://open.spotify.com/embed/${kind}/${spotifyId}?utm_source=generator"
        width="100%"
        height="352"
        frameborder="0"
        allowfullscreen=""
        allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"
        loading="lazy">
      </iframe>
    </div>`;
    const renderVideo = (_whole, mediaUrl) => `<div class="media-embed video-embed" style="margin: 1rem 0;">
      <video
        controls
        preload="metadata"
        style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;"
        class="media-player">
        <source src="${toSrcAttribute(mediaUrl)}" type="video/mp4">
        Your browser does not support the video tag.
      </video>
    </div>`;
    const renderAudio = (_whole, mediaUrl) => `<div class="media-embed audio-embed" style="margin: 1rem 0;">
      <audio
        controls
        preload="metadata"
        style="width: 100%; max-width: 100%;"
        class="media-player">
        <source src="${toSrcAttribute(mediaUrl)}">
        Your browser does not support the audio tag.
      </audio>
    </div>`;
    return html
        .replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, renderYouTube)
        .replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, renderSpotify)
        .replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, renderVideo)
        .replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, renderAudio);
}
/** SVG "external link" icon shown after the text of an OpenGraph link. */
const OG_EXTERNAL_LINK_ICON = '<svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg>';
/** Escapes both quote characters for use inside a quoted HTML attribute. */
function ogEscapeQuotes(value) {
    return value.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
}
/** Removes <a> wrappers from a fragment, keeping only their inner content. */
function ogStripAnchors(content) {
    return content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
}
/**
 * Builds the OpenGraph container (link + hidden preview card skeleton).
 * `escapedUrl` must already be attribute-safe; `linkHtml` is inserted as-is.
 * The icon is only appended when `linkHtml` does not already end with it, so
 * anchors that were generated with the icon do not show it twice.
 */
function ogRenderContainer(escapedUrl, linkHtml) {
    const label = linkHtml.trimEnd().endsWith(OG_EXTERNAL_LINK_ICON)
        ? linkHtml
        : `${linkHtml} ${OG_EXTERNAL_LINK_ICON}`;
    return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
      <a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${label}</a>
      <div class="opengraph-preview" data-og-loading="true" style="display: none;">
        <div class="opengraph-card">
          <div class="opengraph-image-container">
            <img class="opengraph-image" src="" alt="" style="display: none;" />
          </div>
          <div class="opengraph-content">
            <div class="opengraph-site"></div>
            <div class="opengraph-title"></div>
            <div class="opengraph-description"></div>
          </div>
        </div>
      </div>
    </span>`;
}
/**
 * Process OpenGraph links - mark external links for OpenGraph preview fetching.
 *
 * Steps: strip stray "link:" prefixes, repair corrupted href attributes, unwrap
 * anchors inside <code>/<pre> and shield those blocks behind placeholders,
 * repair corrupted preview containers, then wrap every remaining external
 * http(s) anchor in an `opengraph-link-container` span. The preview card is
 * emitted empty and hidden; this function performs no fetching.
 *
 * @param {string} html - HTML to process.
 * @param {string} [linkBaseURL] - site base URL; anchors whose host equals its
 *   host are treated as internal and left untouched.
 * @returns {string} HTML with external links wrapped in OpenGraph containers.
 */
function processOpenGraphLinks(html, linkBaseURL) {
    let processed = html;
    // Remove "link:" that appears immediately before anchor tags (most common case)
    processed = processed.replace(/link:\s*<a/gi, '<a');
    // Remove "link:" glued to preceding text (not after a quote, '>' or whitespace)
    // NOTE(review): this also rewrites prose such as "hyperlink:abc" - confirm that is acceptable.
    processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2');
    // Also handle "link:" preceded by whitespace before an anchor tag
    processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' ');
    // Fix href attributes with escaped quotes: href="&quot;url&quot;" -> href="url"
    processed = processed.replace(/href\s*=\s*["']&quot;(https?:\/\/[^"']+)&quot;["']/gi, (_match, url) => {
        return `href="${ogEscapeQuotes(url)}"`;
    });
    // href values that contain HTML fragments: keep only the leading URL part
    processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => {
        const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i);
        // If it cannot be fixed, leave it; the validation further down skips it
        return urlMatch ? `href="${ogEscapeQuotes(urlMatch[1])}"` : match;
    });
    // Links inside code / pre blocks should be plain text. The opening tag is
    // re-emitted verbatim so class and data attributes (e.g. the highlightjs
    // classes) survive this pass.
    processed = processed.replace(/(<code\b[^>]*>)([\s\S]*?)<\/code>/gi, (_match, openTag, content) => {
        return `${openTag}${ogStripAnchors(content)}</code>`;
    });
    processed = processed.replace(/(<pre\b[^>]*>)([\s\S]*?)<\/pre>/gi, (_match, openTag, content) => {
        return `${openTag}${ogStripAnchors(content)}</pre>`;
    });
    // Protect code and pre blocks by swapping them for placeholders
    const codeBlockPlaceholders = [];
    const preBlockPlaceholders = [];
    // Pre blocks first (they can contain code blocks)
    processed = processed.replace(/<pre\b[^>]*>[\s\S]*?<\/pre>/gi, (match) => {
        const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`;
        preBlockPlaceholders.push(match);
        return placeholder;
    });
    processed = processed.replace(/<code\b[^>]*>[\s\S]*?<\/code>/gi, (match) => {
        const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
        codeBlockPlaceholders.push(match);
        return placeholder;
    });
    // Host part of linkBaseURL, used to tell internal links from external ones
    let baseDomain = null;
    if (typeof linkBaseURL === 'string') {
        const baseMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
        if (baseMatch) {
            baseDomain = baseMatch[1];
        }
    }
    // Repair preview containers whose data-og-url contains HTML fragments.
    // Containers with a clean data-og-url are kept untouched.
    processed = processed.replace(/<span[^>]*data-og-url=["']([^"']+)["'][^>]*>[\s\S]*?<\/span>/gi, (match, ogUrl) => {
        if (!/[<>]/.test(ogUrl)) {
            return match; // Keep valid spans
        }
        // Take just the URL part (everything before the first < or whitespace)
        const urlMatch = ogUrl.match(/(https?:\/\/[^\s<>"']+)/i);
        if (urlMatch) {
            const cleanUrl = urlMatch[1];
            // Reuse the link text from inside the span when there is one
            const linkMatch = match.match(/<a[^>]*>(.*?)<\/a>/i);
            const linkText = linkMatch ? linkMatch[1] : cleanUrl;
            return ogRenderContainer(ogEscapeQuotes(cleanUrl), linkText);
        }
        // No usable URL: drop the corrupted span and keep its first text run
        const textMatch = match.match(/>([^<]+)</);
        return textMatch ? textMatch[1] : '';
    });
    // Wrap external http(s) anchors. Only complete, well-formed <a> tags are touched.
    processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, _before, href, _after, linkText) => {
        // The href must be a clean URL: no whitespace, quotes, tags or nested "href="
        if (!href || href.includes('href=') || !/^https?:\/\/[^\s<>"']+$/i.test(href)) {
            return match;
        }
        // Exactly one opening and one closing anchor tag, otherwise treat as corrupted
        const openATags = (match.match(/<a\s/g) || []).length;
        const closeATags = (match.match(/<\/a>/g) || []).length;
        if (openATags !== closeATags || openATags !== 1) {
            return match;
        }
        // Multiple href attributes = corrupted
        if (match.split('href="').length > 2) {
            return match;
        }
        // Skip wikilinks, nostr links and links that are already OpenGraph links.
        // Matched as class tokens because generated anchors carry several classes.
        if (/class\s*=\s*["'][^"']*\b(?:wikilink|nostr-link|opengraph-link)\b/i.test(match)) {
            return match;
        }
        // Skip media embeds and embedded notes
        if (match.includes('data-embedded-note') ||
            match.includes('youtube-embed') ||
            match.includes('spotify-embed') ||
            match.includes('media-embed') ||
            match.includes('opengraph-link-container')) {
            return match;
        }
        // Skip if it's a media file URL
        if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) {
            return match;
        }
        // Skip if it's YouTube or Spotify (already handled as media)
        if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) {
            return match;
        }
        // Relay URLs (ws:// or wss:// in the link text) are not web pages: no preview
        if (/wss?:\/\//i.test(linkText)) {
            return match;
        }
        // Same host as linkBaseURL means internal: leave untouched
        if (baseDomain) {
            const hrefMatch = href.match(/^https?:\/\/([^\/]+)/);
            if (hrefMatch && hrefMatch[1] === baseDomain) {
                return match;
            }
        }
        // href comes from an HTML attribute, so entities such as &amp; are already
        // encoded; only bare ampersands are escaped to avoid producing &amp;amp;
        const escapedUrl = ogEscapeQuotes(href.replace(/&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#[xX][0-9a-fA-F]+);)/g, '&amp;'));
        // The actual OpenGraph fetching is left to client-side JavaScript
        return ogRenderContainer(escapedUrl, linkText);
    });
    // Restore protected blocks. Function replacers are required: with a string
    // replacement, sequences such as `$&` or `$$` inside code would be interpreted.
    codeBlockPlaceholders.forEach((codeBlock, index) => {
        processed = processed.replace(`__CODEBLOCK_${index}__`, () => codeBlock);
    });
    preBlockPlaceholders.forEach((preBlock, index) => {
        processed = processed.replace(`__PREBLOCK_${index}__`, () => preBlock);
    });
    return processed;
}
/**
 * Process images: add max-width styling and data attributes.
 *
 * Every <img> that has a non-empty quoted `src` receives the sizing / zoom
 * classes plus `data-asciidoc-image`, `data-image-index` and `data-image-src`.
 * `data-image-index` is the position of the image's src among the distinct
 * src values of the document, in order of first appearance. Tags without a
 * usable src are returned unchanged.
 *
 * @param {string} html - HTML to process.
 * @returns {string} HTML with decorated <img> tags.
 */
function processImages(html) {
    // Anchored on start / whitespace so `data-src` or `data-image-src` is never
    // mistaken for the real `src` attribute. Used by both passes so the index
    // lookup below can never miss.
    const srcPattern = /(?:^|\s)src=["']([^"']+)["']/i;
    // Pass 1: index distinct image URLs in order of first appearance
    const indexBySrc = new Map();
    const imgTagPattern = /<img([^>]+)>/gi;
    let tagMatch;
    while ((tagMatch = imgTagPattern.exec(html)) !== null) {
        const found = tagMatch[1].match(srcPattern);
        if (found && !indexBySrc.has(found[1])) {
            indexBySrc.set(found[1], indexBySrc.size);
        }
    }
    // Pass 2: rewrite each tag
    return html.replace(/<img([^>]+)>/gi, (imgTag, attributes) => {
        const srcMatch = attributes.match(srcPattern);
        if (!srcMatch)
            return imgTag;
        const src = srcMatch[1];
        // For self-closing tags (<img ... />) the slash is taken off first and put
        // back at the very end; otherwise new attributes would land after the "/".
        const selfClosing = /(?:^|["'\s])\/\s*$/.test(attributes);
        let updatedAttributes = selfClosing ? attributes.replace(/\s*\/\s*$/, '') : attributes;
        if (updatedAttributes.match(/class=["']/i)) {
            updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => {
                // Drop any existing max-w-* utility before adding ours
                const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
                const newClasses = cleanedClasses
                    ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
                    : 'max-w-[400px] object-contain cursor-zoom-in';
                return `class="${newClasses}"`;
            });
        }
        else {
            updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`;
        }
        updatedAttributes += ` data-asciidoc-image="true" data-image-index="${indexBySrc.get(src)}" data-image-src="${src.replace(/"/g, '&quot;')}"`;
        return `<img${updatedAttributes}${selfClosing ? ' /' : ''}>`;
    });
}
/**
 * Query parameter names that cleanUrl removes by exact name.
 * (Names starting with `utm_` or `_` are additionally removed by prefix.)
 */
const TRACKING_QUERY_PARAMS = [
    // Google Analytics & Ads
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
    'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
    // Facebook
    'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
    // Twitter/X
    'twclid', 'twsrc',
    // Microsoft/Bing
    'msclkid', 'mc_cid', 'mc_eid',
    // Adobe
    'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
    // HubSpot
    'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
    // Marketo
    'mkt_tok',
    // YouTube
    'si', 'feature', 'kw', 'pp',
    // Other common tracking
    'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
    'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
    // Mobile app tracking
    'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
    // Amazon
    'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
    // Affiliate tracking
    'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
    // Social media share tracking
    'share', 'shared', 'sharesource'
];
/**
 * Strip tracking parameters from a URL.
 * Based on jumble's cleanUrl function.
 *
 * @param {string} url - absolute URL to clean.
 * @returns {string} The serialized URL without tracking parameters, or the
 *   input unchanged when it cannot be parsed as a URL.
 */
function cleanUrl(url) {
    let target;
    try {
        target = new URL(url);
    }
    catch {
        // Not parseable as an absolute URL: hand the input back untouched
        return url;
    }
    const query = target.searchParams;
    // Exact-name removals. These calls are made even when the parameter is
    // absent, exactly as before.
    for (const name of TRACKING_QUERY_PARAMS) {
        query.delete(name);
    }
    // Prefix removals; iterate over a snapshot because deleting mutates the list
    for (const key of [...query.keys()]) {
        if (key.startsWith('utm_') || key.startsWith('_')) {
            query.delete(key);
        }
    }
    return target.toString();
}
/**
 * Escapes ampersands that do not already start a character reference.
 * The input is HTML, so `&amp;`, `&#8217;`, `&quot;` ... must be left alone;
 * escaping them again would show the entity source to the reader.
 */
function mdEscapeBareAmpersands(value) {
    return value.replace(/&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#[xX][0-9a-fA-F]+);)/g, '&amp;');
}
/**
 * Turns a URL taken from HTML text into an attribute-safe, tracking-free URL.
 * `&amp;` is decoded BEFORE the URL is parsed: otherwise the parser sees
 * parameters named `amp;utm_source` etc., keeps the tracking parameters and
 * percent-encodes the stray `;`.
 */
function mdCleanUrlForAttribute(url) {
    return cleanUrl(url.replace(/&amp;/g, '&'))
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}
/**
 * Clean up leftover markdown syntax.
 *
 * Converts `![alt](url)` to an <img> and `[text](url)` to an external <a>.
 * A markdown link is left as-is when its URL already occurs in an href in the
 * document, or when its text contains `&lt;`, `&gt;` or `&amp;`.
 *
 * @param {string} html - HTML that may still contain markdown image/link syntax.
 * @returns {string} HTML with the remaining markdown converted.
 */
function cleanupMarkdown(html) {
    let cleaned = html;
    // Clean up markdown image syntax
    cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
        // The alt text goes into a quoted attribute, so quotes and angle brackets are escaped
        const altText = mdEscapeBareAmpersands(alt || '')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;');
        return `<img src="${mdCleanUrlForAttribute(url)}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`;
    });
    // Clean up markdown link syntax
    cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
        // `cleaned` still holds the text from before this replace call.
        // Skip if an anchor with this URL already exists (raw or quote-escaped form)
        const quoteEscapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
        const alreadyLinked = [url, quoteEscapedUrl].some((candidate) => cleaned.includes(`href="${candidate}"`) || cleaned.includes(`href='${candidate}'`));
        if (alreadyLinked) {
            return _match;
        }
        // Skip if the text contains these entities - it looks like escaped HTML
        if (text.includes('&lt;') || text.includes('&gt;') || text.includes('&amp;')) {
            return _match;
        }
        // Remaining `&` in the text can only start other entities (e.g. &#8217;),
        // which must not be escaped a second time
        const escapedText = mdEscapeBareAmpersands(text)
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
        return `<a href="${mdCleanUrlForAttribute(url)}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
    });
    return cleaned;
}
/**
 * Attach the presentation classes expected by the stylesheet.
 *
 * Rewrites, in order: strikethrough, subscript and superscript spans (only
 * those whose content contains no nested tag), then any `highlightjs*` class
 * list on <pre> and <code> tags, which is replaced by `highlightjs hljs`.
 *
 * @param {string} html - HTML to decorate.
 * @returns {string} HTML with the additional classes applied.
 */
function addStylingClasses(html) {
    const rewrites = [
        // Strikethrough
        [/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>'],
        // Subscript
        [/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>'],
        // Superscript
        [/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>'],
        // Code highlighting
        [/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">'],
        [/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">'],
    ];
    return rewrites.reduce((markup, [pattern, replacement]) => markup.replace(pattern, replacement), html);
}
/**
 * Remove raw AsciiDoc ToC text that leaked into the output.
 *
 * Deletes single-line headings and paragraphs containing
 * "Table of Contents" followed by a parenthesised number, and paragraphs
 * containing "Assumptions" followed by "[n=0]". Matching is case-insensitive.
 *
 * @param {string} html - HTML to clean.
 * @returns {string} HTML without those elements.
 */
function hideRawTocText(html) {
    const rawTocPatterns = [
        /<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi,
        /<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi,
        /<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi,
    ];
    let remaining = html;
    for (const pattern of rawTocPatterns) {
        remaining = remaining.replace(pattern, '');
    }
    return remaining;
}