You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

392 lines
12 KiB

package asciidoc
import (
	"bytes"
	"fmt"
	"net/url"
	"os/exec"
	"regexp"
	"strings"
	"unicode"
)
// Processor renders AsciiDoc into sanitized HTML, rewriting wiki-style
// and nostr: links so they resolve against a configured base URL.
type Processor struct {
	linkBaseURL string // root URL used when rewriting [[wikilinks]] and nostr: references
}

// ProcessResult carries the two halves of a rendered document: the body
// HTML and the table of contents extracted from it.
type ProcessResult struct {
	Content         string
	TableOfContents string
}

// NewProcessor returns a Processor whose rewritten links are rooted at
// linkBaseURL.
func NewProcessor(linkBaseURL string) *Processor {
	p := &Processor{linkBaseURL: linkBaseURL}
	return p
}
// Process converts AsciiDoc content to HTML with link rewriting.
// It returns the rendered body HTML together with the table of contents
// extracted from it.
func (p *Processor) Process(asciidocContent string) (*ProcessResult, error) {
	// Rewrite wikilinks and nostr: references before handing the source
	// to asciidoctor, so they render as ordinary hyperlinks.
	rewritten := p.rewriteLinks(asciidocContent)

	rawHTML, err := p.convertToHTML(rewritten)
	if err != nil {
		return nil, fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err)
	}

	// Split the TOC out of the rendered document, then run both halves
	// through the same post-processing: XSS sanitization followed by
	// link normalization (external links open in a new tab, local links
	// stay in the same tab).
	toc, body := p.extractTOC(rawHTML)

	result := &ProcessResult{
		Content:         p.processLinks(p.sanitizeHTML(body)),
		TableOfContents: p.processLinks(p.sanitizeHTML(toc)),
	}
	return result, nil
}
// Link-rewriting patterns, compiled once at package scope so rewriteLinks
// does not pay a regexp compile on every call.
var (
	// [[target]] or [[target|display text]]
	wikilinkRegex = regexp.MustCompile(`\[\[([^\]]+)\]\]`)
	// nostr:naddr1... or nostr:nevent1...
	nostrLinkRegex = regexp.MustCompile(`nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+)`)
)

// rewriteLinks rewrites wikilinks and nostr: links in AsciiDoc content.
//
// [[target]] / [[target|display]] become AsciiDoc link: macros pointing at
// <linkBaseURL>/events?d=<normalized-d-tag>; nostr:naddr1.../nostr:nevent1...
// references become bare <linkBaseURL>/events?id=<id> URLs (AsciiDoc
// auto-links bare URLs).
func (p *Processor) rewriteLinks(content string) string {
	content = wikilinkRegex.ReplaceAllStringFunc(content, func(match string) string {
		// Strip the surrounding [[ ]] delimiters.
		inner := match[2 : len(match)-2]
		var target, display string
		if strings.Contains(inner, "|") {
			parts := strings.SplitN(inner, "|", 2)
			target = strings.TrimSpace(parts[0])
			display = strings.TrimSpace(parts[1])
		} else {
			target = strings.TrimSpace(inner)
			display = target
		}
		// Normalize the d tag (lowercase, whitespace to hyphens, etc.).
		normalized := normalizeDTag(target)
		link := fmt.Sprintf("%s/events?d=%s", p.linkBaseURL, normalized)
		return fmt.Sprintf("link:%s[%s]", link, display)
	})
	content = nostrLinkRegex.ReplaceAllStringFunc(content, func(match string) string {
		nostrID := strings.TrimPrefix(match, "nostr:")
		return fmt.Sprintf("%s/events?id=%s", p.linkBaseURL, nostrID)
	})
	return content
}
// normalizeDTag normalizes a d tag according to NIP-54 rules:
// lowercase, whitespace converted to hyphens, punctuation/symbols removed
// (alphanumerics, hyphens, and non-ASCII runes are kept), consecutive
// hyphens collapsed, and leading/trailing hyphens trimmed.
//
// Fix over the previous version: ALL Unicode whitespace (including '\r',
// which matters for Windows line endings) is converted to a hyphen; before,
// only ' ', '\t', and '\n' were converted and other whitespace was dropped.
func normalizeDTag(dTag string) string {
	dTag = strings.ToLower(dTag)

	// Single pass: emit hyphens for whitespace/hyphen runs (collapsed as we
	// go), keep allowed runes, and drop everything else.
	var b strings.Builder
	b.Grow(len(dTag))
	prevHyphen := false
	for _, r := range dTag {
		switch {
		case unicode.IsSpace(r) || r == '-':
			if !prevHyphen {
				b.WriteRune('-')
				prevHyphen = true
			}
		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r > 127:
			b.WriteRune(r)
			prevHyphen = false
		}
		// Any other rune (ASCII punctuation/symbols) is silently dropped.
	}

	// Remove leading and trailing hyphens.
	return strings.Trim(b.String(), "-")
}
// convertToHTML converts AsciiDoc to HTML by running asciidoctor.js under
// Node.js (@asciidoctor/core must be resolvable by node). The AsciiDoc
// source is fed via stdin so special characters never need command-line
// escaping; the rendered HTML is read from stdout.
func (p *Processor) convertToHTML(asciidocContent string) (string, error) {
	// Resolve the node binary up front. LookPath gives a clear "not found"
	// error without spawning a throwaway `node --version` process, which the
	// previous implementation did on every conversion.
	nodePath, err := exec.LookPath("node")
	if err != nil {
		return "", fmt.Errorf("node.js not found: %w", err)
	}

	// JavaScript driver: buffer all of stdin, convert, write HTML to stdout.
	// Any conversion error goes to stderr with a non-zero exit code.
	jsCode := `
const asciidoctor = require('@asciidoctor/core')();
let content = '';
process.stdin.setEncoding('utf8');
process.stdin.on('data', (chunk) => {
  content += chunk;
});
process.stdin.on('end', () => {
  try {
    const html = asciidoctor.convert(content, {
      safe: 'safe',
      backend: 'html5',
      doctype: 'article',
      attributes: {
        'showtitle': true,
        'icons': 'font',
        'sectanchors': true,
        'sectlinks': true,
        'toc': 'left',
        'toclevels': 3
      }
    });
    process.stdout.write(html);
  } catch (error) {
    console.error('Error converting AsciiDoc:', error.message);
    process.exit(1);
  }
});
`
	cmd := exec.Command(nodePath, "-e", jsCode)
	cmd.Stdin = strings.NewReader(asciidocContent)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("asciidoctor.js conversion failed: %w, stderr: %s", err, stderr.String())
	}
	return stdout.String(), nil
}
// Sanitization patterns, compiled once at package scope (the previous
// version recompiled them on every call).
var (
	// (?s) is the critical fix: without it `.` does not match newlines, so a
	// <script> element whose body spanned multiple lines survived
	// sanitization entirely — a real XSS hole.
	scriptTagRegex = regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)
	// Inline event handlers such as onclick="..." / onerror='...'.
	eventHandlerAttrRegex = regexp.MustCompile(`(?i)\s*on\w+\s*=\s*["'][^"']*["']`)
	// javascript: URL scheme.
	javascriptSchemeRegex = regexp.MustCompile(`(?i)javascript:`)
	// data:text/html URLs, which can smuggle executable markup.
	dataHTMLURLRegex = regexp.MustCompile(`(?i)data:\s*text/html`)
)

// sanitizeHTML performs basic HTML sanitization to prevent XSS.
// Note: this is a basic, regex-based implementation. For production,
// consider a proper HTML sanitizer library (e.g. bluemonday).
func (p *Processor) sanitizeHTML(html string) string {
	html = scriptTagRegex.ReplaceAllString(html, "")
	html = eventHandlerAttrRegex.ReplaceAllString(html, "")
	html = javascriptSchemeRegex.ReplaceAllString(html, "")
	html = dataHTMLURLRegex.ReplaceAllString(html, "")
	return html
}
// extractTOC extracts the table of contents from AsciiDoc HTML output.
// It returns the TOC inner HTML and the content HTML with the TOC removed;
// when no TOC is found it returns "" and the input unchanged.
//
// Asciidoctor with toc=left emits the TOC in a <div id="toc" class="toc">
// (occasionally a <nav>). Because the TOC itself contains nested <div>s, a
// single regex cannot locate the matching close tag, so after finding the
// opening tag we scan forward counting tag depth.
func (p *Processor) extractTOC(html string) (string, string) {
	var tocContent string
	contentWithoutTOC := html

	// Candidate opening tags for the TOC container, most specific first.
	tocStartPatterns := []*regexp.Regexp{
		// Pattern 1: <div id="toc" class="toc">
		regexp.MustCompile(`(?i)<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>`),
		// Pattern 2: <div id="toc">
		regexp.MustCompile(`(?i)<div\s+id=["']toc["'][^>]*>`),
		// Pattern 3: <div class="toc">
		regexp.MustCompile(`(?i)<div\s+class=["']toc["'][^>]*>`),
		// Pattern 4: <nav id="toc">
		regexp.MustCompile(`(?i)<nav\s+id=["']toc["'][^>]*>`),
	}
	var tocStartIdx int = -1
	var tocStartTag string
	for _, pattern := range tocStartPatterns {
		loc := pattern.FindStringIndex(html)
		if loc != nil {
			tocStartIdx = loc[0]
			tocStartTag = html[loc[0]:loc[1]]
			break
		}
	}
	if tocStartIdx == -1 {
		// No TOC found.
		return "", html
	}

	// Walk forward from just after the opening tag, tracking <div> nesting
	// depth; depth hitting zero means we consumed the container's own close.
	searchStart := tocStartIdx + len(tocStartTag)
	depth := 1
	i := searchStart
	for i < len(html) && depth > 0 {
		if i+4 < len(html) && html[i:i+4] == "<div" {
			if i+5 < len(html) && html[i+4] == '/' {
				depth--
				// Skip past the end of this closing tag.
				closeIdx := strings.Index(html[i:], ">")
				if closeIdx == -1 {
					break
				}
				i += closeIdx + 1
			} else {
				// Opening tag — find its '>'.
				closeIdx := strings.Index(html[i:], ">")
				if closeIdx == -1 {
					break
				}
				// Self-closing <div .../> does not increase nesting depth.
				if html[i+closeIdx-1] != '/' {
					depth++
				}
				i += closeIdx + 1
			}
		} else if i+5 < len(html) && html[i:i+5] == "</div" {
			depth--
			closeIdx := strings.Index(html[i:], ">")
			if closeIdx == -1 {
				break
			}
			i += closeIdx + 1
		} else if i+5 < len(html) && html[i:i+5] == "</nav" {
			depth--
			closeIdx := strings.Index(html[i:], ">")
			if closeIdx == -1 {
				break
			}
			i += closeIdx + 1
		} else {
			i++
		}
	}

	if depth == 0 {
		// i now sits just past the container's closing '>'.
		tocEndIdx := i
		tocFullHTML := html[tocStartIdx:tocEndIdx]
		// Strip the outer container tags, keeping only the inner HTML.
		innerStart := len(tocStartTag)
		innerEnd := len(tocFullHTML)
		if strings.HasSuffix(tocFullHTML, "</div>") {
			innerEnd -= len("</div>")
		} else if strings.HasSuffix(tocFullHTML, "</nav>") {
			// BUG FIX: this previously subtracted 7, but len("</nav>") is 6,
			// so the last character of the TOC content was chopped off.
			innerEnd -= len("</nav>")
		}
		tocContent = strings.TrimSpace(tocFullHTML[innerStart:innerEnd])
		// Remove Asciidoctor's "Table of Contents" title block if present.
		toctitlePattern := regexp.MustCompile(`(?s)<div\s+id=["']toctitle["'][^>]*>.*?</div>\s*`)
		tocContent = toctitlePattern.ReplaceAllString(tocContent, "")
		tocContent = strings.TrimSpace(tocContent)
		// Remove the TOC span from the document body.
		contentWithoutTOC = html[:tocStartIdx] + html[tocEndIdx:]
	}
	return tocContent, contentWithoutTOC
}
// Anchor-rewriting patterns, compiled once at package scope. The previous
// implementation recompiled the href/target/rel patterns inside the replace
// callback for EVERY anchor tag in the document.
var (
	anchorTagRegex  = regexp.MustCompile(`<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>`)
	hrefAttrRegex   = regexp.MustCompile(`href\s*=\s*["']([^"']+)["']`)
	targetAttrRegex = regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`)
	relAttrRegex    = regexp.MustCompile(`rel\s*=\s*["']([^"']*)["']`)
)

// processLinks processes HTML anchor tags so that external links open in a
// new tab (target="_blank" rel="noopener noreferrer") while local links —
// relative URLs and absolute URLs whose host equals the linkBaseURL host —
// open in the same tab (any target attribute is stripped).
func (p *Processor) processLinks(html string) string {
	// Host of linkBaseURL (e.g. "alexandria.gitcitadel.eu"); absolute links
	// to this exact host count as local.
	linkBaseHost := ""
	if base, err := url.Parse(p.linkBaseURL); err == nil {
		linkBaseHost = base.Hostname()
	}

	html = anchorTagRegex.ReplaceAllStringFunc(html, func(match string) string {
		hrefSubmatch := hrefAttrRegex.FindStringSubmatch(match)
		if len(hrefSubmatch) < 2 {
			return match // no href found, return as-is
		}
		href := hrefSubmatch[1]

		isExternal := strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://")
		if !isExternal {
			// Local/relative link — strip any target so it opens in the same tab.
			return targetAttrRegex.ReplaceAllString(match, "")
		}

		// Compare hosts exactly. FIX: the previous substring check
		// (strings.Contains(href, domain)) treated any URL merely MENTIONING
		// our domain — e.g. https://evil.com/?x=<ourdomain> — as internal.
		if u, err := url.Parse(href); err == nil && linkBaseHost != "" && u.Hostname() == linkBaseHost {
			return targetAttrRegex.ReplaceAllString(match, "")
		}

		// External link — add target="_blank" and noopener noreferrer unless
		// a target attribute is already present.
		if strings.Contains(match, `target=`) {
			return match
		}
		match = strings.TrimSuffix(match, ">")
		if !strings.Contains(match, `rel=`) {
			return match + ` target="_blank" rel="noopener noreferrer">`
		}
		// Augment an existing rel attribute with noopener/noreferrer.
		match = relAttrRegex.ReplaceAllStringFunc(match, func(relMatch string) string {
			relValue := relAttrRegex.FindStringSubmatch(relMatch)[1]
			if !strings.Contains(relValue, "noopener") {
				relValue += " noopener noreferrer"
			}
			return `rel="` + strings.TrimSpace(relValue) + `"`
		})
		return match + ` target="_blank">`
	})
	return html
}