// gitcitadel-online/internal/asciidoc/processor.go

package asciidoc

import (
	"bytes"
	"fmt"
	"os/exec"
	"regexp"
	"strings"
)

// Processor handles AsciiDoc to HTML conversion.
//
// It holds the configuration shared by the conversion steps; Process is the
// entry point that runs the whole pipeline.
type Processor struct {
	// linkBaseURL is the prefix rewriteLinks puts in front of the
	// "/events?d=..." and "/events?id=..." paths it generates. processLinks
	// also derives a domain from it to decide which rendered links are local.
	linkBaseURL string
}

// NewProcessor returns a Processor whose generated links are rooted at
// linkBaseURL. The value is stored as given; no validation or normalisation
// is applied.
func NewProcessor(linkBaseURL string) *Processor {
	proc := new(Processor)
	proc.linkBaseURL = linkBaseURL
	return proc
}

// Process turns AsciiDoc source into HTML.
//
// The pipeline runs in a fixed order:
//  1. rewriteLinks rewrites wikilinks and nostr: references in the source,
//  2. convertToHTML renders the rewritten source,
//  3. sanitizeHTML strips script-capable markup from the rendered HTML,
//  4. processLinks adjusts where the remaining links open.
//
// It returns the final HTML, or an empty string and a wrapped error when the
// rendering step fails.
func (p *Processor) Process(asciidocContent string) (string, error) {
	rendered, err := p.convertToHTML(p.rewriteLinks(asciidocContent))
	if err != nil {
		return "", fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err)
	}

	// Sanitise before touching links so processLinks only ever sees cleaned markup.
	return p.processLinks(p.sanitizeHTML(rendered)), nil
}

// Patterns used by rewriteLinks, compiled once at package initialisation
// rather than on every call.
var (
	// rewriteLinksWikilinkRe matches [[target]] and [[target|display text]].
	rewriteLinksWikilinkRe = regexp.MustCompile(`\[\[([^\]]+)\]\]`)
	// rewriteLinksNostrRe matches nostr:naddr1... and nostr:nevent1...
	// references, up to the next whitespace or closing bracket.
	rewriteLinksNostrRe = regexp.MustCompile(`nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+)`)
)

// rewriteLinks rewrites wikilinks and nostr: links in AsciiDoc content.
//
// Wikilinks become AsciiDoc link macros pointing at the configured base URL:
//
//	[[Target]]        -> link:<base>/events?d=<normalized target>[Target]
//	[[Target|Label]]  -> link:<base>/events?d=<normalized target>[Label]
//
// nostr: references become plain URLs:
//
//	nostr:naddr1...   -> <base>/events?id=naddr1...
//	nostr:nevent1...  -> <base>/events?id=nevent1...
//
// It returns the rewritten content; text that matches neither pattern is
// left untouched.
func (p *Processor) rewriteLinks(content string) string {
	content = rewriteLinksWikilinkRe.ReplaceAllStringFunc(content, func(match string) string {
		// Drop the surrounding "[[" and "]]".
		inner := match[2 : len(match)-2]

		target, display := inner, ""
		if sep := strings.Index(inner, "|"); sep >= 0 {
			target, display = inner[:sep], inner[sep+1:]
		}
		target = strings.TrimSpace(target)
		display = strings.TrimSpace(display)
		if display == "" {
			// No label (or an empty one, as in "[[target|]]"): show the target.
			display = target
		}

		normalized := normalizeDTag(target)
		if normalized == "" {
			// Nothing usable remains of the target (e.g. "[[ ]]" or "[[|x]]").
			// Leave the source text alone instead of emitting a link to
			// "/events?d=" that cannot resolve to anything.
			return match
		}

		url := fmt.Sprintf("%s/events?d=%s", p.linkBaseURL, normalized)
		return fmt.Sprintf("link:%s[%s]", url, display)
	})

	return rewriteLinksNostrRe.ReplaceAllStringFunc(content, func(match string) string {
		nostrID := strings.TrimPrefix(match, "nostr:")
		return fmt.Sprintf("%s/events?id=%s", p.linkBaseURL, nostrID)
	})
}

// normalizeDTag normalizes a d tag according to NIP-54 rules.
//
// The input is lower-cased; spaces, tabs and newlines become hyphens; every
// other ASCII character that is not a letter, digit or hyphen is dropped,
// while all non-ASCII runes are kept as they are. Runs of hyphens are
// collapsed to one and hyphens at either end are removed.
func normalizeDTag(dTag string) string {
	var out strings.Builder
	lastWasHyphen := false

	for _, r := range strings.ToLower(dTag) {
		switch r {
		case ' ', '\t', '\n':
			r = '-'
		}

		isLetter := 'a' <= r && r <= 'z'
		isDigit := '0' <= r && r <= '9'
		if !isLetter && !isDigit && r != '-' && r <= 127 {
			// Dropped characters do not reset lastWasHyphen: hyphens on both
			// sides of a removed symbol still collapse into one.
			continue
		}

		if r == '-' {
			if lastWasHyphen {
				continue
			}
			lastWasHyphen = true
		} else {
			lastWasHyphen = false
		}
		out.WriteRune(r)
	}

	return strings.Trim(out.String(), "-")
}

// convertToHTML converts AsciiDoc to HTML by running an inline asciidoctor.js
// script with `node -e`.
//
// The document is streamed to the script on stdin and the rendered HTML is
// read back from stdout. It returns an error when no `node` executable can be
// found on PATH, or when the node process exits unsuccessfully; in the latter
// case the error includes whatever the process wrote to stderr.
//
// NOTE(review): the child process runs without a timeout, so a hung node
// process blocks the caller indefinitely.
func (p *Processor) convertToHTML(asciidocContent string) (string, error) {
	// Resolve the interpreter on PATH rather than spawning `node --version`
	// as a probe: that halves the number of processes started per conversion.
	nodePath, err := exec.LookPath("node")
	if err != nil {
		return "", fmt.Errorf("node.js not found: %w", err)
	}

	// JavaScript code to run asciidoctor.js
	// Read content from stdin to handle special characters properly
	jsCode := `
		const asciidoctor = require('@asciidoctor/core')();

		let content = '';
		process.stdin.setEncoding('utf8');

		process.stdin.on('data', (chunk) => {
			content += chunk;
		});

		process.stdin.on('end', () => {
			try {
				const html = asciidoctor.convert(content, {
					safe: 'safe',
					backend: 'html5',
					doctype: 'article',
					attributes: {
						'showtitle': true,
						'icons': 'font',
						'sectanchors': true,
						'sectlinks': true,
						'toc': 'left',
						'toclevels': 3
					}
				});
				process.stdout.write(html);
			} catch (error) {
				console.error('Error converting AsciiDoc:', error.message);
				process.exit(1);
			}
		});
	`

	// Run node with the JavaScript code, passing content via stdin
	cmd := exec.Command(nodePath, "-e", jsCode)
	cmd.Stdin = strings.NewReader(asciidocContent)

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("asciidoctor.js conversion failed: %w, stderr: %s", err, stderr.String())
	}

	return stdout.String(), nil
}

// Patterns used by sanitizeHTML, compiled once at package initialisation
// rather than on every call.
var (
	// sanitizeScriptRe matches a whole <script> element. The `s` flag lets
	// `.` cross newlines so multi-line scripts are matched too, and the
	// closing tag may carry trailing junk ("</script >").
	sanitizeScriptRe = regexp.MustCompile(`(?is)<script[^>]*>.*?</script[^>]*>`)
	// sanitizeQuotedHandlerRe matches on*="..." and on*='...' with the value
	// delimited by the same quote character that opened it.
	sanitizeQuotedHandlerRe = regexp.MustCompile(`(?i)\s*on\w+\s*=\s*(?:"[^"]*"|'[^']*')`)
	// sanitizeTagRe matches one opening tag (no nested angle brackets).
	sanitizeTagRe = regexp.MustCompile(`<[a-zA-Z][^<>]*>`)
	// sanitizeTagSegmentRe splits a tag into quoted attribute values and the
	// unquoted text between them.
	sanitizeTagSegmentRe = regexp.MustCompile(`"[^"]*"|'[^']*'|[^"']+`)
	// sanitizeBareHandlerRe matches an unquoted handler such as
	// " onerror=alert(1)" or "/onload=alert(1)".
	sanitizeBareHandlerRe = regexp.MustCompile(`(?i)[\s/]+on\w+\s*=\s*[^\s"'>]+`)
	// sanitizeJavascriptRe matches the javascript: URL scheme.
	sanitizeJavascriptRe = regexp.MustCompile(`(?i)javascript:`)
	// sanitizeDataURLRe matches data: URLs that carry HTML.
	sanitizeDataURLRe = regexp.MustCompile(`(?i)data:\s*text/html`)
)

// sanitizeMaxPasses bounds how often sanitizeHTML re-runs its filters.
// Well-formed documents settle after one changing pass; the bound keeps
// deliberately nested input from costing one full scan per nesting level.
const sanitizeMaxPasses = 16

// sanitizeHTML performs basic HTML sanitization to prevent XSS.
// Note: This is a basic implementation. For production, consider using a proper HTML sanitizer library
//
// It removes <script> elements, inline event-handler attributes (quoted
// anywhere, unquoted inside tags), "javascript:" and "data:text/html".
//
// Every filter works by deletion, and deleting text can splice the
// surrounding characters into a new dangerous token
// ("javajavascript:script:" becomes "javascript:"). The filters are therefore
// repeated until the output stops changing. Input that is still changing
// after sanitizeMaxPasses passes is treated as hostile and returned with
// every "<" escaped, which renders it as inert text.
func (p *Processor) sanitizeHTML(html string) string {
	for pass := 0; pass < sanitizeMaxPasses; pass++ {
		cleaned := sanitizeHTMLPass(html)
		if cleaned == html {
			return cleaned
		}
		html = cleaned
	}

	// Fail closed: without tags there is nothing left for a browser to execute.
	return strings.ReplaceAll(html, "<", "&lt;")
}

// sanitizeHTMLPass applies each sanitizeHTML filter exactly once.
func sanitizeHTMLPass(html string) string {
	// Remove script tags and their content
	html = sanitizeScriptRe.ReplaceAllString(html, "")

	// Remove quoted event handlers (onclick="...", onerror='...', etc.)
	html = sanitizeQuotedHandlerRe.ReplaceAllString(html, "")

	// Remove unquoted event handlers. These are only looked for inside tags,
	// and only outside quoted attribute values, so that ordinary text such
	// as "one=1" in prose or in a URL is not deleted.
	html = sanitizeTagRe.ReplaceAllStringFunc(html, func(tag string) string {
		return sanitizeTagSegmentRe.ReplaceAllStringFunc(tag, func(segment string) string {
			if segment[0] == '"' || segment[0] == '\'' {
				return segment
			}
			return sanitizeBareHandlerRe.ReplaceAllString(segment, "")
		})
	})

	// Remove javascript: protocol in links
	html = sanitizeJavascriptRe.ReplaceAllString(html, "")

	// Remove data: URLs that could be dangerous
	html = sanitizeDataURLRe.ReplaceAllString(html, "")

	return html
}

// Patterns used by processLinks, compiled once at package initialisation
// rather than on every call and for every link.
var (
	// processLinksAnchorRe matches an <a ...> tag with a quoted href.
	// Groups: attributes before href, the href value, attributes after href.
	processLinksAnchorRe = regexp.MustCompile(`<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>`)
	// processLinksTargetRe matches a quoted target attribute, including the
	// whitespace in front of it.
	processLinksTargetRe = regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`)
	// processLinksRelRe matches a quoted rel attribute and captures its value.
	processLinksRelRe = regexp.MustCompile(`rel\s*=\s*["']([^"']*)["']`)
)

// processLinksHost returns the lower-cased host (including any port) of an
// absolute http:// or https:// URL, or "" when rawURL is not such a URL.
func processLinksHost(rawURL string) string {
	var rest string
	switch {
	case strings.HasPrefix(rawURL, "https://"):
		rest = strings.TrimPrefix(rawURL, "https://")
	case strings.HasPrefix(rawURL, "http://"):
		rest = strings.TrimPrefix(rawURL, "http://")
	default:
		return ""
	}
	// The authority ends at the first path, query or fragment delimiter.
	if end := strings.IndexAny(rest, `/?#\`); end >= 0 {
		rest = rest[:end]
	}
	// Skip "user:password@" so it cannot pose as the host.
	if at := strings.LastIndex(rest, "@"); at >= 0 {
		rest = rest[at+1:]
	}
	return strings.ToLower(rest)
}

// processLinks processes HTML links to add target="_blank" to external links.
//
// A link is external when its href starts with http:// or https:// and its
// host is neither the host of linkBaseURL nor a subdomain of it. The hosts
// themselves are compared, so a URL that merely mentions the base domain in
// its path or query, or uses it as a prefix of another domain, is external.
//
//   - External links without a target attribute get target="_blank" and a
//     rel containing "noopener noreferrer" (an existing rel is extended).
//   - External links that already carry a target attribute are left as-is.
//   - Local links (relative, non-http(s), or on the base host) have any
//     target attribute removed so they open in the same tab.
//
// Attribute checks look only at the attributes around href, never at the
// href value, so a URL such as "https://x/?target=1" is not mistaken for a
// tag that already has a target attribute.
func (p *Processor) processLinks(html string) string {
	baseHost := processLinksHost(p.linkBaseURL)

	stripTarget := func(attrs string) string {
		return processLinksTargetRe.ReplaceAllString(attrs, "")
	}
	// extendRel adds "noopener noreferrer" to every rel attribute in attrs
	// that does not mention noopener yet.
	extendRel := func(attrs string) string {
		return processLinksRelRe.ReplaceAllStringFunc(attrs, func(relAttr string) string {
			relValue := processLinksRelRe.FindStringSubmatch(relAttr)[1]
			if !strings.Contains(relValue, "noopener") {
				relValue += " noopener noreferrer"
			}
			return `rel="` + strings.TrimSpace(relValue) + `"`
		})
	}

	return processLinksAnchorRe.ReplaceAllStringFunc(html, func(tag string) string {
		loc := processLinksAnchorRe.FindStringSubmatchIndex(tag)
		if loc == nil {
			return tag // No href found, return as-is
		}
		// Split the tag around the href attribute; the final ">" is re-added
		// when the tag is reassembled.
		before := tag[:loc[3]]         // "<a " plus attributes ahead of href
		hrefAttr := tag[loc[3]:loc[6]] // href="value"
		after := tag[loc[6]:loc[7]]    // attributes following href
		href := tag[loc[4]:loc[5]]

		isExternal := strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://")
		if isExternal && baseHost != "" {
			host := processLinksHost(href)
			if host == baseHost || strings.HasSuffix(host, "."+baseHost) {
				isExternal = false
			}
		}

		if !isExternal {
			// Local link - ensure it opens in the same tab.
			return stripTarget(before) + hrefAttr + stripTarget(after) + ">"
		}

		if strings.Contains(before, "target=") || strings.Contains(after, "target=") {
			return tag
		}
		if !strings.Contains(before, "rel=") && !strings.Contains(after, "rel=") {
			return before + hrefAttr + after + ` target="_blank" rel="noopener noreferrer">`
		}
		return extendRel(before) + hrefAttr + extendRel(after) + ` target="_blank">`
	})
}