implement links

3 months ago · f46c78f232
4 changed files with 103 additions and 354 deletions
--- a/7
+++ b/7
@@ -34,9 +34,16 @@ RUN apk add --no-cache ca-certificates tzdata wget
 WORKDIR /app

 # Install Node.js dependencies for AsciiDoc processing
+# Note: package.json references gc-parser as a local dependency (file:../gc-parser).
+# Before building, ensure gc-parser is built: cd ../gc-parser && npm install && npm run build
+# The `npm ci` below then links to it. NOTE(review): ../gc-parser is not COPY'd here - confirm it resolves in the image.
 COPY package.json package-lock.json ./
 RUN npm ci --only=production

+# Copy gc-parser wrapper script
+COPY scripts/ ./scripts/
+RUN chmod +x ./scripts/process-content.js
+
 # Copy built binary from builder
 COPY --from=builder /build/gitcitadel-online /app/gitcitadel-online

--- a/internal/asciidoc/processor.go
+++ b/internal/asciidoc/processor.go
@@ -1,392 +1,90 @@
 package asciidoc

 import (
-	"bytes"
+	"encoding/json"
 	"fmt"
 	"os/exec"
-	"regexp"
+	"path/filepath"
 	"strings"
 )

-// Processor handles AsciiDoc to HTML conversion
+// Processor handles content processing using gc-parser
 type Processor struct {
 	linkBaseURL string
+	scriptPath  string
 }

 // ProcessResult contains the processed HTML content and extracted table of contents
 type ProcessResult struct {
 	Content            string
 	TableOfContents    string
+	HasLaTeX           bool
+	HasMusicalNotation bool
 }

-// NewProcessor creates a new AsciiDoc processor
+// gcParserResult matches the JSON output from gc-parser
+type gcParserResult struct {
+	Content            string        `json:"content"`
+	TableOfContents    string        `json:"tableOfContents"`
+	HasLaTeX           bool          `json:"hasLaTeX"`
+	HasMusicalNotation bool          `json:"hasMusicalNotation"`
+	NostrLinks         []interface{} `json:"nostrLinks"`
+	Wikilinks          []interface{} `json:"wikilinks"`
+	Hashtags           []string      `json:"hashtags"`
+	Links              []interface{} `json:"links"`
+	Media              []string      `json:"media"`
+	Error              string        `json:"error,omitempty"`
+}
+
+// NewProcessor creates a new content processor using gc-parser
 func NewProcessor(linkBaseURL string) *Processor {
+	// The script path is relative to the process working directory, not the executable.
+	// In production the binary must be started from the directory that contains scripts/ (e.g. /app).
+	scriptPath := filepath.Join("scripts", "process-content.js")
+
 	return &Processor{
 		linkBaseURL: linkBaseURL,
+		scriptPath:  scriptPath,
 	}
 }

-// Process converts AsciiDoc content to HTML with link rewriting
+// Process converts content (AsciiDoc, Markdown, etc.) to HTML using gc-parser
 // Returns both the content HTML and the extracted table of contents
-func (p *Processor) Process(asciidocContent string) (*ProcessResult, error) {
-	// First, rewrite links in the AsciiDoc content
-	processedContent := p.rewriteLinks(asciidocContent)
-
-	// Convert AsciiDoc to HTML using asciidoctor CLI
-	html, err := p.convertToHTML(processedContent)
-	if err != nil {
-		return nil, fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err)
-	}
-
-	// Extract table of contents from HTML
-	toc, contentWithoutTOC := p.extractTOC(html)
-
-	// Sanitize HTML to prevent XSS
-	sanitized := p.sanitizeHTML(contentWithoutTOC)
-
-	// Process links: make external links open in new tab, local links in same tab
-	processed := p.processLinks(sanitized)
-
-	// Also sanitize and process links in TOC
-	tocSanitized := p.sanitizeHTML(toc)
-	tocProcessed := p.processLinks(tocSanitized)
-
-	return &ProcessResult{
-		Content:         processed,
-		TableOfContents: tocProcessed,
-	}, nil
-}
-
-// rewriteLinks rewrites wikilinks and nostr: links in AsciiDoc content
-func (p *Processor) rewriteLinks(content string) string {
-	// Rewrite wikilinks: [[target]] or [[target|display text]]
-	// Format: [[target]] -> https://alexandria.gitcitadel.eu/events?d=<normalized-d-tag>
-	wikilinkRegex := regexp.MustCompile(`\[\[([^\]]+)\]\]`)
-	content = wikilinkRegex.ReplaceAllStringFunc(content, func(match string) string {
-		// Extract the content inside [[ ]]
-		inner := match[2 : len(match)-2]
-
-		var target, display string
-		if strings.Contains(inner, "|") {
-			parts := strings.SplitN(inner, "|", 2)
-			target = strings.TrimSpace(parts[0])
-			display = strings.TrimSpace(parts[1])
-		} else {
-			target = strings.TrimSpace(inner)
-			display = target
-		}
-
-		// Normalize the d tag (convert to lowercase, replace spaces with hyphens, etc.)
-		normalized := normalizeDTag(target)
-
-		// Create the link
-		url := fmt.Sprintf("%s/events?d=%s", p.linkBaseURL, normalized)
-		return fmt.Sprintf("link:%s[%s]", url, display)
-	})
-
-	// Rewrite nostr: links: nostr:naddr1... or nostr:nevent1...
-	// Format: nostr:naddr1... -> https://alexandria.gitcitadel.eu/events?id=naddr1...
-	nostrLinkRegex := regexp.MustCompile(`nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+)`)
-	content = nostrLinkRegex.ReplaceAllStringFunc(content, func(match string) string {
-		nostrID := strings.TrimPrefix(match, "nostr:")
-		url := fmt.Sprintf("%s/events?id=%s", p.linkBaseURL, nostrID)
-		return url
-	})
-
-	return content
-}
-
-// normalizeDTag normalizes a d tag according to NIP-54 rules
-func normalizeDTag(dTag string) string {
-	// Convert to lowercase
-	dTag = strings.ToLower(dTag)
-
-	// Convert whitespace to hyphens
-	dTag = strings.ReplaceAll(dTag, " ", "-")
-	dTag = strings.ReplaceAll(dTag, "\t", "-")
-	dTag = strings.ReplaceAll(dTag, "\n", "-")
-
-	// Remove punctuation and symbols (keep alphanumeric, hyphens, and non-ASCII)
-	var result strings.Builder
-	for _, r := range dTag {
-		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r > 127 {
-			result.WriteRune(r)
-		}
-	}
-	dTag = result.String()
-
-	// Collapse multiple consecutive hyphens
-	for strings.Contains(dTag, "--") {
-		dTag = strings.ReplaceAll(dTag, "--", "-")
-	}
-
-	// Remove leading and trailing hyphens
-	dTag = strings.Trim(dTag, "-")
-
-	return dTag
-}
-
-// convertToHTML converts AsciiDoc to HTML using asciidoctor.js via Node.js
-func (p *Processor) convertToHTML(asciidocContent string) (string, error) {
+func (p *Processor) Process(content string) (*ProcessResult, error) {
 	// Check if node is available
 	cmd := exec.Command("node", "--version")
 	if err := cmd.Run(); err != nil {
-		return "", fmt.Errorf("node.js not found: %w", err)
+		return nil, fmt.Errorf("node.js not found: %w", err)
 	}

-	// JavaScript code to run asciidoctor.js
-	// Read content from stdin to handle special characters properly
-	jsCode := `
-		const asciidoctor = require('@asciidoctor/core')();
-		
-		let content = '';
-		process.stdin.setEncoding('utf8');
+	// Run gc-parser script
+	cmd = exec.Command("node", p.scriptPath, p.linkBaseURL)
+	cmd.Stdin = strings.NewReader(content)

-		process.stdin.on('data', (chunk) => {
-			content += chunk;
-		});
-		
-		process.stdin.on('end', () => {
-			try {
-				const html = asciidoctor.convert(content, {
-					safe: 'safe',
-					backend: 'html5',
-					doctype: 'article',
-					attributes: {
-						'showtitle': true,
-						'icons': 'font',
-						'sectanchors': true,
-						'sectlinks': true,
-						'toc': 'left',
-						'toclevels': 3
-					}
-				});
-				process.stdout.write(html);
-			} catch (error) {
-				console.error('Error converting AsciiDoc:', error.message);
-				process.exit(1);
-			}
-		});
-	`
-
-	// Run node with the JavaScript code, passing content via stdin
-	cmd = exec.Command("node", "-e", jsCode)
-	cmd.Stdin = strings.NewReader(asciidocContent)
-
-	var stdout, stderr bytes.Buffer
+	var stdout, stderr strings.Builder
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr

 	if err := cmd.Run(); err != nil {
-		return "", fmt.Errorf("asciidoctor.js conversion failed: %w, stderr: %s", err, stderr.String())
-	}
-
-	return stdout.String(), nil
-}
-
-// sanitizeHTML performs basic HTML sanitization to prevent XSS
-// Note: This is a basic implementation. For production, consider using a proper HTML sanitizer library
-func (p *Processor) sanitizeHTML(html string) string {
-	// Remove script tags and their content
-	scriptRegex := regexp.MustCompile(`(?i)<script[^>]*>.*?</script>`)
-	html = scriptRegex.ReplaceAllString(html, "")
-
-	// Remove event handlers (onclick, onerror, etc.)
-	eventHandlerRegex := regexp.MustCompile(`(?i)\s*on\w+\s*=\s*["'][^"']*["']`)
-	html = eventHandlerRegex.ReplaceAllString(html, "")
-
-	// Remove javascript: protocol in links
-	javascriptRegex := regexp.MustCompile(`(?i)javascript:`)
-	html = javascriptRegex.ReplaceAllString(html, "")
-
-	// Remove data: URLs that could be dangerous
-	dataURLRegex := regexp.MustCompile(`(?i)data:\s*text/html`)
-	html = dataURLRegex.ReplaceAllString(html, "")
-
-	return html
-}
-
-// extractTOC extracts the table of contents from AsciiDoc HTML output
-// Returns the TOC HTML and the content HTML without the TOC
-func (p *Processor) extractTOC(html string) (string, string) {
-	// AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
-	// We need to match the entire TOC div including nested content
-	// Since divs can be nested, we need to count opening/closing tags
-
-	var tocContent string
-	contentWithoutTOC := html
-
-	// Find the start of the TOC div - try multiple patterns
-	tocStartPatterns := []*regexp.Regexp{
-		// Pattern 1: <div id="toc" class="toc">
-		regexp.MustCompile(`(?i)<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>`),
-		// Pattern 2: <div id="toc">
-		regexp.MustCompile(`(?i)<div\s+id=["']toc["'][^>]*>`),
-		// Pattern 3: <div class="toc">
-		regexp.MustCompile(`(?i)<div\s+class=["']toc["'][^>]*>`),
-		// Pattern 4: <nav id="toc">
-		regexp.MustCompile(`(?i)<nav\s+id=["']toc["'][^>]*>`),
-	}
-
-	var tocStartIdx int = -1
-	var tocStartTag string
-
-	for _, pattern := range tocStartPatterns {
-		loc := pattern.FindStringIndex(html)
-		if loc != nil {
-			tocStartIdx = loc[0]
-			tocStartTag = html[loc[0]:loc[1]]
-			break
-		}
+		return nil, fmt.Errorf("gc-parser failed: %w, stderr: %s", err, stderr.String())
 	}

-	if tocStartIdx == -1 {
-		// No TOC found
-		return "", html
+	// Parse JSON output
+	var result gcParserResult
+	output := stdout.String()
+	if err := json.Unmarshal([]byte(output), &result); err != nil {
+		return nil, fmt.Errorf("failed to parse gc-parser output: %w, output: %s", err, output)
 	}

-	// Find the matching closing tag by counting div tags
-	// Start after the opening tag
-	searchStart := tocStartIdx + len(tocStartTag)
-	depth := 1
-	i := searchStart
-
-	for i < len(html) && depth > 0 {
-		// Look for opening or closing div/nav tags
-		if i+4 < len(html) && html[i:i+4] == "<div" {
-			// Check if it's a closing tag
-			if i+5 < len(html) && html[i+4] == '/' {
-				depth--
-				// Find the end of this closing tag
-				closeIdx := strings.Index(html[i:], ">")
-				if closeIdx == -1 {
-					break
-				}
-				i += closeIdx + 1
-			} else {
-				// Opening tag - find the end
-				closeIdx := strings.Index(html[i:], ">")
-				if closeIdx == -1 {
-					break
-				}
-				// Check if it's self-closing
-				if html[i+closeIdx-1] != '/' {
-					depth++
-				}
-				i += closeIdx + 1
-			}
-		} else if i+5 < len(html) && html[i:i+5] == "</div" {
-			depth--
-			closeIdx := strings.Index(html[i:], ">")
-			if closeIdx == -1 {
-				break
-			}
-			i += closeIdx + 1
-		} else if i+5 < len(html) && html[i:i+5] == "</nav" {
-			depth--
-			closeIdx := strings.Index(html[i:], ">")
-			if closeIdx == -1 {
-				break
+	// Check for error in result
+	if result.Error != "" {
+		return nil, fmt.Errorf("gc-parser error: %s", result.Error)
 	}
-			i += closeIdx + 1
-		} else {
-			i++
-		}
-	}
-
-	if depth == 0 {
-		// Found the matching closing tag
-		tocEndIdx := i
-		// Extract the TOC content (inner HTML)
-		tocFullHTML := html[tocStartIdx:tocEndIdx]
-		// Extract just the inner content (without the outer div tags)
-		innerStart := len(tocStartTag)
-		innerEnd := len(tocFullHTML)
-		// Find the last </div> or </nav>
-		if strings.HasSuffix(tocFullHTML, "</div>") {
-			innerEnd -= 6
-		} else if strings.HasSuffix(tocFullHTML, "</nav>") {
-			innerEnd -= 7
-		}
-		tocContent = strings.TrimSpace(tocFullHTML[innerStart:innerEnd])
-
-		// Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
-		toctitlePattern := regexp.MustCompile(`(?s)<div\s+id=["']toctitle["'][^>]*>.*?</div>\s*`)
-		tocContent = toctitlePattern.ReplaceAllString(tocContent, "")
-		tocContent = strings.TrimSpace(tocContent)

-		// Remove the TOC from the content
-		contentWithoutTOC = html[:tocStartIdx] + html[tocEndIdx:]
-	}
-
-	return tocContent, contentWithoutTOC
-}
-
-// processLinks processes HTML links to add target="_blank" to external links
-// External links are those that start with http:// or https:// and don't point to the linkBaseURL domain
-// Local links (including relative links and links to linkBaseURL) open in the same tab
-func (p *Processor) processLinks(html string) string {
-	// Extract domain from linkBaseURL for comparison
-	linkBaseDomain := ""
-	if strings.HasPrefix(p.linkBaseURL, "http://") || strings.HasPrefix(p.linkBaseURL, "https://") {
-		// Extract domain (e.g., "alexandria.gitcitadel.eu" from "https://alexandria.gitcitadel.eu")
-		parts := strings.Split(strings.TrimPrefix(strings.TrimPrefix(p.linkBaseURL, "https://"), "http://"), "/")
-		if len(parts) > 0 {
-			linkBaseDomain = parts[0]
-		}
-	}
-
-	// Regex to match <a> tags with href attributes (more flexible pattern)
-	linkRegex := regexp.MustCompile(`<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>`)
-
-	html = linkRegex.ReplaceAllStringFunc(html, func(match string) string {
-		// Extract href value
-		hrefMatch := regexp.MustCompile(`href\s*=\s*["']([^"']+)["']`)
-		hrefSubmatch := hrefMatch.FindStringSubmatch(match)
-		if len(hrefSubmatch) < 2 {
-			return match // No href found, return as-is
-		}
-		href := hrefSubmatch[1]
-
-		// Check if it's an external link (starts with http:// or https://)
-		isExternal := strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://")
-
-		if isExternal {
-			// Check if it's pointing to our own domain
-			if linkBaseDomain != "" && strings.Contains(href, linkBaseDomain) {
-				// Same domain - open in same tab (remove any existing target attribute)
-				targetRegex := regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`)
-				match = targetRegex.ReplaceAllString(match, "")
-				return match
-			}
-
-			// External link - add target="_blank" and rel="noopener noreferrer" if not already present
-			if !strings.Contains(match, `target=`) {
-				// Insert before the closing >
-				match = strings.TrimSuffix(match, ">")
-				if !strings.Contains(match, `rel=`) {
-					match += ` target="_blank" rel="noopener noreferrer">`
-				} else {
-					// Update existing rel attribute to include noopener if not present
-					relRegex := regexp.MustCompile(`rel\s*=\s*["']([^"']*)["']`)
-					match = relRegex.ReplaceAllStringFunc(match, func(relMatch string) string {
-						relValue := relRegex.FindStringSubmatch(relMatch)[1]
-						if !strings.Contains(relValue, "noopener") {
-							relValue += " noopener noreferrer"
-						}
-						return `rel="` + strings.TrimSpace(relValue) + `"`
-					})
-					match += ` target="_blank">`
-				}
-			}
-		} else {
-			// Local/relative link - ensure it opens in same tab (remove target if present)
-			targetRegex := regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`)
-			match = targetRegex.ReplaceAllString(match, "")
-		}
-
-		return match
-	})
-
-	return html
+	return &ProcessResult{
+		Content:            result.Content,
+		TableOfContents:    result.TableOfContents,
+		HasLaTeX:           result.HasLaTeX,
+		HasMusicalNotation: result.HasMusicalNotation,
+	}, nil
 }
--- a/package.json
+++ b/package.json
@@ -1,6 +1,5 @@
 {
  "dependencies": {
-    "@asciidoctor/core": "^3.0.4",
-    "marked": "^12.0.0"
+    "gc-parser": "file:../gc-parser"
  }
 }
--- a/scripts/process-content.js
+++ b/scripts/process-content.js
@@ -0,0 +1,45 @@
+#!/usr/bin/env node
+/**
+ * Wrapper script to process content using gc-parser
+ * Called from Go code via exec
+ */
+
+const { Parser } = require('gc-parser');
+
+// Read content from stdin
+let content = '';
+process.stdin.setEncoding('utf8');
+
+process.stdin.on('data', (chunk) => {
+  content += chunk;
+});
+
+process.stdin.on('end', async () => {
+  try {
+    // Link base URL: the LINK_BASE_URL env var takes precedence over the first CLI argument
+    const linkBaseURL = process.env.LINK_BASE_URL || process.argv[2] || '';
+    
+    // Create parser with options
+    const parser = new Parser({
+      linkBaseURL: linkBaseURL,
+      enableAsciiDoc: true,
+      enableMarkdown: true,
+      enableCodeHighlighting: true,
+      enableLaTeX: true,
+      enableMusicalNotation: true,
+      enableNostrAddresses: true,
+    });
+
+    // Process content
+    const result = await parser.process(content);
+
+    // Output as JSON
+    console.log(JSON.stringify(result));
+  } catch (error) {
+    console.error(JSON.stringify({
+      error: error.message,
+      stack: error.stack,
+    }));
+    process.exit(1);
+  }
+});