From f46c78f2322557b1b5e15dacdd1795ee84df190e Mon Sep 17 00:00:00 2001 From: Silberengel Date: Tue, 3 Mar 2026 15:25:31 +0100 Subject: [PATCH] implement links --- Dockerfile | 7 + internal/asciidoc/processor.go | 402 ++++----------------------------- package.json | 3 +- scripts/process-content.js | 45 ++++ 4 files changed, 103 insertions(+), 354 deletions(-) create mode 100644 scripts/process-content.js diff --git a/Dockerfile b/Dockerfile index 5a2950a..6ab52e2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,9 +34,16 @@ RUN apk add --no-cache ca-certificates tzdata wget WORKDIR /app # Install Node.js dependencies for AsciiDoc processing +# Note: gc-parser is referenced from ../gc-parser in package.json +# Before building, ensure gc-parser is built: cd ../gc-parser && npm install && npm run build +# Then npm install here will link to the built gc-parser COPY package.json package-lock.json ./ RUN npm ci --only=production +# Copy gc-parser wrapper script +COPY scripts/ ./scripts/ +RUN chmod +x ./scripts/process-content.js + # Copy built binary from builder COPY --from=builder /build/gitcitadel-online /app/gitcitadel-online diff --git a/internal/asciidoc/processor.go b/internal/asciidoc/processor.go index a3b677b..efd1e2b 100644 --- a/internal/asciidoc/processor.go +++ b/internal/asciidoc/processor.go @@ -1,392 +1,90 @@ package asciidoc import ( - "bytes" + "encoding/json" "fmt" "os/exec" - "regexp" + "path/filepath" "strings" ) -// Processor handles AsciiDoc to HTML conversion +// Processor handles content processing using gc-parser type Processor struct { linkBaseURL string + scriptPath string } // ProcessResult contains the processed HTML content and extracted table of contents type ProcessResult struct { - Content string - TableOfContents string + Content string + TableOfContents string + HasLaTeX bool + HasMusicalNotation bool } -// NewProcessor creates a new AsciiDoc processor +// gcParserResult matches the JSON output from gc-parser +type gcParserResult struct 
{ + Content string `json:"content"` + TableOfContents string `json:"tableOfContents"` + HasLaTeX bool `json:"hasLaTeX"` + HasMusicalNotation bool `json:"hasMusicalNotation"` + NostrLinks []interface{} `json:"nostrLinks"` + Wikilinks []interface{} `json:"wikilinks"` + Hashtags []string `json:"hashtags"` + Links []interface{} `json:"links"` + Media []string `json:"media"` + Error string `json:"error,omitempty"` +} + +// NewProcessor creates a new content processor using gc-parser func NewProcessor(linkBaseURL string) *Processor { + // Resolve the script path relative to the process working directory + // In production the binary must be started from the directory containing scripts/ + scriptPath := filepath.Join("scripts", "process-content.js") + return &Processor{ linkBaseURL: linkBaseURL, + scriptPath: scriptPath, } } -// Process converts AsciiDoc content to HTML with link rewriting +// Process converts content (AsciiDoc, Markdown, etc.) to HTML using gc-parser // Returns both the content HTML and the extracted table of contents -func (p *Processor) Process(asciidocContent string) (*ProcessResult, error) { - // First, rewrite links in the AsciiDoc content - processedContent := p.rewriteLinks(asciidocContent) - - // Convert AsciiDoc to HTML using asciidoctor CLI - html, err := p.convertToHTML(processedContent) - if err != nil { - return nil, fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err) - } - - // Extract table of contents from HTML - toc, contentWithoutTOC := p.extractTOC(html) - - // Sanitize HTML to prevent XSS - sanitized := p.sanitizeHTML(contentWithoutTOC) - - // Process links: make external links open in new tab, local links in same tab - processed := p.processLinks(sanitized) - - // Also sanitize and process links in TOC - tocSanitized := p.sanitizeHTML(toc) - tocProcessed := p.processLinks(tocSanitized) - - return &ProcessResult{ - Content: processed, - TableOfContents: tocProcessed, - }, nil -} - -// rewriteLinks rewrites wikilinks and nostr: links in AsciiDoc content
-func (p *Processor) rewriteLinks(content string) string { - // Rewrite wikilinks: [[target]] or [[target|display text]] - // Format: [[target]] -> https://alexandria.gitcitadel.eu/events?d= - wikilinkRegex := regexp.MustCompile(`\[\[([^\]]+)\]\]`) - content = wikilinkRegex.ReplaceAllStringFunc(content, func(match string) string { - // Extract the content inside [[ ]] - inner := match[2 : len(match)-2] - - var target, display string - if strings.Contains(inner, "|") { - parts := strings.SplitN(inner, "|", 2) - target = strings.TrimSpace(parts[0]) - display = strings.TrimSpace(parts[1]) - } else { - target = strings.TrimSpace(inner) - display = target - } - - // Normalize the d tag (convert to lowercase, replace spaces with hyphens, etc.) - normalized := normalizeDTag(target) - - // Create the link - url := fmt.Sprintf("%s/events?d=%s", p.linkBaseURL, normalized) - return fmt.Sprintf("link:%s[%s]", url, display) - }) - - // Rewrite nostr: links: nostr:naddr1... or nostr:nevent1... - // Format: nostr:naddr1... -> https://alexandria.gitcitadel.eu/events?id=naddr1... 
- nostrLinkRegex := regexp.MustCompile(`nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+)`) - content = nostrLinkRegex.ReplaceAllStringFunc(content, func(match string) string { - nostrID := strings.TrimPrefix(match, "nostr:") - url := fmt.Sprintf("%s/events?id=%s", p.linkBaseURL, nostrID) - return url - }) - - return content -} - -// normalizeDTag normalizes a d tag according to NIP-54 rules -func normalizeDTag(dTag string) string { - // Convert to lowercase - dTag = strings.ToLower(dTag) - - // Convert whitespace to hyphens - dTag = strings.ReplaceAll(dTag, " ", "-") - dTag = strings.ReplaceAll(dTag, "\t", "-") - dTag = strings.ReplaceAll(dTag, "\n", "-") - - // Remove punctuation and symbols (keep alphanumeric, hyphens, and non-ASCII) - var result strings.Builder - for _, r := range dTag { - if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r > 127 { - result.WriteRune(r) - } - } - dTag = result.String() - - // Collapse multiple consecutive hyphens - for strings.Contains(dTag, "--") { - dTag = strings.ReplaceAll(dTag, "--", "-") - } - - // Remove leading and trailing hyphens - dTag = strings.Trim(dTag, "-") - - return dTag -} - -// convertToHTML converts AsciiDoc to HTML using asciidoctor.js via Node.js -func (p *Processor) convertToHTML(asciidocContent string) (string, error) { +func (p *Processor) Process(content string) (*ProcessResult, error) { // Check if node is available cmd := exec.Command("node", "--version") if err := cmd.Run(); err != nil { - return "", fmt.Errorf("node.js not found: %w", err) + return nil, fmt.Errorf("node.js not found: %w", err) } - // JavaScript code to run asciidoctor.js - // Read content from stdin to handle special characters properly - jsCode := ` - const asciidoctor = require('@asciidoctor/core')(); - - let content = ''; - process.stdin.setEncoding('utf8'); - - process.stdin.on('data', (chunk) => { - content += chunk; - }); - - process.stdin.on('end', () => { - try { - const html = asciidoctor.convert(content, { - safe: 
'safe', - backend: 'html5', - doctype: 'article', - attributes: { - 'showtitle': true, - 'icons': 'font', - 'sectanchors': true, - 'sectlinks': true, - 'toc': 'left', - 'toclevels': 3 - } - }); - process.stdout.write(html); - } catch (error) { - console.error('Error converting AsciiDoc:', error.message); - process.exit(1); - } - }); - ` + // Run gc-parser script + cmd = exec.Command("node", p.scriptPath, p.linkBaseURL) + cmd.Stdin = strings.NewReader(content) - // Run node with the JavaScript code, passing content via stdin - cmd = exec.Command("node", "-e", jsCode) - cmd.Stdin = strings.NewReader(asciidocContent) - - var stdout, stderr bytes.Buffer + var stdout, stderr strings.Builder cmd.Stdout = &stdout cmd.Stderr = &stderr if err := cmd.Run(); err != nil { - return "", fmt.Errorf("asciidoctor.js conversion failed: %w, stderr: %s", err, stderr.String()) + return nil, fmt.Errorf("gc-parser failed: %w, stderr: %s", err, stderr.String()) } - return stdout.String(), nil -} - -// sanitizeHTML performs basic HTML sanitization to prevent XSS -// Note: This is a basic implementation. For production, consider using a proper HTML sanitizer library -func (p *Processor) sanitizeHTML(html string) string { - // Remove script tags and their content - scriptRegex := regexp.MustCompile(`(?i)]*>.*?`) - html = scriptRegex.ReplaceAllString(html, "") - - // Remove event handlers (onclick, onerror, etc.) 
- eventHandlerRegex := regexp.MustCompile(`(?i)\s*on\w+\s*=\s*["'][^"']*["']`) - html = eventHandlerRegex.ReplaceAllString(html, "") - - // Remove javascript: protocol in links - javascriptRegex := regexp.MustCompile(`(?i)javascript:`) - html = javascriptRegex.ReplaceAllString(html, "") - - // Remove data: URLs that could be dangerous - dataURLRegex := regexp.MustCompile(`(?i)data:\s*text/html`) - html = dataURLRegex.ReplaceAllString(html, "") - - return html -} - -// extractTOC extracts the table of contents from AsciiDoc HTML output -// Returns the TOC HTML and the content HTML without the TOC -func (p *Processor) extractTOC(html string) (string, string) { - // AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc" - // We need to match the entire TOC div including nested content - // Since divs can be nested, we need to count opening/closing tags - - var tocContent string - contentWithoutTOC := html - - // Find the start of the TOC div - try multiple patterns - tocStartPatterns := []*regexp.Regexp{ - // Pattern 1:
- regexp.MustCompile(`(?i)]*>`), - // Pattern 2:
- regexp.MustCompile(`(?i)]*>`), - // Pattern 3:
- regexp.MustCompile(`(?i)]*>`), - // Pattern 4:
or - if strings.HasSuffix(tocFullHTML, "
") { - innerEnd -= 6 - } else if strings.HasSuffix(tocFullHTML, "") { - innerEnd -= 7 - } - tocContent = strings.TrimSpace(tocFullHTML[innerStart:innerEnd]) - - // Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title) - toctitlePattern := regexp.MustCompile(`(?s)]*>.*?
\s*`) - tocContent = toctitlePattern.ReplaceAllString(tocContent, "") - tocContent = strings.TrimSpace(tocContent) - - // Remove the TOC from the content - contentWithoutTOC = html[:tocStartIdx] + html[tocEndIdx:] - } - - return tocContent, contentWithoutTOC -} - -// processLinks processes HTML links to add target="_blank" to external links -// External links are those that start with http:// or https:// and don't point to the linkBaseURL domain -// Local links (including relative links and links to linkBaseURL) open in the same tab -func (p *Processor) processLinks(html string) string { - // Extract domain from linkBaseURL for comparison - linkBaseDomain := "" - if strings.HasPrefix(p.linkBaseURL, "http://") || strings.HasPrefix(p.linkBaseURL, "https://") { - // Extract domain (e.g., "alexandria.gitcitadel.eu" from "https://alexandria.gitcitadel.eu") - parts := strings.Split(strings.TrimPrefix(strings.TrimPrefix(p.linkBaseURL, "https://"), "http://"), "/") - if len(parts) > 0 { - linkBaseDomain = parts[0] - } - } - - // Regex to match tags with href attributes (more flexible pattern) - linkRegex := regexp.MustCompile(`]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>`) - - html = linkRegex.ReplaceAllStringFunc(html, func(match string) string { - // Extract href value - hrefMatch := regexp.MustCompile(`href\s*=\s*["']([^"']+)["']`) - hrefSubmatch := hrefMatch.FindStringSubmatch(match) - if len(hrefSubmatch) < 2 { - return match // No href found, return as-is - } - href := hrefSubmatch[1] - - // Check if it's an external link (starts with http:// or https://) - isExternal := strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") - - if isExternal { - // Check if it's pointing to our own domain - if linkBaseDomain != "" && strings.Contains(href, linkBaseDomain) { - // Same domain - open in same tab (remove any existing target attribute) - targetRegex := regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`) - match = targetRegex.ReplaceAllString(match, "") - 
return match - } - - // External link - add target="_blank" and rel="noopener noreferrer" if not already present - if !strings.Contains(match, `target=`) { - // Insert before the closing > - match = strings.TrimSuffix(match, ">") - if !strings.Contains(match, `rel=`) { - match += ` target="_blank" rel="noopener noreferrer">` - } else { - // Update existing rel attribute to include noopener if not present - relRegex := regexp.MustCompile(`rel\s*=\s*["']([^"']*)["']`) - match = relRegex.ReplaceAllStringFunc(match, func(relMatch string) string { - relValue := relRegex.FindStringSubmatch(relMatch)[1] - if !strings.Contains(relValue, "noopener") { - relValue += " noopener noreferrer" - } - return `rel="` + strings.TrimSpace(relValue) + `"` - }) - match += ` target="_blank">` - } - } - } else { - // Local/relative link - ensure it opens in same tab (remove target if present) - targetRegex := regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`) - match = targetRegex.ReplaceAllString(match, "") - } - - return match - }) - - return html + return &ProcessResult{ + Content: result.Content, + TableOfContents: result.TableOfContents, + HasLaTeX: result.HasLaTeX, + HasMusicalNotation: result.HasMusicalNotation, + }, nil } diff --git a/package.json b/package.json index 49a1589..e525385 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,5 @@ { "dependencies": { - "@asciidoctor/core": "^3.0.4", - "marked": "^12.0.0" + "gc-parser": "file:../gc-parser" } } diff --git a/scripts/process-content.js b/scripts/process-content.js new file mode 100644 index 0000000..b0a5a18 --- /dev/null +++ b/scripts/process-content.js @@ -0,0 +1,45 @@ +#!/usr/bin/env node +/** + * Wrapper script to process content using gc-parser + * Called from Go code via exec + */ + +const { Parser } = require('gc-parser'); + +// Read content from stdin +let content = ''; +process.stdin.setEncoding('utf8'); + +process.stdin.on('data', (chunk) => { + content += chunk; +}); + +process.stdin.on('end', async () => { + 
try { + // Parse options from environment or command line args + const linkBaseURL = process.env.LINK_BASE_URL || process.argv[2] || ''; + + // Create parser with options + const parser = new Parser({ + linkBaseURL: linkBaseURL, + enableAsciiDoc: true, + enableMarkdown: true, + enableCodeHighlighting: true, + enableLaTeX: true, + enableMusicalNotation: true, + enableNostrAddresses: true, + }); + + // Process content + const result = await parser.process(content); + + // Output as JSON + console.log(JSON.stringify(result)); + } catch (error) { + console.error(JSON.stringify({ + error: error.message, + stack: error.stack, + })); + process.exit(1); + } +});