4 changed files with 103 additions and 354 deletions
@ -1,392 +1,90 @@
@@ -1,392 +1,90 @@
|
||||
package asciidoc |
||||
|
||||
import ( |
||||
"bytes" |
||||
"encoding/json" |
||||
"fmt" |
||||
"os/exec" |
||||
"regexp" |
||||
"path/filepath" |
||||
"strings" |
||||
) |
||||
|
||||
// Processor handles AsciiDoc to HTML conversion
|
||||
// Processor handles content processing using gc-parser
|
||||
type Processor struct { |
||||
// linkBaseURL is the base URL supplied to NewProcessor; Process forwards it
// to the Node.js script as its first command-line argument.
linkBaseURL string |
||||
// scriptPath is the path of the Node.js wrapper script that Process runs.
// NewProcessor sets it to scripts/process-content.js.
scriptPath string |
||||
} |
||||
|
||||
// ProcessResult contains the processed HTML content and extracted table of contents
|
||||
type ProcessResult struct { |
||||
// Content is the rendered HTML body.
Content string |
||||
// TableOfContents is the table-of-contents HTML, kept separate from Content.
TableOfContents string |
||||
// HasLaTeX is copied from the hasLaTeX flag in the gc-parser output.
HasLaTeX bool |
||||
// HasMusicalNotation is copied from the hasMusicalNotation flag in the
// gc-parser output.
HasMusicalNotation bool |
||||
} |
||||
|
||||
// NewProcessor creates a new AsciiDoc processor
|
||||
// gcParserResult matches the JSON output from gc-parser
|
||||
type gcParserResult struct { |
||||
// Content is copied into ProcessResult.Content.
Content string `json:"content"` |
||||
// TableOfContents is copied into ProcessResult.TableOfContents.
TableOfContents string `json:"tableOfContents"` |
||||
// HasLaTeX is copied into ProcessResult.HasLaTeX.
HasLaTeX bool `json:"hasLaTeX"` |
||||
// HasMusicalNotation is copied into ProcessResult.HasMusicalNotation.
HasMusicalNotation bool `json:"hasMusicalNotation"` |
||||
// NostrLinks, Wikilinks, Hashtags, Links and Media are decoded from the JSON
// but are not copied into ProcessResult.
// NOTE(review): element shapes are not visible here, hence []interface{}.
NostrLinks []interface{} `json:"nostrLinks"` |
||||
Wikilinks []interface{} `json:"wikilinks"` |
||||
Hashtags []string `json:"hashtags"` |
||||
Links []interface{} `json:"links"` |
||||
Media []string `json:"media"` |
||||
// Error, when non-empty, makes Process return a "gc-parser error" error.
Error string `json:"error,omitempty"` |
||||
} |
||||
|
||||
// NewProcessor creates a new content processor using gc-parser
|
||||
func NewProcessor(linkBaseURL string) *Processor { |
||||
// The script path is the relative path scripts/process-content.js.
|
||||
// NOTE(review): nothing here anchors it to the executable's directory, so it
// is presumably resolved against the working directory when node is started;
// confirm the deployment layout matches.
|
||||
scriptPath := filepath.Join("scripts", "process-content.js") |
||||
|
||||
return &Processor{ |
||||
linkBaseURL: linkBaseURL, |
||||
scriptPath: scriptPath, |
||||
} |
||||
} |
||||
|
||||
// Process converts AsciiDoc content to HTML with link rewriting
|
||||
// Process converts content (AsciiDoc, Markdown, etc.) to HTML using gc-parser
|
||||
// Returns both the content HTML and the extracted table of contents
|
||||
func (p *Processor) Process(asciidocContent string) (*ProcessResult, error) { |
||||
// First, rewrite links in the AsciiDoc content
|
||||
processedContent := p.rewriteLinks(asciidocContent) |
||||
|
||||
// Convert AsciiDoc to HTML using asciidoctor CLI
|
||||
html, err := p.convertToHTML(processedContent) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err) |
||||
} |
||||
|
||||
// Extract table of contents from HTML
|
||||
toc, contentWithoutTOC := p.extractTOC(html) |
||||
|
||||
// Sanitize HTML to prevent XSS
|
||||
sanitized := p.sanitizeHTML(contentWithoutTOC) |
||||
|
||||
// Process links: make external links open in new tab, local links in same tab
|
||||
processed := p.processLinks(sanitized) |
||||
|
||||
// Also sanitize and process links in TOC
|
||||
tocSanitized := p.sanitizeHTML(toc) |
||||
tocProcessed := p.processLinks(tocSanitized) |
||||
|
||||
return &ProcessResult{ |
||||
Content: processed, |
||||
TableOfContents: tocProcessed, |
||||
}, nil |
||||
} |
||||
|
||||
// rewriteLinks rewrites wikilinks and nostr: links in AsciiDoc content
|
||||
func (p *Processor) rewriteLinks(content string) string { |
||||
// Rewrite wikilinks: [[target]] or [[target|display text]]
|
||||
// Format: [[target]] -> https://alexandria.gitcitadel.eu/events?d=<normalized-d-tag>
|
||||
wikilinkRegex := regexp.MustCompile(`\[\[([^\]]+)\]\]`) |
||||
content = wikilinkRegex.ReplaceAllStringFunc(content, func(match string) string { |
||||
// Extract the content inside [[ ]]
|
||||
inner := match[2 : len(match)-2] |
||||
|
||||
var target, display string |
||||
if strings.Contains(inner, "|") { |
||||
parts := strings.SplitN(inner, "|", 2) |
||||
target = strings.TrimSpace(parts[0]) |
||||
display = strings.TrimSpace(parts[1]) |
||||
} else { |
||||
target = strings.TrimSpace(inner) |
||||
display = target |
||||
} |
||||
|
||||
// Normalize the d tag (convert to lowercase, replace spaces with hyphens, etc.)
|
||||
normalized := normalizeDTag(target) |
||||
|
||||
// Create the link
|
||||
url := fmt.Sprintf("%s/events?d=%s", p.linkBaseURL, normalized) |
||||
return fmt.Sprintf("link:%s[%s]", url, display) |
||||
}) |
||||
|
||||
// Rewrite nostr: links: nostr:naddr1... or nostr:nevent1...
|
||||
// Format: nostr:naddr1... -> https://alexandria.gitcitadel.eu/events?id=naddr1...
|
||||
nostrLinkRegex := regexp.MustCompile(`nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+)`) |
||||
content = nostrLinkRegex.ReplaceAllStringFunc(content, func(match string) string { |
||||
nostrID := strings.TrimPrefix(match, "nostr:") |
||||
url := fmt.Sprintf("%s/events?id=%s", p.linkBaseURL, nostrID) |
||||
return url |
||||
}) |
||||
|
||||
return content |
||||
} |
||||
|
||||
// normalizeDTag normalizes a d tag according to NIP-54 rules.
//
// The tag is lowercased, every run of whitespace becomes a single hyphen,
// ASCII punctuation and symbols are dropped, consecutive hyphens are collapsed
// and leading/trailing hyphens are removed. ASCII letters and digits and all
// remaining non-ASCII runes are kept.
//
// Whitespace is recognized with strings.Fields, so carriage returns, form
// feeds, vertical tabs and non-ASCII spaces such as U+00A0 separate words
// just like a plain space does, instead of being deleted or kept verbatim.
func normalizeDTag(dTag string) string {
	joined := strings.Join(strings.Fields(strings.ToLower(dTag)), "-")

	var b strings.Builder
	b.Grow(len(joined))

	// Starting in the "just wrote a hyphen" state suppresses leading hyphens
	// and lets the same check collapse hyphen runs that form once punctuation
	// between them has been dropped.
	lastWasHyphen := true
	for _, r := range joined {
		switch {
		case r == '-':
			if !lastWasHyphen {
				b.WriteByte('-')
				lastWasHyphen = true
			}
		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r > 127:
			b.WriteRune(r)
			lastWasHyphen = false
		}
	}

	// Runs are already collapsed, so at most one trailing hyphen can remain.
	return strings.TrimSuffix(b.String(), "-")
}
||||
|
||||
// convertToHTML converts AsciiDoc to HTML using asciidoctor.js via Node.js
|
||||
func (p *Processor) convertToHTML(asciidocContent string) (string, error) { |
||||
func (p *Processor) Process(content string) (*ProcessResult, error) { |
||||
// Check if node is available
|
||||
cmd := exec.Command("node", "--version") |
||||
if err := cmd.Run(); err != nil { |
||||
return "", fmt.Errorf("node.js not found: %w", err) |
||||
return nil, fmt.Errorf("node.js not found: %w", err) |
||||
} |
||||
|
||||
// JavaScript code to run asciidoctor.js
|
||||
// Read content from stdin to handle special characters properly
|
||||
jsCode := ` |
||||
const asciidoctor = require('@asciidoctor/core')(); |
||||
|
||||
let content = ''; |
||||
process.stdin.setEncoding('utf8'); |
||||
// Run gc-parser script
|
||||
cmd = exec.Command("node", p.scriptPath, p.linkBaseURL) |
||||
cmd.Stdin = strings.NewReader(content) |
||||
|
||||
process.stdin.on('data', (chunk) => { |
||||
content += chunk; |
||||
}); |
||||
|
||||
process.stdin.on('end', () => { |
||||
try { |
||||
const html = asciidoctor.convert(content, { |
||||
safe: 'safe', |
||||
backend: 'html5', |
||||
doctype: 'article', |
||||
attributes: { |
||||
'showtitle': true, |
||||
'icons': 'font', |
||||
'sectanchors': true, |
||||
'sectlinks': true, |
||||
'toc': 'left', |
||||
'toclevels': 3 |
||||
} |
||||
}); |
||||
process.stdout.write(html); |
||||
} catch (error) { |
||||
console.error('Error converting AsciiDoc:', error.message); |
||||
process.exit(1); |
||||
} |
||||
}); |
||||
` |
||||
|
||||
// Run node with the JavaScript code, passing content via stdin
|
||||
cmd = exec.Command("node", "-e", jsCode) |
||||
cmd.Stdin = strings.NewReader(asciidocContent) |
||||
|
||||
var stdout, stderr bytes.Buffer |
||||
var stdout, stderr strings.Builder |
||||
cmd.Stdout = &stdout |
||||
cmd.Stderr = &stderr |
||||
|
||||
if err := cmd.Run(); err != nil { |
||||
return "", fmt.Errorf("asciidoctor.js conversion failed: %w, stderr: %s", err, stderr.String()) |
||||
} |
||||
|
||||
return stdout.String(), nil |
||||
} |
||||
|
||||
// sanitizeHTML performs basic HTML sanitization to prevent XSS
|
||||
// Note: This is a basic implementation. For production, consider using a proper HTML sanitizer library
|
||||
func (p *Processor) sanitizeHTML(html string) string { |
||||
// Remove script tags and their content
|
||||
scriptRegex := regexp.MustCompile(`(?i)<script[^>]*>.*?</script>`) |
||||
html = scriptRegex.ReplaceAllString(html, "") |
||||
|
||||
// Remove event handlers (onclick, onerror, etc.)
|
||||
eventHandlerRegex := regexp.MustCompile(`(?i)\s*on\w+\s*=\s*["'][^"']*["']`) |
||||
html = eventHandlerRegex.ReplaceAllString(html, "") |
||||
|
||||
// Remove javascript: protocol in links
|
||||
javascriptRegex := regexp.MustCompile(`(?i)javascript:`) |
||||
html = javascriptRegex.ReplaceAllString(html, "") |
||||
|
||||
// Remove data: URLs that could be dangerous
|
||||
dataURLRegex := regexp.MustCompile(`(?i)data:\s*text/html`) |
||||
html = dataURLRegex.ReplaceAllString(html, "") |
||||
|
||||
return html |
||||
} |
||||
|
||||
// extractTOC extracts the table of contents from AsciiDoc HTML output
|
||||
// Returns the TOC HTML and the content HTML without the TOC
|
||||
func (p *Processor) extractTOC(html string) (string, string) { |
||||
// AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
|
||||
// We need to match the entire TOC div including nested content
|
||||
// Since divs can be nested, we need to count opening/closing tags
|
||||
|
||||
var tocContent string |
||||
contentWithoutTOC := html |
||||
|
||||
// Find the start of the TOC div - try multiple patterns
|
||||
tocStartPatterns := []*regexp.Regexp{ |
||||
// Pattern 1: <div id="toc" class="toc">
|
||||
regexp.MustCompile(`(?i)<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>`), |
||||
// Pattern 2: <div id="toc">
|
||||
regexp.MustCompile(`(?i)<div\s+id=["']toc["'][^>]*>`), |
||||
// Pattern 3: <div class="toc">
|
||||
regexp.MustCompile(`(?i)<div\s+class=["']toc["'][^>]*>`), |
||||
// Pattern 4: <nav id="toc">
|
||||
regexp.MustCompile(`(?i)<nav\s+id=["']toc["'][^>]*>`), |
||||
} |
||||
|
||||
var tocStartIdx int = -1 |
||||
var tocStartTag string |
||||
|
||||
for _, pattern := range tocStartPatterns { |
||||
loc := pattern.FindStringIndex(html) |
||||
if loc != nil { |
||||
tocStartIdx = loc[0] |
||||
tocStartTag = html[loc[0]:loc[1]] |
||||
break |
||||
} |
||||
return nil, fmt.Errorf("gc-parser failed: %w, stderr: %s", err, stderr.String()) |
||||
} |
||||
|
||||
if tocStartIdx == -1 { |
||||
// No TOC found
|
||||
return "", html |
||||
// Parse JSON output
|
||||
var result gcParserResult |
||||
output := stdout.String() |
||||
if err := json.Unmarshal([]byte(output), &result); err != nil { |
||||
return nil, fmt.Errorf("failed to parse gc-parser output: %w, output: %s", err, output) |
||||
} |
||||
|
||||
// Find the matching closing tag by counting div tags
|
||||
// Start after the opening tag
|
||||
searchStart := tocStartIdx + len(tocStartTag) |
||||
depth := 1 |
||||
i := searchStart |
||||
|
||||
for i < len(html) && depth > 0 { |
||||
// Look for opening or closing div/nav tags
|
||||
if i+4 < len(html) && html[i:i+4] == "<div" { |
||||
// Check if it's a closing tag
|
||||
if i+5 < len(html) && html[i+4] == '/' { |
||||
depth-- |
||||
// Find the end of this closing tag
|
||||
closeIdx := strings.Index(html[i:], ">") |
||||
if closeIdx == -1 { |
||||
break |
||||
} |
||||
i += closeIdx + 1 |
||||
} else { |
||||
// Opening tag - find the end
|
||||
closeIdx := strings.Index(html[i:], ">") |
||||
if closeIdx == -1 { |
||||
break |
||||
} |
||||
// Check if it's self-closing
|
||||
if html[i+closeIdx-1] != '/' { |
||||
depth++ |
||||
} |
||||
i += closeIdx + 1 |
||||
} |
||||
} else if i+5 < len(html) && html[i:i+5] == "</div" { |
||||
depth-- |
||||
closeIdx := strings.Index(html[i:], ">") |
||||
if closeIdx == -1 { |
||||
break |
||||
} |
||||
i += closeIdx + 1 |
||||
} else if i+5 < len(html) && html[i:i+5] == "</nav" { |
||||
depth-- |
||||
closeIdx := strings.Index(html[i:], ">") |
||||
if closeIdx == -1 { |
||||
break |
||||
// Check for error in result
|
||||
if result.Error != "" { |
||||
return nil, fmt.Errorf("gc-parser error: %s", result.Error) |
||||
} |
||||
i += closeIdx + 1 |
||||
} else { |
||||
i++ |
||||
} |
||||
} |
||||
|
||||
if depth == 0 { |
||||
// Found the matching closing tag
|
||||
tocEndIdx := i |
||||
// Extract the TOC content (inner HTML)
|
||||
tocFullHTML := html[tocStartIdx:tocEndIdx] |
||||
// Extract just the inner content (without the outer div tags)
|
||||
innerStart := len(tocStartTag) |
||||
innerEnd := len(tocFullHTML) |
||||
// Find the last </div> or </nav>
|
||||
if strings.HasSuffix(tocFullHTML, "</div>") { |
||||
innerEnd -= 6 |
||||
} else if strings.HasSuffix(tocFullHTML, "</nav>") { |
||||
innerEnd -= 7 |
||||
} |
||||
tocContent = strings.TrimSpace(tocFullHTML[innerStart:innerEnd]) |
||||
|
||||
// Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
|
||||
toctitlePattern := regexp.MustCompile(`(?s)<div\s+id=["']toctitle["'][^>]*>.*?</div>\s*`) |
||||
tocContent = toctitlePattern.ReplaceAllString(tocContent, "") |
||||
tocContent = strings.TrimSpace(tocContent) |
||||
|
||||
// Remove the TOC from the content
|
||||
contentWithoutTOC = html[:tocStartIdx] + html[tocEndIdx:] |
||||
} |
||||
|
||||
return tocContent, contentWithoutTOC |
||||
} |
||||
|
||||
// processLinks processes HTML links to add target="_blank" to external links
|
||||
// External links are those that start with http:// or https:// and don't point to the linkBaseURL domain
|
||||
// Local links (including relative links and links to linkBaseURL) open in the same tab
|
||||
func (p *Processor) processLinks(html string) string { |
||||
// Extract domain from linkBaseURL for comparison
|
||||
linkBaseDomain := "" |
||||
if strings.HasPrefix(p.linkBaseURL, "http://") || strings.HasPrefix(p.linkBaseURL, "https://") { |
||||
// Extract domain (e.g., "alexandria.gitcitadel.eu" from "https://alexandria.gitcitadel.eu")
|
||||
parts := strings.Split(strings.TrimPrefix(strings.TrimPrefix(p.linkBaseURL, "https://"), "http://"), "/") |
||||
if len(parts) > 0 { |
||||
linkBaseDomain = parts[0] |
||||
} |
||||
} |
||||
|
||||
// Regex to match <a> tags with href attributes (more flexible pattern)
|
||||
linkRegex := regexp.MustCompile(`<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>`) |
||||
|
||||
html = linkRegex.ReplaceAllStringFunc(html, func(match string) string { |
||||
// Extract href value
|
||||
hrefMatch := regexp.MustCompile(`href\s*=\s*["']([^"']+)["']`) |
||||
hrefSubmatch := hrefMatch.FindStringSubmatch(match) |
||||
if len(hrefSubmatch) < 2 { |
||||
return match // No href found, return as-is
|
||||
} |
||||
href := hrefSubmatch[1] |
||||
|
||||
// Check if it's an external link (starts with http:// or https://)
|
||||
isExternal := strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") |
||||
|
||||
if isExternal { |
||||
// Check if it's pointing to our own domain
|
||||
if linkBaseDomain != "" && strings.Contains(href, linkBaseDomain) { |
||||
// Same domain - open in same tab (remove any existing target attribute)
|
||||
targetRegex := regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`) |
||||
match = targetRegex.ReplaceAllString(match, "") |
||||
return match |
||||
} |
||||
|
||||
// External link - add target="_blank" and rel="noopener noreferrer" if not already present
|
||||
if !strings.Contains(match, `target=`) { |
||||
// Insert before the closing >
|
||||
match = strings.TrimSuffix(match, ">") |
||||
if !strings.Contains(match, `rel=`) { |
||||
match += ` target="_blank" rel="noopener noreferrer">` |
||||
} else { |
||||
// Update existing rel attribute to include noopener if not present
|
||||
relRegex := regexp.MustCompile(`rel\s*=\s*["']([^"']*)["']`) |
||||
match = relRegex.ReplaceAllStringFunc(match, func(relMatch string) string { |
||||
relValue := relRegex.FindStringSubmatch(relMatch)[1] |
||||
if !strings.Contains(relValue, "noopener") { |
||||
relValue += " noopener noreferrer" |
||||
} |
||||
return `rel="` + strings.TrimSpace(relValue) + `"` |
||||
}) |
||||
match += ` target="_blank">` |
||||
} |
||||
} |
||||
} else { |
||||
// Local/relative link - ensure it opens in same tab (remove target if present)
|
||||
targetRegex := regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`) |
||||
match = targetRegex.ReplaceAllString(match, "") |
||||
} |
||||
|
||||
return match |
||||
}) |
||||
|
||||
return html |
||||
return &ProcessResult{ |
||||
Content: result.Content, |
||||
TableOfContents: result.TableOfContents, |
||||
HasLaTeX: result.HasLaTeX, |
||||
HasMusicalNotation: result.HasMusicalNotation, |
||||
}, nil |
||||
} |
||||
|
||||
@ -1,6 +1,5 @@
@@ -1,6 +1,5 @@
|
||||
{ |
||||
"dependencies": { |
||||
"@asciidoctor/core": "^3.0.4", |
||||
"marked": "^12.0.0" |
||||
"gc-parser": "file:../gc-parser" |
||||
} |
||||
} |
||||
|
||||
@ -0,0 +1,45 @@
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env node
/**
 * Wrapper script to process content using gc-parser.
 * Called from Go code via exec.
 *
 * Input:  the raw content on stdin (UTF-8).
 * Option: the link base URL, taken from the LINK_BASE_URL environment
 *         variable or, failing that, the first command-line argument.
 * Output: the parser result as one JSON document on stdout. On failure a JSON
 *         object with `error` and `stack` is written to stderr and the exit
 *         status is 1.
 */

const { Parser } = require('gc-parser');

/**
 * Report a failure as JSON on stderr and mark the process as failed.
 *
 * process.exitCode is set instead of calling process.exit(): exit() ends the
 * process immediately and can drop output that is still queued when stderr is
 * an asynchronous pipe, which would leave the caller with a truncated message.
 */
function fail(error) {
  const err = error instanceof Error ? error : new Error(String(error));
  console.error(JSON.stringify({
    error: err.message,
    stack: err.stack,
  }));
  process.exitCode = 1;
}

// Read content from stdin
let content = '';
process.stdin.setEncoding('utf8');

process.stdin.on('data', (chunk) => {
  content += chunk;
});

// A broken stdin would otherwise surface as an unhandled 'error' event.
process.stdin.on('error', fail);

process.stdin.on('end', async () => {
  try {
    // Parse options from environment or command line args
    const linkBaseURL = process.env.LINK_BASE_URL || process.argv[2] || '';

    // Create parser with options
    const parser = new Parser({
      linkBaseURL: linkBaseURL,
      enableAsciiDoc: true,
      enableMarkdown: true,
      enableCodeHighlighting: true,
      enableLaTeX: true,
      enableMusicalNotation: true,
      enableNostrAddresses: true,
    });

    // Process content
    const result = await parser.process(content);

    // Output as JSON
    console.log(JSON.stringify(result));
  } catch (error) {
    fail(error);
  }
});
||||
Loading…
Reference in new issue