You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

392 lines
12 KiB

package asciidoc
import (
	"bytes"
	"fmt"
	"net/url"
	"os/exec"
	"regexp"
	"strings"
	"unicode"
)
// Processor renders AsciiDoc into sanitized HTML, rewriting wiki-style
// and nostr: links so they resolve against a configured base URL.
type Processor struct {
	linkBaseURL string // root URL used when rewriting [[wikilinks]] and nostr: references
}

// ProcessResult carries the two halves of a rendered document: the body
// HTML and the table of contents extracted from it.
type ProcessResult struct {
	Content         string
	TableOfContents string
}

// NewProcessor returns a Processor whose rewritten links are rooted at
// linkBaseURL.
func NewProcessor(linkBaseURL string) *Processor {
	p := &Processor{linkBaseURL: linkBaseURL}
	return p
}
// Process converts AsciiDoc content to HTML with link rewriting.
// It returns the rendered body HTML together with the table of contents
// extracted from it.
func (p *Processor) Process(asciidocContent string) (*ProcessResult, error) {
	// Rewrite wikilinks and nostr: references before handing the source
	// to asciidoctor, so they render as ordinary hyperlinks.
	rewritten := p.rewriteLinks(asciidocContent)

	rawHTML, err := p.convertToHTML(rewritten)
	if err != nil {
		return nil, fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err)
	}

	// Split the TOC out of the rendered document, then run both halves
	// through the same post-processing: XSS sanitization followed by
	// link normalization (external links open in a new tab, local links
	// stay in the same tab).
	toc, body := p.extractTOC(rawHTML)

	result := &ProcessResult{
		Content:         p.processLinks(p.sanitizeHTML(body)),
		TableOfContents: p.processLinks(p.sanitizeHTML(toc)),
	}
	return result, nil
}
// Link-rewriting patterns, compiled once at package scope so rewriteLinks
// does not pay a regexp compile on every call.
var (
	// [[target]] or [[target|display text]]
	wikilinkRegex = regexp.MustCompile(`\[\[([^\]]+)\]\]`)
	// nostr:naddr1... or nostr:nevent1...
	nostrLinkRegex = regexp.MustCompile(`nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+)`)
)

// rewriteLinks rewrites wikilinks and nostr: links in AsciiDoc content.
//
// [[target]] / [[target|display]] become AsciiDoc link: macros pointing at
// <linkBaseURL>/events?d=<normalized-d-tag>; nostr:naddr1.../nostr:nevent1...
// references become bare <linkBaseURL>/events?id=<id> URLs (AsciiDoc
// auto-links bare URLs).
func (p *Processor) rewriteLinks(content string) string {
	content = wikilinkRegex.ReplaceAllStringFunc(content, func(match string) string {
		// Strip the surrounding [[ ]] delimiters.
		inner := match[2 : len(match)-2]
		var target, display string
		if strings.Contains(inner, "|") {
			parts := strings.SplitN(inner, "|", 2)
			target = strings.TrimSpace(parts[0])
			display = strings.TrimSpace(parts[1])
		} else {
			target = strings.TrimSpace(inner)
			display = target
		}
		// Normalize the d tag (lowercase, whitespace to hyphens, etc.).
		normalized := normalizeDTag(target)
		link := fmt.Sprintf("%s/events?d=%s", p.linkBaseURL, normalized)
		return fmt.Sprintf("link:%s[%s]", link, display)
	})
	content = nostrLinkRegex.ReplaceAllStringFunc(content, func(match string) string {
		nostrID := strings.TrimPrefix(match, "nostr:")
		return fmt.Sprintf("%s/events?id=%s", p.linkBaseURL, nostrID)
	})
	return content
}
// normalizeDTag normalizes a d tag according to NIP-54 rules:
// lowercase, whitespace converted to hyphens, punctuation/symbols removed
// (alphanumerics, hyphens, and non-ASCII runes are kept), consecutive
// hyphens collapsed, and leading/trailing hyphens trimmed.
//
// Fix over the previous version: ALL Unicode whitespace (including '\r',
// which matters for Windows line endings) is converted to a hyphen; before,
// only ' ', '\t', and '\n' were converted and other whitespace was dropped.
func normalizeDTag(dTag string) string {
	dTag = strings.ToLower(dTag)

	// Single pass: emit hyphens for whitespace/hyphen runs (collapsed as we
	// go), keep allowed runes, and drop everything else.
	var b strings.Builder
	b.Grow(len(dTag))
	prevHyphen := false
	for _, r := range dTag {
		switch {
		case unicode.IsSpace(r) || r == '-':
			if !prevHyphen {
				b.WriteRune('-')
				prevHyphen = true
			}
		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r > 127:
			b.WriteRune(r)
			prevHyphen = false
		}
		// Any other rune (ASCII punctuation/symbols) is silently dropped.
	}

	// Remove leading and trailing hyphens.
	return strings.Trim(b.String(), "-")
}
// convertToHTML converts AsciiDoc to HTML by running asciidoctor.js under
// Node.js (@asciidoctor/core must be resolvable by node). The AsciiDoc
// source is fed via stdin so special characters never need command-line
// escaping; the rendered HTML is read from stdout.
func (p *Processor) convertToHTML(asciidocContent string) (string, error) {
	// Resolve the node binary up front. LookPath gives a clear "not found"
	// error without spawning a throwaway `node --version` process, which the
	// previous implementation did on every conversion.
	nodePath, err := exec.LookPath("node")
	if err != nil {
		return "", fmt.Errorf("node.js not found: %w", err)
	}

	// JavaScript driver: buffer all of stdin, convert, write HTML to stdout.
	// Any conversion error goes to stderr with a non-zero exit code.
	jsCode := `
const asciidoctor = require('@asciidoctor/core')();
let content = '';
process.stdin.setEncoding('utf8');
process.stdin.on('data', (chunk) => {
  content += chunk;
});
process.stdin.on('end', () => {
  try {
    const html = asciidoctor.convert(content, {
      safe: 'safe',
      backend: 'html5',
      doctype: 'article',
      attributes: {
        'showtitle': true,
        'icons': 'font',
        'sectanchors': true,
        'sectlinks': true,
        'toc': 'left',
        'toclevels': 3
      }
    });
    process.stdout.write(html);
  } catch (error) {
    console.error('Error converting AsciiDoc:', error.message);
    process.exit(1);
  }
});
`
	cmd := exec.Command(nodePath, "-e", jsCode)
	cmd.Stdin = strings.NewReader(asciidocContent)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("asciidoctor.js conversion failed: %w, stderr: %s", err, stderr.String())
	}
	return stdout.String(), nil
}
// Sanitization patterns, compiled once at package scope (the previous
// version recompiled them on every call).
var (
	// (?s) is the critical fix: without it `.` does not match newlines, so a
	// <script> element whose body spanned multiple lines survived
	// sanitization entirely — a real XSS hole.
	scriptTagRegex = regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)
	// Inline event handlers such as onclick="..." / onerror='...'.
	eventHandlerAttrRegex = regexp.MustCompile(`(?i)\s*on\w+\s*=\s*["'][^"']*["']`)
	// javascript: URL scheme.
	javascriptSchemeRegex = regexp.MustCompile(`(?i)javascript:`)
	// data:text/html URLs, which can smuggle executable markup.
	dataHTMLURLRegex = regexp.MustCompile(`(?i)data:\s*text/html`)
)

// sanitizeHTML performs basic HTML sanitization to prevent XSS.
// Note: this is a basic, regex-based implementation. For production,
// consider a proper HTML sanitizer library (e.g. bluemonday).
func (p *Processor) sanitizeHTML(html string) string {
	html = scriptTagRegex.ReplaceAllString(html, "")
	html = eventHandlerAttrRegex.ReplaceAllString(html, "")
	html = javascriptSchemeRegex.ReplaceAllString(html, "")
	html = dataHTMLURLRegex.ReplaceAllString(html, "")
	return html
}
// extractTOC extracts the table of contents from AsciiDoc HTML output.
// It returns the TOC inner HTML and the content HTML with the TOC removed;
// when no TOC is found it returns "" and the input unchanged.
//
// Asciidoctor with toc=left emits the TOC in a <div id="toc" class="toc">
// (occasionally a <nav>). Because the TOC itself contains nested <div>s, a
// single regex cannot locate the matching close tag, so after finding the
// opening tag we scan forward counting tag depth.
func (p *Processor) extractTOC(html string) (string, string) {
	var tocContent string
	contentWithoutTOC := html

	// Candidate opening tags for the TOC container, most specific first.
	tocStartPatterns := []*regexp.Regexp{
		// Pattern 1: <div id="toc" class="toc">
		regexp.MustCompile(`(?i)<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>`),
		// Pattern 2: <div id="toc">
		regexp.MustCompile(`(?i)<div\s+id=["']toc["'][^>]*>`),
		// Pattern 3: <div class="toc">
		regexp.MustCompile(`(?i)<div\s+class=["']toc["'][^>]*>`),
		// Pattern 4: <nav id="toc">
		regexp.MustCompile(`(?i)<nav\s+id=["']toc["'][^>]*>`),
	}
	var tocStartIdx int = -1
	var tocStartTag string
	for _, pattern := range tocStartPatterns {
		loc := pattern.FindStringIndex(html)
		if loc != nil {
			tocStartIdx = loc[0]
			tocStartTag = html[loc[0]:loc[1]]
			break
		}
	}
	if tocStartIdx == -1 {
		// No TOC found.
		return "", html
	}

	// Walk forward from just after the opening tag, tracking <div> nesting
	// depth; depth hitting zero means we consumed the container's own close.
	searchStart := tocStartIdx + len(tocStartTag)
	depth := 1
	i := searchStart
	for i < len(html) && depth > 0 {
		if i+4 < len(html) && html[i:i+4] == "<div" {
			if i+5 < len(html) && html[i+4] == '/' {
				depth--
				// Skip past the end of this closing tag.
				closeIdx := strings.Index(html[i:], ">")
				if closeIdx == -1 {
					break
				}
				i += closeIdx + 1
			} else {
				// Opening tag — find its '>'.
				closeIdx := strings.Index(html[i:], ">")
				if closeIdx == -1 {
					break
				}
				// Self-closing <div .../> does not increase nesting depth.
				if html[i+closeIdx-1] != '/' {
					depth++
				}
				i += closeIdx + 1
			}
		} else if i+5 < len(html) && html[i:i+5] == "</div" {
			depth--
			closeIdx := strings.Index(html[i:], ">")
			if closeIdx == -1 {
				break
			}
			i += closeIdx + 1
		} else if i+5 < len(html) && html[i:i+5] == "</nav" {
			depth--
			closeIdx := strings.Index(html[i:], ">")
			if closeIdx == -1 {
				break
			}
			i += closeIdx + 1
		} else {
			i++
		}
	}

	if depth == 0 {
		// i now sits just past the container's closing '>'.
		tocEndIdx := i
		tocFullHTML := html[tocStartIdx:tocEndIdx]
		// Strip the outer container tags, keeping only the inner HTML.
		innerStart := len(tocStartTag)
		innerEnd := len(tocFullHTML)
		if strings.HasSuffix(tocFullHTML, "</div>") {
			innerEnd -= len("</div>")
		} else if strings.HasSuffix(tocFullHTML, "</nav>") {
			// BUG FIX: this previously subtracted 7, but len("</nav>") is 6,
			// so the last character of the TOC content was chopped off.
			innerEnd -= len("</nav>")
		}
		tocContent = strings.TrimSpace(tocFullHTML[innerStart:innerEnd])
		// Remove Asciidoctor's "Table of Contents" title block if present.
		toctitlePattern := regexp.MustCompile(`(?s)<div\s+id=["']toctitle["'][^>]*>.*?</div>\s*`)
		tocContent = toctitlePattern.ReplaceAllString(tocContent, "")
		tocContent = strings.TrimSpace(tocContent)
		// Remove the TOC span from the document body.
		contentWithoutTOC = html[:tocStartIdx] + html[tocEndIdx:]
	}
	return tocContent, contentWithoutTOC
}
// Anchor-rewriting patterns, compiled once at package scope. The previous
// implementation recompiled the href/target/rel patterns inside the replace
// callback for EVERY anchor tag in the document.
var (
	anchorTagRegex  = regexp.MustCompile(`<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>`)
	hrefAttrRegex   = regexp.MustCompile(`href\s*=\s*["']([^"']+)["']`)
	targetAttrRegex = regexp.MustCompile(`\s*target\s*=\s*["'][^"']*["']`)
	relAttrRegex    = regexp.MustCompile(`rel\s*=\s*["']([^"']*)["']`)
)

// processLinks processes HTML anchor tags so that external links open in a
// new tab (target="_blank" rel="noopener noreferrer") while local links —
// relative URLs and absolute URLs whose host equals the linkBaseURL host —
// open in the same tab (any target attribute is stripped).
func (p *Processor) processLinks(html string) string {
	// Host of linkBaseURL (e.g. "alexandria.gitcitadel.eu"); absolute links
	// to this exact host count as local.
	linkBaseHost := ""
	if base, err := url.Parse(p.linkBaseURL); err == nil {
		linkBaseHost = base.Hostname()
	}

	html = anchorTagRegex.ReplaceAllStringFunc(html, func(match string) string {
		hrefSubmatch := hrefAttrRegex.FindStringSubmatch(match)
		if len(hrefSubmatch) < 2 {
			return match // no href found, return as-is
		}
		href := hrefSubmatch[1]

		isExternal := strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://")
		if !isExternal {
			// Local/relative link — strip any target so it opens in the same tab.
			return targetAttrRegex.ReplaceAllString(match, "")
		}

		// Compare hosts exactly. FIX: the previous substring check
		// (strings.Contains(href, domain)) treated any URL merely MENTIONING
		// our domain — e.g. https://evil.com/?x=<ourdomain> — as internal.
		if u, err := url.Parse(href); err == nil && linkBaseHost != "" && u.Hostname() == linkBaseHost {
			return targetAttrRegex.ReplaceAllString(match, "")
		}

		// External link — add target="_blank" and noopener noreferrer unless
		// a target attribute is already present.
		if strings.Contains(match, `target=`) {
			return match
		}
		match = strings.TrimSuffix(match, ">")
		if !strings.Contains(match, `rel=`) {
			return match + ` target="_blank" rel="noopener noreferrer">`
		}
		// Augment an existing rel attribute with noopener/noreferrer.
		match = relAttrRegex.ReplaceAllStringFunc(match, func(relMatch string) string {
			relValue := relAttrRegex.FindStringSubmatch(relMatch)[1]
			if !strings.Contains(relValue, "noopener") {
				relValue += " noopener noreferrer"
			}
			return `rel="` + strings.TrimSpace(relValue) + `"`
		})
		return match + ` target="_blank">`
	})
	return html
}