|
|
|
|
@ -13,6 +13,12 @@ type Processor struct {
@@ -13,6 +13,12 @@ type Processor struct {
|
|
|
|
|
linkBaseURL string |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// ProcessResult contains the processed HTML content and the extracted table
// of contents produced by Processor.Process.
type ProcessResult struct {
	// Content is the document HTML after the table of contents has been
	// removed and the remainder has been sanitized and link-processed.
	Content string

	// TableOfContents is the extracted table-of-contents HTML after it has
	// been sanitized and link-processed.
	TableOfContents string
}
|
|
|
|
|
|
|
|
|
// NewProcessor creates a new AsciiDoc processor
|
|
|
|
|
func NewProcessor(linkBaseURL string) *Processor { |
|
|
|
|
return &Processor{ |
|
|
|
|
@ -21,23 +27,34 @@ func NewProcessor(linkBaseURL string) *Processor {
@@ -21,23 +27,34 @@ func NewProcessor(linkBaseURL string) *Processor {
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Process converts AsciiDoc content to HTML with link rewriting
|
|
|
|
|
func (p *Processor) Process(asciidocContent string) (string, error) { |
|
|
|
|
// Returns both the content HTML and the extracted table of contents
|
|
|
|
|
func (p *Processor) Process(asciidocContent string) (*ProcessResult, error) { |
|
|
|
|
// First, rewrite links in the AsciiDoc content
|
|
|
|
|
processedContent := p.rewriteLinks(asciidocContent) |
|
|
|
|
|
|
|
|
|
// Convert AsciiDoc to HTML using asciidoctor CLI
|
|
|
|
|
html, err := p.convertToHTML(processedContent) |
|
|
|
|
if err != nil { |
|
|
|
|
return "", fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err) |
|
|
|
|
return nil, fmt.Errorf("failed to convert AsciiDoc to HTML: %w", err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Extract table of contents from HTML
|
|
|
|
|
toc, contentWithoutTOC := p.extractTOC(html) |
|
|
|
|
|
|
|
|
|
// Sanitize HTML to prevent XSS
|
|
|
|
|
sanitized := p.sanitizeHTML(html) |
|
|
|
|
sanitized := p.sanitizeHTML(contentWithoutTOC) |
|
|
|
|
|
|
|
|
|
// Process links: make external links open in new tab, local links in same tab
|
|
|
|
|
processed := p.processLinks(sanitized) |
|
|
|
|
|
|
|
|
|
return processed, nil |
|
|
|
|
// Also sanitize and process links in TOC
|
|
|
|
|
tocSanitized := p.sanitizeHTML(toc) |
|
|
|
|
tocProcessed := p.processLinks(tocSanitized) |
|
|
|
|
|
|
|
|
|
return &ProcessResult{ |
|
|
|
|
Content: processed, |
|
|
|
|
TableOfContents: tocProcessed, |
|
|
|
|
}, nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// rewriteLinks rewrites wikilinks and nostr: links in AsciiDoc content
|
|
|
|
|
@ -189,6 +206,122 @@ func (p *Processor) sanitizeHTML(html string) string {
@@ -189,6 +206,122 @@ func (p *Processor) sanitizeHTML(html string) string {
|
|
|
|
|
return html |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// extractTOC extracts the table of contents from AsciiDoc HTML output
|
|
|
|
|
// Returns the TOC HTML and the content HTML without the TOC
|
|
|
|
|
func (p *Processor) extractTOC(html string) (string, string) { |
|
|
|
|
// AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
|
|
|
|
|
// We need to match the entire TOC div including nested content
|
|
|
|
|
// Since divs can be nested, we need to count opening/closing tags
|
|
|
|
|
|
|
|
|
|
var tocContent string |
|
|
|
|
contentWithoutTOC := html |
|
|
|
|
|
|
|
|
|
// Find the start of the TOC div - try multiple patterns
|
|
|
|
|
tocStartPatterns := []*regexp.Regexp{ |
|
|
|
|
// Pattern 1: <div id="toc" class="toc">
|
|
|
|
|
regexp.MustCompile(`(?i)<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>`), |
|
|
|
|
// Pattern 2: <div id="toc">
|
|
|
|
|
regexp.MustCompile(`(?i)<div\s+id=["']toc["'][^>]*>`), |
|
|
|
|
// Pattern 3: <div class="toc">
|
|
|
|
|
regexp.MustCompile(`(?i)<div\s+class=["']toc["'][^>]*>`), |
|
|
|
|
// Pattern 4: <nav id="toc">
|
|
|
|
|
regexp.MustCompile(`(?i)<nav\s+id=["']toc["'][^>]*>`), |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
var tocStartIdx int = -1 |
|
|
|
|
var tocStartTag string |
|
|
|
|
|
|
|
|
|
for _, pattern := range tocStartPatterns { |
|
|
|
|
loc := pattern.FindStringIndex(html) |
|
|
|
|
if loc != nil { |
|
|
|
|
tocStartIdx = loc[0] |
|
|
|
|
tocStartTag = html[loc[0]:loc[1]] |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if tocStartIdx == -1 { |
|
|
|
|
// No TOC found
|
|
|
|
|
return "", html |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Find the matching closing tag by counting div tags
|
|
|
|
|
// Start after the opening tag
|
|
|
|
|
searchStart := tocStartIdx + len(tocStartTag) |
|
|
|
|
depth := 1 |
|
|
|
|
i := searchStart |
|
|
|
|
|
|
|
|
|
for i < len(html) && depth > 0 { |
|
|
|
|
// Look for opening or closing div/nav tags
|
|
|
|
|
if i+4 < len(html) && html[i:i+4] == "<div" { |
|
|
|
|
// Check if it's a closing tag
|
|
|
|
|
if i+5 < len(html) && html[i+4] == '/' { |
|
|
|
|
depth-- |
|
|
|
|
// Find the end of this closing tag
|
|
|
|
|
closeIdx := strings.Index(html[i:], ">") |
|
|
|
|
if closeIdx == -1 { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
i += closeIdx + 1 |
|
|
|
|
} else { |
|
|
|
|
// Opening tag - find the end
|
|
|
|
|
closeIdx := strings.Index(html[i:], ">") |
|
|
|
|
if closeIdx == -1 { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
// Check if it's self-closing
|
|
|
|
|
if html[i+closeIdx-1] != '/' { |
|
|
|
|
depth++ |
|
|
|
|
} |
|
|
|
|
i += closeIdx + 1 |
|
|
|
|
} |
|
|
|
|
} else if i+5 < len(html) && html[i:i+5] == "</div" { |
|
|
|
|
depth-- |
|
|
|
|
closeIdx := strings.Index(html[i:], ">") |
|
|
|
|
if closeIdx == -1 { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
i += closeIdx + 1 |
|
|
|
|
} else if i+5 < len(html) && html[i:i+5] == "</nav" { |
|
|
|
|
depth-- |
|
|
|
|
closeIdx := strings.Index(html[i:], ">") |
|
|
|
|
if closeIdx == -1 { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
i += closeIdx + 1 |
|
|
|
|
} else { |
|
|
|
|
i++ |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if depth == 0 { |
|
|
|
|
// Found the matching closing tag
|
|
|
|
|
tocEndIdx := i |
|
|
|
|
// Extract the TOC content (inner HTML)
|
|
|
|
|
tocFullHTML := html[tocStartIdx:tocEndIdx] |
|
|
|
|
// Extract just the inner content (without the outer div tags)
|
|
|
|
|
innerStart := len(tocStartTag) |
|
|
|
|
innerEnd := len(tocFullHTML) |
|
|
|
|
// Find the last </div> or </nav>
|
|
|
|
|
if strings.HasSuffix(tocFullHTML, "</div>") { |
|
|
|
|
innerEnd -= 6 |
|
|
|
|
} else if strings.HasSuffix(tocFullHTML, "</nav>") { |
|
|
|
|
innerEnd -= 7 |
|
|
|
|
} |
|
|
|
|
tocContent = strings.TrimSpace(tocFullHTML[innerStart:innerEnd]) |
|
|
|
|
|
|
|
|
|
// Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
|
|
|
|
|
toctitlePattern := regexp.MustCompile(`(?s)<div\s+id=["']toctitle["'][^>]*>.*?</div>\s*`) |
|
|
|
|
tocContent = toctitlePattern.ReplaceAllString(tocContent, "") |
|
|
|
|
tocContent = strings.TrimSpace(tocContent) |
|
|
|
|
|
|
|
|
|
// Remove the TOC from the content
|
|
|
|
|
contentWithoutTOC = html[:tocStartIdx] + html[tocEndIdx:] |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return tocContent, contentWithoutTOC |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// processLinks processes HTML links to add target="_blank" to external links
|
|
|
|
|
// External links are those that start with http:// or https:// and don't point to the linkBaseURL domain
|
|
|
|
|
// Local links (including relative links and links to linkBaseURL) open in the same tab
|
|
|
|
|
|