/**
* Comprehensive content parsing service for all Nostr content fields
* Supports AsciiDoc, Advanced Markdown, Basic Markdown, and LaTeX
*/
import { detectMarkupType, getMarkupClasses, MarkupType } from '@/lib/markup-detection'
import { Event, kinds, nip19 } from 'nostr-tools'
import { getImetaInfosFromEvent } from '@/lib/event'
import { URL_REGEX, ExtendedKind } from '@/constants'
import { TImetaInfo } from '@/types'
/**
 * Result of a single parse: the rendered HTML plus everything that was
 * extracted from the raw content along the way.
 */
export interface ParsedContent {
  /** Rendered HTML (AsciiDoc output, or the plain-text fallback if conversion failed). */
  html: string
  /** Markup type reported for the rendered output. */
  markupType: MarkupType
  /** CSS classes derived from the markup type detected on the raw content. */
  cssClasses: string
  /** True only when math is enabled and the content was found to contain math. */
  hasMath: boolean
  /** Media extracted from the content/event; left empty for article-type kinds. */
  media: TImetaInfo[]
  /** Hyperlinks found in the raw content. */
  links: { url: string; text: string; isExternal: boolean }[]
  /** Hashtags found in the raw content. */
  hashtags: string[]
  /** Nostr references found in the raw content, keyed by their bech32 kind. */
  nostrLinks: { type: 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note'; id: string; text: string }[]
  /** Sources of a highlight, taken from the event; empty when no event is supplied. */
  highlightSources: { type: 'event' | 'addressable' | 'url'; value: string; bech32: string }[]
}
/**
 * Optional switches accepted by the parser. Every member may be omitted.
 */
export interface ParseOptions {
  /** Nostr event kind; feeds markup detection and the article-type media rule. */
  eventKind?: number
  /** Which event field the content came from. */
  field?: 'content' | 'title' | 'summary' | 'description'
  /** Maximum width hint for rendered output. */
  maxWidth?: string
  /** Enable LaTeX math handling (parseContent defaults this to true). */
  enableMath?: boolean
  /** Enable source highlighting (parseContent defaults this to true). */
  enableSyntaxHighlighting?: boolean
}
class ContentParserService {
private asciidoctor: any = null
private isAsciidoctorLoaded = false
/**
 * Lazily import and instantiate the AsciiDoctor processor.
 *
 * The instance is created on the first successful call and reused afterwards.
 * A failed import is logged and reported as `null`; nothing is cached in that
 * case, so the next call tries again.
 *
 * @returns the cached processor instance, or `null` when loading failed
 */
private async loadAsciidoctor() {
  if (!this.isAsciidoctorLoaded) {
    try {
      const asciidoctorModule = await import('@asciidoctor/core')
      this.asciidoctor = asciidoctorModule.default()
      this.isAsciidoctorLoaded = true
    } catch (error) {
      console.warn('Failed to load AsciiDoctor:', error)
      return null
    }
  }
  return this.asciidoctor
}
/**
 * Parse a content string into HTML plus the metadata extracted from it.
 *
 * The markup type is detected first (it drives the CSS classes and the
 * Markdown/plain-text conversion), then the content is converted to AsciiDoc
 * and rendered. If conversion or rendering throws, the error is logged and
 * the plain-text rendering is used instead.
 *
 * @param content raw content to parse
 * @param options parser switches; math and syntax highlighting default to on
 * @param event   source event, used for media and highlight-source extraction
 * @returns the rendered HTML and extracted metadata; `markupType` is always
 *          reported as `'asciidoc'` because every input is rendered through
 *          the AsciiDoc pipeline
 */
async parseContent(
  content: string,
  options: ParseOptions = {},
  event?: Event
): Promise<ParsedContent> {
  const {
    eventKind,
    enableMath = true,
    enableSyntaxHighlighting = true
  } = options

  // Detect markup type on the raw content
  const markupType = detectMarkupType(content, eventKind)
  const cssClasses = getMarkupClasses(markupType)

  // For article-type events media is rendered inline, so it is not extracted
  const isArticleType = eventKind === kinds.LongFormArticle ||
    eventKind === ExtendedKind.WIKI_ARTICLE ||
    eventKind === ExtendedKind.PUBLICATION ||
    eventKind === ExtendedKind.PUBLICATION_CONTENT

  const media = isArticleType ? [] : this.extractAllMedia(content, event)
  const links = this.extractLinks(content)
  const hashtags = this.extractHashtags(content)
  const nostrLinks = this.extractNostrLinks(content)
  const highlightSources = event ? this.extractHighlightSources(event) : []

  // Math is only reported when the caller has not disabled it
  const hasMath = enableMath && this.hasMathContent(content)

  let html = ''
  try {
    // Convert everything to AsciiDoc format and process as AsciiDoc
    const asciidocContent = this.convertToAsciidoc(content, markupType)
    html = await this.parseAsciidoc(asciidocContent, { enableMath, enableSyntaxHighlighting })
  } catch (error) {
    console.error('Content parsing error:', error)
    // Fallback to plain text
    html = this.parsePlainText(content)
  }

  return {
    html,
    markupType: 'asciidoc',
    cssClasses,
    hasMath,
    media,
    links,
    hashtags,
    nostrLinks,
    highlightSources
  }
}
/**
 * Render AsciiDoc source to HTML and run the HTML post-processing steps.
 *
 * Falls back to the plain-text rendering when the AsciiDoctor processor is
 * unavailable or when conversion throws (the error is logged).
 *
 * @param content AsciiDoc source
 * @param options toggles for the `stem` and `source-highlighter` attributes
 * @returns post-processed HTML
 */
private async parseAsciidoc(content: string, options: { enableMath: boolean; enableSyntaxHighlighting: boolean }): Promise<string> {
  const asciidoctor = await this.loadAsciidoctor()
  if (!asciidoctor) {
    return this.parsePlainText(content)
  }

  // Pick the doctype from the first section title found in the content.
  // Author's rationale: the article doctype expects level 1 (=) or level 2 (==)
  // before level 3 (===), so content whose first title is level 3 or deeper is
  // rendered with the book doctype instead.
  const firstHeaderMatch = content.match(/^(={1,6})\s+/m)
  let doctype: 'article' | 'book' = 'article'
  if (firstHeaderMatch) {
    const firstHeaderLevel = firstHeaderMatch[1].length
    if (firstHeaderLevel >= 3) {
      doctype = 'book'
    }
  }

  try {
    const result = asciidoctor.convert(content, {
      safe: 'safe',
      backend: 'html5',
      doctype: doctype,
      attributes: {
        'showtitle': true,
        'sectanchors': true,
        'sectlinks': true,
        'toc': 'left',
        'toclevels': 6,
        'toc-title': 'Table of Contents',
        'source-highlighter': options.enableSyntaxHighlighting ? 'highlight.js' : 'none',
        'stem': options.enableMath ? 'latexmath' : 'none',
        'data-uri': true,
        'imagesdir': '',
        'linkcss': false,
        'stylesheet': '',
        'stylesdir': '',
        'prewrap': true,
        'sectnums': false,
        'sectnumlevels': 6,
        'experimental': true,
        'compat-mode': false,
        'attribute-missing': 'warn',
        'attribute-undefined': 'warn',
        'skip-front-matter': true,
        'source-indent': 0,
        'indent': 0,
        'tabsize': 2,
        'tabwidth': 2,
        'hardbreaks': false,
        'paragraph-rewrite': 'normal',
        'sectids': true,
        'idprefix': '',
        'idseparator': '-',
        'sectidprefix': '',
        'sectidseparator': '-'
      }
    })

    // convert() may hand back a non-string object; normalise to a string
    const htmlString = typeof result === 'string' ? result : result.toString()

    // Debug: log the start of the AsciiDoc HTML output for troubleshooting
    if (process.env.NODE_ENV === 'development') {
      console.log('AsciiDoc HTML output:', htmlString.substring(0, 1000) + '...')
    }

    // Post-processing pipeline; the order of these steps is significant
    const processedHtml = this.processWikilinksInHtml(htmlString)
    const imagesProcessedHtml = this.processImagesInHtml(processedHtml)
    const cleanedHtml = this.cleanupMarkdown(imagesProcessedHtml)
    const styledHtml = this.addStylingClasses(cleanedHtml)
    return this.hideRawTocText(styledHtml)
  } catch (error) {
    console.error('AsciiDoc parsing error:', error)
    return this.parsePlainText(content)
  }
}
/**
 * Bring content of any detected markup type into AsciiDoc form.
 *
 * AsciiDoc input only gets light normalisation; Markdown and plain text are
 * routed through their converters. Afterwards wikilinks, nostr: addresses and
 * hashtags are rewritten, in that order, for every markup type.
 *
 * @param content    raw content
 * @param markupType detected markup type; unknown values are treated as plain text
 * @returns AsciiDoc source ready for rendering
 */
private convertToAsciidoc(content: string, markupType: string): string {
  let converted: string
  if (markupType === 'asciidoc') {
    converted = content
      // Escaped "\n" sequences become real newlines
      .replace(/\\n/g, '\n')
      // A section title directly under a non-empty line gets a blank line inserted before it
      .replace(/(\S[^\n]*)\n(={1,6}\s+[^\n]+)/g, '$1\n\n$2')
  } else if (markupType === 'advanced-markdown' || markupType === 'basic-markdown') {
    converted = this.convertMarkdownToAsciidoc(content)
  } else {
    // 'plain-text' and anything unrecognised
    converted = this.convertPlainTextToAsciidoc(content)
  }

  const withWikilinks = this.processWikilinks(converted)
  const withNostrLinks = this.processNostrAddresses(withWikilinks)
  const result = this.processHashtags(withNostrLinks)

  // Debug: log the converted AsciiDoc for troubleshooting
  if (process.env.NODE_ENV === 'development') {
    console.log('Converted AsciiDoc:', result)
  }
  return result
}
/**
 * Convert Markdown to AsciiDoc with an ordered series of regex rewrites.
 *
 * Steps, in order: newline/spacing normalisation, headers, emphasis,
 * fenced and inline code, images, links, horizontal rules, lists,
 * blockquotes (with optional attribution), tables, footnotes.
 *
 * Fixes over the previous implementation:
 * - bold (`**x**` / `__x__`) is no longer turned into italic by the
 *   follow-up single-asterisk pass (emphasis is now one pass);
 * - blockquotes without an attribution line keep their text;
 * - the attribution `link:url[text]` pattern actually matches;
 * - superscript no longer pairs the carets of two footnote references;
 * - tables get one column spec per header cell, canonical AsciiDoc rows,
 *   and no longer swallow the newline that follows them.
 *
 * NOTE(review): fenced code is converted after headers and emphasis, so code
 * that contains `#`, `*` or `_` may already have been rewritten by then.
 *
 * @param content Markdown source
 * @returns AsciiDoc source
 */
private convertMarkdownToAsciidoc(content: string): string {
  // Preprocess: convert escaped newlines to actual newlines
  let asciidoc = content.replace(/\\n/g, '\n')

  // Normalise spacing around inline code that runs into neighbouring text
  asciidoc = asciidoc.replace(/`([^`\n]+)`\s*\(([^)]+)\)/g, '`$1` ($2)')
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`([a-zA-Z0-9])/g, '$1 `$2` $3')
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`\s*\(/g, '$1 `$2` (')
  asciidoc = asciidoc.replace(/\)`([^`\n]+)`([a-zA-Z0-9])/g, ') `$1` $2')
  // text)text -> text) text
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2')
  // text== -> text ==
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 ==')

  // Headers: deepest level first so "######" is not consumed by a shorter pattern
  asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======')
  asciidoc = asciidoc.replace(/^#{5}\s+(.+)$/gm, '===== $1 =====')
  asciidoc = asciidoc.replace(/^#{4}\s+(.+)$/gm, '==== $1 ====')
  asciidoc = asciidoc.replace(/^#{3}\s+(.+)$/gm, '=== $1 ===')
  asciidoc = asciidoc.replace(/^#{2}\s+(.+)$/gm, '== $1 ==')
  asciidoc = asciidoc.replace(/^#{1}\s+(.+)$/gm, '= $1 =')
  // Normalise the whitespace of "== title ==" headers, on their own line or inline
  asciidoc = asciidoc.replace(/^==\s+(.+?)\s+==$/gm, '== $1 ==')
  asciidoc = asciidoc.replace(/\s==\s+([^=]+?)\s+==\s/g, ' == $1 == ')

  // Emphasis in a single pass, so the bold output ("*x*") can never be picked
  // up again as Markdown italic. Single-asterisk italic must hug its text,
  // which keeps "* " list bullets out of it. "_x_" is already AsciiDoc italic.
  asciidoc = asciidoc.replace(
    /\*\*(.+?)\*\*|__(.+?)__|\*(?!\s)([^*\n]+?)(?<!\s)\*/g,
    (_match: string, starBold?: string, underscoreBold?: string, italic?: string) => {
      const bold = starBold ?? underscoreBold
      return bold !== undefined ? `*${bold}*` : `_${italic}_`
    }
  )
  asciidoc = asciidoc.replace(/~~(.+?)~~/g, '[line-through]#$1#') // Strikethrough
  asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#') // Subscript
  // Superscript: a caret that follows "[" belongs to a footnote marker ("[^id]")
  // and brackets may not appear inside, so footnote references stay intact
  asciidoc = asciidoc.replace(/(?<!\[)\^([^^\[\]\n]+?)\^/g, '[superscript]#$1#')

  // Fenced code blocks; fences whose body does not look like code are left untouched
  asciidoc = asciidoc.replace(/```(\w+)?\n([\s\S]*?)\n```/g, (fence: string, lang: string | undefined, code: string) => {
    const trimmedCode = code.trim()
    if (trimmedCode.length === 0) return ''

    // Heuristics: programming syntax vs. prose / markdown
    const hasCodePatterns = /[{}();=<>]|function|class|import|export|def |if |for |while |return |const |let |var |public |private |static |console\.log/.test(trimmedCode)
    const isLikelyText = /^[A-Za-z\s.,!?\-'"]+$/.test(trimmedCode) && trimmedCode.length > 50
    const hasTooManySpaces = (trimmedCode.match(/\s{3,}/g) || []).length > 3
    const hasMarkdownPatterns = /^#{1,6}\s|^\*\s|^\d+\.\s|^\>\s|^\|.*\|/.test(trimmedCode)

    if ((!hasCodePatterns && trimmedCode.length > 100) || isLikelyText || hasTooManySpaces || hasMarkdownPatterns) {
      return fence // Keep the original markdown
    }
    return `[source${lang ? ',' + lang : ''}]\n----\n${trimmedCode}\n----`
  })

  // LaTeX math inside inline code.
  // NOTE(review): this rewrites `$x$` inside backticks to `$\x\$`; confirm that
  // the inserted backslashes are intended.
  asciidoc = asciidoc.replace(/`\$([^$]+)\$`/g, '`$\\$1\\$$`')

  // Images before links, so "![alt](url)" is not consumed by the link pattern
  asciidoc = asciidoc.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, 'image::$2[$1,width=100%]')
  asciidoc = asciidoc.replace(/\[([^\]]+)\]\(([^)]+)\)/g, 'link:$2[$1]')

  // Horizontal rules: "---" becomes ''' surrounded by blank lines
  asciidoc = asciidoc.replace(/^---$/gm, "\n'''\n")

  // Unordered lists: "*", "-" and "+" bullets all become "*"
  asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2')
  asciidoc = asciidoc.replace(/^(\s*)-\s+(.+)$/gm, '$1* $2')
  asciidoc = asciidoc.replace(/^(\s*)\+\s+(.+)$/gm, '$1* $2')
  // Ordered lists: "1." becomes "."
  asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2')

  // Blockquotes: consecutive "> " lines form one quote block
  asciidoc = asciidoc.replace(/^(>\s+.+(?:\n>\s+.+)*)/gm, (match: string) => {
    const lines = match.split('\n').map((line) => line.replace(/^>\s*/, ''))

    // Without an attribution line the whole block is the quote body
    let quoteBodyLines: string[] = lines
    let attributionLine: string | undefined
    // The last line starting with an em dash or "--" is the attribution;
    // the body is everything before it
    for (let i = lines.length - 1; i >= 0; i--) {
      const line = lines[i].trim()
      if (line.startsWith('—') || line.startsWith('--')) {
        attributionLine = line
        quoteBodyLines = lines.slice(0, i)
        break
      }
    }

    const quoteContent = quoteBodyLines.filter((l) => l.trim() !== '').join('\n').trim()
    if (!attributionLine) {
      return `____\n${quoteContent}\n____`
    }

    const cleanedAttribution = attributionLine.replace(/^[—-]+/, '').trim()
    let author = ''
    let source = ''
    // "Author, link:url[text]" (Markdown links were converted above)
    const linkMatch = cleanedAttribution.match(/^(.*?),?\s*link:([^[\]]+)\[([^\]]+)\]$/)
    if (linkMatch) {
      author = linkMatch[1].trim()
      source = `link:${linkMatch[2].trim()}[${linkMatch[3].trim()}]`
    } else {
      // "Author" or "Author, source text"
      const parts = cleanedAttribution.split(',').map((p) => p.trim())
      author = parts[0]
      if (parts.length > 1) {
        source = parts.slice(1).join(', ').trim()
      }
    }
    // AsciiDoc blockquote with attribution: [quote, author, source]
    return `[quote, ${author}, ${source}]\n____\n${quoteContent}\n____`
  })

  // List markers that were hidden behind "> " are normalised now
  asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2')
  asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2')

  // Tables: header row, a separator row containing at least one dash, then data rows.
  // The match ends at the end of the last row, so following text keeps its newline.
  asciidoc = asciidoc.replace(
    /^\|.*\|[ \t]*\r?\n\|[ \t\-:|]*-[ \t\-:|]*(?:\r?\n\|.*\|[ \t]*)*$/gm,
    (table: string) => {
      const [headerRow = '', , ...dataRows] = table.split(/\r?\n/)
      // Drop the outer pipes, split on unescaped pipes, trim every cell
      const splitCells = (row: string): string[] =>
        row
          .trim()
          .replace(/^\|/, '')
          .replace(/\|$/, '')
          .split(/(?<!\\)\|/)
          .map((cell) => cell.trim())
      const headerCells = splitCells(headerRow)
      // Every row is emitted with exactly as many cells as the header has
      const toRow = (cells: string[]): string =>
        headerCells.map((_, index) => `|${cells[index] ?? ''}`).join(' ')
      const cols = headerCells.map(() => '1').join(',')
      return [
        `[cols="${cols}",options="header"]`,
        '|===',
        toRow(headerCells),
        ...dataRows.map((row) => toRow(splitCells(row))),
        '|==='
      ].join('\n')
    }
  )

  // Footnotes: collect "[^id]: text" definitions (removing them from the flow),
  // then replace every "[^id]" reference that has a definition
  const footnoteDefinitions = new Map<string, string>()
  const withoutDefinitions = asciidoc.replace(
    /^\[\^([^\]]+)\]:\s*([\s\S]*?)(?=\n\[\^|\n---|\n##|\n###|\n####|\n#####|\n######|$)/gm,
    (_match: string, id: string, text: string) => {
      footnoteDefinitions.set(id, text.trim())
      return ''
    }
  )
  return withoutDefinitions.replace(/\[\^([^\]]+)\]/g, (match: string, id: string) => {
    const definition = footnoteDefinitions.get(id)
    // References without a (non-empty) definition are left as they are
    return definition ? `footnote:[${definition}]` : match
  })
}
/**
 * Rewrite `nostr:<id>` references as AsciiDoc link macros.
 *
 * An id is a run of at least seven lowercase letters/digits; shorter runs
 * are left untouched. The id doubles as the link label.
 *
 * @param content AsciiDoc source
 * @returns content with `link:nostr:<id>[<id>]` in place of each reference
 */
private processNostrAddresses(content: string): string {
  const nostrReference = /nostr:([a-z0-9]+[a-z0-9]{6,})/g
  return content.replace(nostrReference, 'link:nostr:$1[$1]')
}
/**
 * Rewrite `#tag` as the intermediate macro `hashtag:<lowercased tag>[#<tag>]`.
 *
 * A `#` only starts a hashtag when it is NOT preceded by
 * - a word character (`foo#bar`),
 * - `/` (URL fragments such as `https://host/#top`),
 * - `&` (numeric character references such as `&#39;`),
 * - `]` (AsciiDoc role spans such as `[line-through]#text#`).
 *
 * Hashtags inside code blocks are not excluded.
 *
 * @param content AsciiDoc source
 * @returns content with hashtags replaced by hashtag macros
 */
private processHashtags(content: string): string {
  return content.replace(
    /(?<![\w&/\]])#([a-zA-Z0-9_]+)/g,
    // The macro target is lowercased for consistency; the label keeps the original casing
    (_match: string, hashtag: string) => `hashtag:${hashtag.toLowerCase()}[#${hashtag}]`
  )
}
/**
 * Rewrite wikilinks as the intermediate macro `wikilink:<d-tag>[<label>]`.
 *
 * Two forms are handled, in this order:
 * - bookstr macro `[[book:reference]]`: target and label are the reference;
 * - standard `[[Target]]` / `[[Target|label]]`: the label defaults to the target.
 *
 * @param content AsciiDoc source
 * @returns content with wikilinks replaced by wikilink macros
 */
private processWikilinks(content: string): string {
  // Target and label are expected to be trimmed by the caller
  const toMacro = (target: string, label: string): string =>
    `wikilink:${this.normalizeDtag(target)}[${label}]`

  return content
    .replace(/\[\[book:([^\]]+)\]\]/g, (_whole: string, reference: string) => {
      const trimmedReference = reference.trim()
      return toMacro(trimmedReference, trimmedReference)
    })
    .replace(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g, (_whole: string, target: string, label?: string) => {
      const trimmedTarget = target.trim()
      return toMacro(trimmedTarget, label ? label.trim() : trimmedTarget)
    })
}
/**
 * Normalize text to d-tag form: lowercase, with every run of characters
 * outside `a-z` / `0-9` collapsed into a single dash and no dash at either end.
 *
 * @param text free-form text
 * @returns the normalized d-tag (empty when the text has no ASCII letters or digits)
 */
private normalizeDtag(text: string): string {
  const alphanumericRuns = text
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((run) => run.length > 0)
  return alphanumericRuns.join('-')
}
/**
* Replace the intermediate hashtag:, wikilink: and link:nostr: macros that are
* still present as text in the rendered HTML.
*
* NOTE(review): in this view every replacement template contains only the
* display text (or a fixed placeholder string); no surrounding markup is
* visible. Confirm the templates against the original file before relying on
* the descriptions below.
*
* @param html HTML produced by the AsciiDoc conversion
* @returns HTML with the three macro forms replaced
*/
private processWikilinksInHtml(html: string): string {
let processed = html
// hashtag:<tag>[<label>] -> label (the captured tag is not used by the template as shown)
processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
return `${displayText}`
})
// wikilink:<d-tag>[<label>] -> label (the captured d-tag is not used by the template as shown)
processed = processed.replace(/wikilink:([^[]+)\[([^\]]+)\]/g, (_match, dTag, displayText) => {
return `${displayText}`
})
// link:nostr:<id>[<label>] -> output chosen by the type getNostrType reports for the id
processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
const nostrType = this.getNostrType(bech32Id)
if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
// Event references: fixed placeholder text (presumably swapped for the embedded event later; verify)
return `Loading embedded event...
`
} else if (nostrType === 'npub' || nostrType === 'nprofile') {
// Profile references: label prefixed with "@"
return `@${displayText}`
} else {
// Any other type: the label on its own
return `${displayText}`
}
})
return processed
}
/**
* Post-process image tags in the rendered HTML: constrain their width through
* CSS classes and attach data attributes carrying each image's position in the
* list of distinct image URLs.
*
* NOTE(review): both regex literals below are split across two lines and the
* final return template is empty in this view; the markup they contained
* appears to be missing. Restore it from the original file; the code is left
* untouched here.
*
* @param html HTML produced by the AsciiDoc conversion
* @returns HTML with updated image tags
*/
private processImagesInHtml(html: string): string {
let processed = html
// Pass 1: collect each distinct src URL once, in order of first appearance
const imageUrls: string[] = []
const imageUrlRegex = /
]+src=["']([^"']+)["'][^>]*>/gi
let match
while ((match = imageUrlRegex.exec(html)) !== null) {
const url = match[1]
if (url && !imageUrls.includes(url)) {
imageUrls.push(url)
}
}
// Pass 2: rewrite the attributes of every matched tag
processed = processed.replace(/
]+)>/gi, (imgTag, attributes) => {
// Tags without a src attribute are returned unchanged
const srcMatch = attributes.match(/src=["']([^"']+)["']/i)
if (!srcMatch) return imgTag
const src = srcMatch[1]
// Position of this URL in the distinct-URL list built in pass 1
const currentIndex = imageUrls.indexOf(src)
// Existing class attribute: drop any max-w-* class, then append the sizing classes
let updatedAttributes = attributes
if (updatedAttributes.match(/class=["']/i)) {
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
// Remove existing max-w classes and add our max-w-[400px]
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim()
const newClasses = cleanedClasses
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
: 'max-w-[400px] object-contain cursor-zoom-in'
return `class="${newClasses}"`
})
} else {
// No class attribute yet: add one (this branch also adds h-auto)
updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`
}
// Data attributes carrying the image index and source.
// NOTE(review): as shown, the replace() on src swaps a double quote for a double quote (no-op);
// presumably the replacement was an HTML entity — verify.
updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '"')}"`
return `
`
})
return processed
}
/**
 * Convert plain text to AsciiDoc, preserving its line structure.
 *
 * A single newline becomes an AsciiDoc hard line break (` +` at the end of
 * the line). Runs of two or more newlines are left as they are, so blank
 * lines keep separating paragraphs instead of being turned into stray ` +`
 * lines (the previous `\n\n` replacement was a no-op and did not protect them).
 *
 * @param content plain text
 * @returns AsciiDoc source
 */
private convertPlainTextToAsciidoc(content: string): string {
  // Only newlines that have no newline on either side are hard breaks
  return content.replace(/(?<!\n)\n(?!\n)/g, ' +\n')
}
/**
* Fallback renderer: turn plain text into HTML using string replacements only.
* Used when AsciiDoctor is unavailable or conversion fails.
*
* NOTE(review): the replacement strings below span lines and contain no markup
* in this view; the tags they held appear to be missing. Restore them from the
* original file; the code is left untouched here.
* NOTE(review): the content is not HTML-escaped in this method; confirm that
* callers sanitize the result before inserting it into the page.
*
* @param content plain text
* @returns HTML string
*/
private parsePlainText(content: string): string {
// Order matters: double newlines are replaced before single ones, then the
// start (/^/) and the end (/$/) of the whole string get their own replacement
return content
.replace(/\n\n/g, '
')
.replace(/\n/g, '
')
.replace(/^/, '
')
.replace(/$/, '
')
}
/**
* Clean up leftover markdown syntax after AsciiDoc processing
*/
private cleanupMarkdown(html: string): string {
let cleaned = html
// Clean up markdown image syntax: 
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
const altText = alt || ''
return `
`
})
// Clean up markdown link syntax: [text](url)
cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
// Check if it's already an HTML link
if (cleaned.includes(`href="${url}"`)) {
return _match
}
return `${text} `
})
// Fix broken HTML attributes that are being rendered as text
cleaned = cleaned.replace(/" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">([^<]+)