/**
* Comprehensive content parsing service for all Nostr content fields
* Supports AsciiDoc, Advanced Markdown, Basic Markdown, and LaTeX
*/
import { detectMarkupType, getMarkupClasses, MarkupType } from '@/lib/markup-detection'
import { Event, kinds, nip19 } from 'nostr-tools'
import { getImetaInfosFromEvent } from '@/lib/event'
import { URL_REGEX, ExtendedKind } from '@/constants'
import { TImetaInfo } from '@/types'
export interface ParsedContent {
html: string
markupType: MarkupType
cssClasses: string
hasMath: boolean
media: TImetaInfo[]
links: Array<{ url: string; text: string; isExternal: boolean }>
hashtags: string[]
nostrLinks: Array<{ type: 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note'; id: string; text: string }>
highlightSources: Array<{ type: 'event' | 'addressable' | 'url'; value: string; bech32: string }>
}
export interface ParseOptions {
eventKind?: number
field?: 'content' | 'title' | 'summary' | 'description'
maxWidth?: string
enableMath?: boolean
enableSyntaxHighlighting?: boolean
}
class ContentParserService {
private asciidoctor: any = null
private isAsciidoctorLoaded = false
/**
 * Lazily load the AsciiDoctor processor on first use.
 *
 * Returns the cached instance on subsequent calls, or null when the
 * dynamic import fails (callers then fall back to plain-text rendering).
 */
private async loadAsciidoctor() {
  if (this.isAsciidoctorLoaded) {
    return this.asciidoctor
  }
  try {
    const mod = await import('@asciidoctor/core')
    const processor = mod.default()
    this.asciidoctor = processor
    this.isAsciidoctorLoaded = true
    return processor
  } catch (error) {
    console.warn('Failed to load AsciiDoctor:', error)
    return null
  }
}
/**
 * Parse a Nostr content string into renderable HTML plus extracted metadata.
 *
 * Pipeline: detect markup flavor -> normalize everything to AsciiDoc ->
 * render to HTML. Media, links, hashtags, nostr references and highlight
 * sources are extracted from the raw content/event independently of the
 * rendering step.
 *
 * @param content raw field text from a Nostr event
 * @param options event kind plus math / syntax-highlighting toggles
 * @param event optional source event (used for imeta media and highlight tags)
 * @returns ParsedContent with rendered HTML and extracted metadata
 */
async parseContent(
content: string,
options: ParseOptions = {},
event?: Event
): Promise<ParsedContent> {
const {
eventKind,
enableMath = true,
enableSyntaxHighlighting = true
} = options
// Detect markup type (drives CSS classes and the conversion strategy)
const markupType = detectMarkupType(content, eventKind)
const cssClasses = getMarkupClasses(markupType)
// Extract all content elements
// For article-type events, don't extract media as it should be rendered inline
const isArticleType = eventKind === kinds.LongFormArticle ||
eventKind === ExtendedKind.WIKI_ARTICLE ||
eventKind === ExtendedKind.PUBLICATION ||
eventKind === ExtendedKind.PUBLICATION_CONTENT
const media = isArticleType ? [] : this.extractAllMedia(content, event)
const links = this.extractLinks(content)
const hashtags = this.extractHashtags(content)
const nostrLinks = this.extractNostrLinks(content)
// Highlight sources come from event tags (e/a/r), so they need the event
const highlightSources = event ? this.extractHighlightSources(event) : []
// Check for LaTeX math so callers can load a math renderer lazily
const hasMath = enableMath && this.hasMathContent(content)
let html = ''
try {
// Convert everything to AsciiDoc format and process as AsciiDoc
const asciidocContent = this.convertToAsciidoc(content, markupType)
html = await this.parseAsciidoc(asciidocContent, { enableMath, enableSyntaxHighlighting })
} catch (error) {
console.error('Content parsing error:', error)
// Fallback to plain text
html = this.parsePlainText(content)
}
return {
html,
// NOTE(review): always reports 'asciidoc' (the content was normalized to
// AsciiDoc above) even though cssClasses reflect the *detected* type —
// confirm this asymmetry is intentional.
markupType: 'asciidoc',
cssClasses,
hasMath,
media,
links,
hashtags,
nostrLinks,
highlightSources
}
}
/**
 * Render AsciiDoc source to HTML via AsciiDoctor, then run the HTML
 * post-processing passes (wikilink/nostr macros, image styling, markdown
 * cleanup, styling classes, raw-ToC removal) in that order.
 *
 * Falls back to plain-text rendering when the processor is unavailable or
 * conversion throws.
 */
private async parseAsciidoc(content: string, options: { enableMath: boolean; enableSyntaxHighlighting: boolean }): Promise<string> {
const asciidoctor = await this.loadAsciidoctor()
if (!asciidoctor) {
return this.parsePlainText(content)
}
// Check if content starts with level 3+ headers (=== or deeper)
// Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
// If content starts with level 3+, use book doctype which allows sections at any level
const firstHeaderMatch = content.match(/^(={1,6})\s+/m)
let doctype: 'article' | 'book' = 'article'
if (firstHeaderMatch) {
const firstHeaderLevel = firstHeaderMatch[1].length
// If first header is level 3 or deeper, use book doctype
// Book doctype allows sections at any level without requiring hierarchy
if (firstHeaderLevel >= 3) {
doctype = 'book'
}
}
try {
// 'safe' mode blocks includes and other risky directives for untrusted input
const result = asciidoctor.convert(content, {
safe: 'safe',
backend: 'html5',
doctype: doctype,
attributes: {
'showtitle': true,
'sectanchors': true,
'sectlinks': true,
'toc': 'left',
'toclevels': 6,
'toc-title': 'Table of Contents',
// Toggle highlight.js / latexmath support per caller options
'source-highlighter': options.enableSyntaxHighlighting ? 'highlight.js' : 'none',
'stem': options.enableMath ? 'latexmath' : 'none',
'data-uri': true,
'imagesdir': '',
'linkcss': false,
'stylesheet': '',
'stylesdir': '',
'prewrap': true,
'sectnums': false,
'sectnumlevels': 6,
'experimental': true,
'compat-mode': false,
'attribute-missing': 'warn',
'attribute-undefined': 'warn',
'skip-front-matter': true,
'source-indent': 0,
'indent': 0,
'tabsize': 2,
'tabwidth': 2,
'hardbreaks': false,
'paragraph-rewrite': 'normal',
'sectids': true,
'idprefix': '',
'idseparator': '-',
'sectidprefix': '',
'sectidseparator': '-'
}
})
const htmlString = typeof result === 'string' ? result : result.toString()
// Debug: log the AsciiDoc HTML output for troubleshooting
if (process.env.NODE_ENV === 'development') {
console.log('AsciiDoc HTML output:', htmlString.substring(0, 1000) + '...')
}
// Process wikilinks in the HTML output
const processedHtml = this.processWikilinksInHtml(htmlString)
// Process images: add max-width styling and prepare for carousel
const imagesProcessedHtml = this.processImagesInHtml(processedHtml)
// Clean up any leftover markdown syntax and hide raw ToC text
const cleanedHtml = this.cleanupMarkdown(imagesProcessedHtml)
// Add proper CSS classes for styling
const styledHtml = this.addStylingClasses(cleanedHtml)
// Hide any raw AsciiDoc ToC text that might appear in the content
return this.hideRawTocText(styledHtml)
} catch (error) {
console.error('AsciiDoc parsing error:', error)
return this.parsePlainText(content)
}
}
/**
 * Normalize any supported markup flavor into AsciiDoc source.
 *
 * AsciiDoc input only gets newline/header normalization; markdown flavors
 * go through the full markdown converter; everything else is treated as
 * plain text. All flavors then share the wikilink, nostr-address and
 * hashtag macro passes.
 */
private convertToAsciidoc(content: string, markupType: string): string {
  let normalized: string
  if (markupType === 'asciidoc') {
    // Unescape literal "\n" sequences, then make sure every section header
    // is preceded by a blank line (AsciiDoc requires one after body text).
    const unescaped = content.replace(/\\n/g, '\n')
    normalized = unescaped.replace(
      /(\S[^\n]*)\n(={1,6}\s+[^\n]+)/g,
      (_match, before, header) => `${before}\n\n${header}`
    )
  } else if (markupType === 'advanced-markdown' || markupType === 'basic-markdown') {
    normalized = this.convertMarkdownToAsciidoc(content)
  } else {
    // 'plain-text' and anything unrecognized
    normalized = this.convertPlainTextToAsciidoc(content)
  }
  // Shared post-processing: wikilinks, then nostr: references, then hashtags.
  let result = this.processWikilinks(normalized)
  result = this.processNostrAddresses(result)
  result = this.processHashtags(result)
  // Debug: log the converted AsciiDoc for troubleshooting
  if (process.env.NODE_ENV === 'development') {
    console.log('Converted AsciiDoc:', result)
  }
  return result
}
/**
 * Convert Markdown source to AsciiDoc.
 *
 * A pipeline of regex rewrites applied in a deliberate order:
 * normalization -> headers -> emphasis -> code blocks -> images/links ->
 * lists -> blockquotes (+ attribution) -> tables -> rules -> footnotes.
 *
 * Bug fix: bold runs are shielded with a sentinel before the
 * single-asterisk italic pass. Previously "**bold**" became "*bold*" and
 * was then re-matched by the italic rule, ending up as "_bold_" (rendered
 * italic); "__bold__" was clobbered the same way. Identity-rewrite passes
 * (inline code, duplicate image/link conversions) have been dropped.
 */
private convertMarkdownToAsciidoc(content: string): string {
  // Unescape literal "\n" sequences into real newlines.
  let asciidoc = content.replace(/\\n/g, '\n')
  // Normalize inline-code spacing, e.g. `sqlite`(database) -> `sqlite` (database)
  asciidoc = asciidoc.replace(/`([^`\n]+)`\s*\(([^)]+)\)/g, '`$1` ($2)')
  // Fix spacing issues where text runs together around inline code
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`([a-zA-Z0-9])/g, '$1 `$2` $3')
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`\s*\(/g, '$1 `$2` (')
  asciidoc = asciidoc.replace(/\)`([^`\n]+)`([a-zA-Z0-9])/g, ') `$1` $2')
  // Re-insert missing spaces: "text)text" and "text==" glue patterns
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2')
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 ==')
  // nostr: addresses are intentionally left untouched here;
  // processNostrAddresses converts them after markdown conversion.
  // Headers: longest prefix first so '##' is not consumed by the '#' rule.
  asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======')
  asciidoc = asciidoc.replace(/^#{5}\s+(.+)$/gm, '===== $1 =====')
  asciidoc = asciidoc.replace(/^#{4}\s+(.+)$/gm, '==== $1 ====')
  asciidoc = asciidoc.replace(/^#{3}\s+(.+)$/gm, '=== $1 ===')
  asciidoc = asciidoc.replace(/^#{2}\s+(.+)$/gm, '== $1 ==')
  asciidoc = asciidoc.replace(/^#{1}\s+(.+)$/gm, '= $1 =')
  // Normalize '== title ==' headers (line-level and inline occurrences)
  asciidoc = asciidoc.replace(/^==\s+(.+?)\s+==$/gm, '== $1 ==')
  asciidoc = asciidoc.replace(/\s==\s+([^=]+?)\s+==\s/g, ' == $1 == ')
  // Emphasis. Shield bold with a NUL sentinel so the italic pass below
  // cannot re-match the freshly produced single-asterisk bold markers.
  const BOLD_SENTINEL = '\u0000'
  asciidoc = asciidoc.replace(/\*\*(.+?)\*\*/g, `${BOLD_SENTINEL}$1${BOLD_SENTINEL}`) // **bold**
  asciidoc = asciidoc.replace(/__(.+?)__/g, `${BOLD_SENTINEL}$1${BOLD_SENTINEL}`) // __bold__
  asciidoc = asciidoc.replace(/\*(.+?)\*/g, '_$1_') // *italic* -> _italic_
  asciidoc = asciidoc.replace(/\u0000(.+?)\u0000/g, '*$1*') // restore bold as AsciiDoc *bold*
  // _italic_ is already valid AsciiDoc; no rewrite required.
  asciidoc = asciidoc.replace(/~~(.+?)~~/g, '[line-through]#$1#') // ~~strikethrough~~
  asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#') // ~subscript~
  asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#') // ^superscript^
  // Fenced code blocks -> [source] listings, with heuristics to avoid
  // swallowing prose that merely sits between stray fences.
  asciidoc = asciidoc.replace(/```(\w+)?\n([\s\S]*?)\n```/g, (_match, lang, code) => {
    const trimmedCode = code.trim()
    if (trimmedCode.length === 0) return ''
    // Does the body look like actual code (programming syntax markers)?
    const hasCodePatterns = /[{}();=<>]|function|class|import|export|def |if |for |while |return |const |let |var |public |private |static |console\.log|var |let |const |if |for |while |return |function/.test(trimmedCode)
    // Or like prose / markdown instead?
    const isLikelyText = /^[A-Za-z\s.,!?\-'"]+$/.test(trimmedCode) && trimmedCode.length > 50
    const hasTooManySpaces = (trimmedCode.match(/\s{3,}/g) || []).length > 3
    const hasMarkdownPatterns = /^#{1,6}\s|^\*\s|^\d+\.\s|^\>\s|^\|.*\|/.test(trimmedCode)
    if ((!hasCodePatterns && trimmedCode.length > 100) || isLikelyText || hasTooManySpaces || hasMarkdownPatterns) {
      return _match // keep the original markdown untouched
    }
    return `[source${lang ? ',' + lang : ''}]\n----\n${trimmedCode}\n----`
  })
  // Inline backtick code is already AsciiDoc; only `$math$` needs the
  // dollar signs escaped so the stem processor leaves them alone.
  asciidoc = asciidoc.replace(/`\$([^$]+)\$`/g, '`$\\$1\\$$`')
  // Images: ![alt](url) -> image::url[alt,width=100%]
  asciidoc = asciidoc.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, 'image::$2[$1,width=100%]')
  // Links: [text](url) -> link:url[text] (global, so quoted lines are covered too)
  asciidoc = asciidoc.replace(/\[([^\]]+)\]\(([^)]+)\)/g, 'link:$2[$1]')
  // Give horizontal rules breathing room first; converted to ''' below.
  asciidoc = asciidoc.replace(/^---$/gm, '\n---\n')
  // Unordered lists: *, -, + bullets -> AsciiDoc *
  asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2')
  asciidoc = asciidoc.replace(/^(\s*)-\s+(.+)$/gm, '$1* $2')
  asciidoc = asciidoc.replace(/^(\s*)\+\s+(.+)$/gm, '$1* $2')
  // Ordered lists: "1. item" -> ". item"
  asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2')
  // Blockquotes, with an optional trailing attribution line ("— Author, Source")
  asciidoc = asciidoc.replace(/^(>\s+.+(?:\n>\s+.+)*)/gm, (match) => {
    const lines = match.split('\n').map(line => line.replace(/^>\s*/, '')) // strip '>' prefixes
    let quoteBodyLines: string[] = []
    let attributionLine: string | undefined
    // The last line starting with '—' or '--' is treated as the attribution
    for (let i = lines.length - 1; i >= 0; i--) {
      const line = lines[i].trim()
      if (line.startsWith('—') || line.startsWith('--')) {
        attributionLine = line
        quoteBodyLines = lines.slice(0, i) // everything before it is the quote body
        break
      }
    }
    const quoteContent = quoteBodyLines.filter(l => l.trim() !== '').join('\n').trim()
    if (attributionLine) {
      // Remove leading '—' or '--' from the attribution line
      const cleanedAttribution = attributionLine.replace(/^[—-]+/, '').trim()
      let author = ''
      let source = ''
      // "Author, link:url[Source]" — keep the link macro in the source slot
      const linkMatch = cleanedAttribution.match(/^(.*?),?\s*link:([^[\\]]+)\[([^\\]]+)\]$/)
      if (linkMatch) {
        author = linkMatch[1].trim()
        source = `link:${linkMatch[2].trim()}[${linkMatch[3].trim()}]`
      } else {
        // "Author" or "Author, Source Text"
        const parts = cleanedAttribution.split(',').map(p => p.trim())
        author = parts[0]
        if (parts.length > 1) {
          source = parts.slice(1).join(', ').trim()
        }
      }
      // AsciiDoc blockquote with attribution: [quote, author, source]
      return `[quote, ${author}, ${source}]\n____\n${quoteContent}\n____`
    } else {
      // No attribution: plain AsciiDoc blockquote
      return `____\n${quoteContent}\n____`
    }
  })
  // Second list pass: quote bodies had their '>' stripped above, which can
  // expose list markers the first pass could not see.
  asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2')
  asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2')
  // Markdown pipe tables -> AsciiDoc |=== tables
  asciidoc = asciidoc.replace(/^\|(.+)\|$/gm, (match, rowContent) => {
    // Only treat rows with more than one cell as table rows
    const cells = rowContent.split('|').map((cell: string) => cell.trim()).filter((cell: string) => cell)
    if (cells.length > 1) {
      return '|' + rowContent + '|'
    }
    return match
  })
  asciidoc = asciidoc.replace(/(\|.*\|[\r\n]+\|[\s\-\|]*[\r\n]+(\|.*\|[\r\n]+)*)/g, (match) => {
    const lines = match.trim().split('\n').filter(line => line.trim())
    if (lines.length < 2) return match
    const headerRow = lines[0]
    const separatorRow = lines[1]
    const dataRows = lines.slice(2)
    // Require the dash separator row (|---|) to treat the match as a table
    if (!separatorRow.includes('-')) return match
    let tableAsciidoc = '[cols="1,1"]\n|===\n'
    tableAsciidoc += headerRow + '\n'
    dataRows.forEach(row => {
      tableAsciidoc += row + '\n'
    })
    tableAsciidoc += '|==='
    return tableAsciidoc
  })
  // Horizontal rules -> AsciiDoc '''
  asciidoc = asciidoc.replace(/^---$/gm, '\'\'\'')
  // Footnotes: strip [^id]: definitions, then inline each [^id] reference
  // as an auto-numbered footnote:[text] macro using the captured text.
  const footnoteDefinitions: { [id: string]: string } = {}
  let tempAsciidoc = asciidoc
  tempAsciidoc = tempAsciidoc.replace(/^\[\^([^\]]+)\]:\s*([\s\S]*?)(?=\n\[\^|\n---|\n##|\n###|\n####|\n#####|\n######|$)/gm, (_, id, text) => {
    footnoteDefinitions[id] = text.trim()
    return '' // drop the definition line from the content
  })
  asciidoc = tempAsciidoc.replace(/\[\^([^\]]+)\]/g, (match, id) => {
    if (footnoteDefinitions[id]) {
      return `footnote:[${footnoteDefinitions[id]}]`
    }
    return match // definition not found: leave the reference as-is
  })
  return asciidoc
}
/**
 * Convert bare `nostr:` URIs into AsciiDoc link macros so the HTML pass
 * (processWikilinksInHtml) can turn them into embeds / user handles.
 *
 * Only recognized bech32 entities are converted: a known prefix
 * (npub/nprofile/nevent/naddr/note) followed by the bech32 "1" separator.
 * The previous pattern matched ANY 7+ character alphanumeric run after
 * "nostr:", so ordinary text such as "nostr:protocol" was linkified.
 */
private processNostrAddresses(content: string): string {
  return content.replace(
    /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/g,
    (_match, bech32Id) => `link:nostr:${bech32Id}[${bech32Id}]`
  )
}
/**
 * Rewrite #hashtags into the intermediate `hashtag:` macro that the HTML
 * pass converts into a styled link.
 *
 * `\B#` only matches a hash at the start of the string or after a non-word
 * character, so fragments inside URLs like "example.com#anchor" are skipped.
 */
private processHashtags(content: string): string {
  return content.replace(/\B#([a-zA-Z0-9_]+)/g, (_match, tag: string) => {
    // Lookups use the lowercase form; the original casing is kept for display.
    return `hashtag:${tag.toLowerCase()}[#${tag}]`
  })
}
/**
 * Rewrite wikilinks into the intermediate `wikilink:` macro.
 *
 * Two forms are supported: the bookstr macro [[book:<ref>]] (displayed as
 * the raw reference) and standard [[Target]] / [[Target|Display]] links.
 * The link target is the normalized d-tag of the reference.
 */
private processWikilinks(content: string): string {
  // Bookstr macro form: [[book:<ref>]]
  const withBookLinks = content.replace(/\[\[book:([^\]]+)\]\]/g, (_m, body) => {
    const ref = body.trim()
    return `wikilink:${this.normalizeDtag(ref)}[${ref}]`
  })
  // Standard form: [[Target]] or [[Target|Display]]
  return withBookLinks.replace(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g, (_m, target, label) => {
    const page = target.trim()
    const display = label ? label.trim() : page
    return `wikilink:${this.normalizeDtag(page)}[${display}]`
  })
}
/**
 * Normalize arbitrary text into d-tag form: lowercase, runs of
 * non-alphanumerics collapsed to single dashes, no leading/trailing dash.
 */
private normalizeDtag(text: string): string {
  const lowered = text.toLowerCase()
  const dashed = lowered.replace(/[^a-z0-9]+/g, '-')
  return dashed.replace(/^-+|-+$/g, '')
}
/**
 * Expand the intermediate macros (hashtag:, wikilink:, link:nostr:) left in
 * the rendered HTML into real markup.
 *
 * Security fix: the ids and display text originate from untrusted event
 * content, so values interpolated into HTML attributes now have ampersands
 * and quotes escaped (the same approach processImagesInHtml already uses
 * for data-image-src); previously a '"' in the text could break out of the
 * attribute and inject markup.
 */
private processWikilinksInHtml(html: string): string {
  // Minimal attribute-value escaper; identical output for benign input.
  const attr = (value: string) => value.replace(/&/g, '&amp;').replace(/"/g, '&quot;')
  let processed = html
  // hashtag:tag[#Tag] -> green link to the hashtag feed
  processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
    return `<a href="/notes?t=${attr(normalizedHashtag)}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${displayText}</a>`
  })
  // wikilink:dtag[display] -> clickable span carrying the d-tag in data attributes
  processed = processed.replace(/wikilink:([^[]+)\[([^\]]+)\]/g, (_match, dTag, displayText) => {
    return `<span class="wikilink cursor-pointer text-blue-600 hover:text-blue-800 hover:underline border-b border-dotted border-blue-300" data-dtag="${attr(dTag)}" data-display="${attr(displayText)}">${displayText}</span>`
  })
  // link:nostr:bech32[display] -> embedded event, user handle, or plain link
  processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
    const nostrType = this.getNostrType(bech32Id)
    if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
      // Events render as embedded notes resolved asynchronously by the UI
      return `<div data-embedded-note="${attr(bech32Id)}" class="embedded-note-container">Loading embedded event...</div>`
    } else if (nostrType === 'npub' || nostrType === 'nprofile') {
      // Profiles render as @handles resolved by the UI
      return `<span class="user-handle" data-pubkey="${attr(bech32Id)}">@${displayText}</span>`
    } else {
      // Fallback: plain nostr: hyperlink
      return `<a href="nostr:${attr(bech32Id)}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType}" data-bech32="${attr(bech32Id)}">${displayText}</a>`
    }
  })
  return processed
}
/**
 * Post-process <img> tags in rendered HTML: constrain their width and tag
 * each one with data attributes (document-order index + source URL) so the
 * UI can open a carousel/lightbox over all images.
 */
private processImagesInHtml(html: string): string {
let processed = html
// First pass: collect unique image URLs in document order for carousel indexing
const imageUrls: string[] = []
const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi
let match
while ((match = imageUrlRegex.exec(html)) !== null) {
const url = match[1]
if (url && !imageUrls.includes(url)) {
imageUrls.push(url)
}
}
// Second pass: rewrite each <img> tag's class list and add data attributes
processed = processed.replace(/<img([^>]+)>/gi, (imgTag, attributes) => {
// Extract the src attribute; tags without one are left untouched
const srcMatch = attributes.match(/src=["']([^"']+)["']/i)
if (!srcMatch) return imgTag
const src = srcMatch[1]
// Index into the carousel order; -1 if the first-pass regex missed it
const currentIndex = imageUrls.indexOf(src)
// Add/update class for max-width
let updatedAttributes = attributes
if (updatedAttributes.match(/class=["']/i)) {
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
// Remove existing max-w classes and add our max-w-[400px]
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim()
const newClasses = cleanedClasses
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
: 'max-w-[400px] object-contain cursor-zoom-in'
return `class="${newClasses}"`
})
} else {
updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`
}
// Carousel metadata; quotes in the URL are escaped to keep the attribute intact
updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '&quot;')}"`
return `<img${updatedAttributes}>`
})
return processed
}
/**
 * Convert plain text to AsciiDoc.
 *
 * Blank lines are kept as paragraph separators; single newlines inside a
 * paragraph become AsciiDoc hard line breaks (" +"). The previous
 * implementation rewrote every newline — including the two forming a blank
 * line — to " +\n", producing stray "+" continuation lines instead of
 * paragraph breaks (its first replace was an identity no-op).
 */
private convertPlainTextToAsciidoc(content: string): string {
  return content
    .split(/\n{2,}/)
    .map(paragraph => paragraph.replace(/\n/g, ' +\n'))
    .join('\n\n')
}
/**
 * Last-resort renderer used when AsciiDoc conversion fails or the processor
 * cannot be loaded: wraps text in <p> tags with <br> line breaks.
 *
 * Security fix: the input is untrusted Nostr content going straight into
 * HTML, so HTML metacharacters are escaped first — the previous version
 * emitted the raw text, allowing markup/script injection via this fallback.
 */
private parsePlainText(content: string): string {
  const escaped = content
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
  const body = escaped
    .replace(/\n\n/g, '</p><p>') // blank lines separate paragraphs
    .replace(/\n/g, '<br>') // remaining newlines are soft breaks
  return `<p>${body}</p>`
}
/**
 * Clean up markdown syntax that survived the AsciiDoc pass: raw image and
 * link syntax is rewritten to HTML directly, then leftover pipe tables are
 * converted via cleanupMarkdownTables.
 */
private cleanupMarkdown(html: string): string {
let cleaned = html
// Leftover markdown images: ![alt](url) -> width-constrained <img>
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
const altText = alt || ''
return `<img src="${url}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`
})
// Leftover markdown links: [text](url) -> anchor with an external-link icon
cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
// Skip URLs that already appear inside a rendered anchor
if (cleaned.includes(`href="${url}"`)) {
return _match
}
return `<a href="${url}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${text} <svg class="size-3" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`
})
// NOTE(review): the next two replaces re-emit markup nearly identical to
// what they match (modulo the svg path attributes), so they appear to be
// effectively no-ops kept for safety — confirm whether they can be removed.
cleaned = cleaned.replace(/" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">([^<]+) <svg[^>]*><path[^>]*><\/path><\/svg><\/a>/g, (_match, text) => {
return `" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${text} <svg class="size-3" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`
})
// Same concern as above, for the image markup
cleaned = cleaned.replace(/" alt="([^"]*)" class="max-w-\[400px\] object-contain my-0" \/>/g, (_match, alt) => {
return `" alt="${alt}" class="max-w-[400px] object-contain my-0" />`
})
// Convert any remaining markdown pipe tables to HTML tables
cleaned = this.cleanupMarkdownTables(cleaned)
return cleaned
}
/**
 * Replace any markdown pipe tables that survived conversion with real HTML
 * tables (Tailwind-styled). A candidate match must contain the markdown
 * dash separator row (|---|); otherwise it is left untouched.
 */
private cleanupMarkdownTables(html: string): string {
  const tableRegex = /(\|.*\|[\r\n]+\|[\s\-\|]*[\r\n]+(\|.*\|[\r\n]+)*)/g
  return html.replace(tableRegex, (match) => {
    const rows = match.trim().split('\n').filter(line => line.trim())
    if (rows.length < 2) return match
    const [headerRow, separatorRow, ...dataRows] = rows
    // Require the dash separator row to treat this match as a table
    if (!separatorRow.includes('-')) return match
    const splitCells = (row: string) =>
      row.split('|').map(cell => cell.trim()).filter(cell => cell)
    const headers = splitCells(headerRow)
    const pieces: string[] = []
    pieces.push('<table class="min-w-full border-collapse border border-gray-300 my-4">')
    pieces.push(' <thead>\n <tr>')
    for (const header of headers) {
      pieces.push(` <th class="border border-gray-300 px-4 py-2 bg-gray-50 font-semibold text-left">${header}</th>`)
    }
    pieces.push(' </tr>\n </thead>')
    pieces.push(' <tbody>')
    for (const cells of dataRows.map(splitCells)) {
      pieces.push(' <tr>')
      for (const cell of cells) {
        pieces.push(` <td class="border border-gray-300 px-4 py-2">${cell}</td>`)
      }
      pieces.push(' </tr>')
    }
    pieces.push(' </tbody>')
    pieces.push('</table>')
    return pieces.join('\n')
  })
}
/**
 * Collect media (images/videos) referenced by the content or the event's
 * imeta tags, de-duplicated by URL, in discovery order:
 * imeta tags -> markdown images -> AsciiDoc images -> bare URLs.
 */
private extractAllMedia(content: string, event?: Event): TImetaInfo[] {
  const media: TImetaInfo[] = []
  const seenUrls = new Set<string>()
  const isVideoUrl = (url: string) => /\.(mp4|webm|ogg)$/i.test(url)
  // Shared de-duplicating insert with a mime hint derived from the extension
  const addUrl = (url: string) => {
    if (seenUrls.has(url)) return
    media.push({
      url,
      pubkey: event?.pubkey || '',
      m: isVideoUrl(url) ? 'video/*' : 'image/*'
    })
    seenUrls.add(url)
  }
  // 1. imeta tags carry authoritative media metadata when present
  if (event) {
    for (const item of getImetaInfosFromEvent(event)) {
      if (!seenUrls.has(item.url)) {
        media.push(item)
        seenUrls.add(item.url)
      }
    }
  }
  // 2. Markdown images: ![alt](url)
  for (const raw of content.match(/!\[[^\]]*\]\(([^)]+)\)/g) || []) {
    const url = raw.match(/!\[[^\]]*\]\(([^)]+)\)/)?.[1]
    if (url) addUrl(url)
  }
  // 3. AsciiDoc images: image::url[...]
  for (const raw of content.match(/image::([^\[]+)\[/g) || []) {
    const url = raw.match(/image::([^\[]+)\[/)?.[1]
    if (url) addUrl(url)
  }
  // 4. Bare URLs that look like image/video files
  for (const url of content.match(URL_REGEX) || []) {
    if (/\.(jpeg|jpg|png|gif|webp|svg)$/i.test(url) || isVideoUrl(url)) {
      addUrl(url)
    }
  }
  return media
}
/**
 * Collect hyperlinks from markdown links, AsciiDoc link macros, and bare
 * URLs (nostr identifiers excluded), de-duplicated by URL.
 */
private extractLinks(content: string): Array<{ url: string; text: string; isExternal: boolean }> {
  const links: Array<{ url: string; text: string; isExternal: boolean }> = []
  const seenUrls = new Set<string>()
  // Shared de-duplicating insert
  const addLink = (url: string, text: string) => {
    if (seenUrls.has(url)) return
    links.push({ url, text, isExternal: this.isExternalUrl(url) })
    seenUrls.add(url)
  }
  // Markdown links: [text](url)
  for (const raw of content.match(/\[([^\]]+)\]\(([^)]+)\)/g) || []) {
    const parts = raw.match(/\[([^\]]+)\]\(([^)]+)\)/)
    if (parts) addLink(parts[2], parts[1])
  }
  // AsciiDoc links: link:url[text]
  for (const raw of content.match(/link:([^\[]+)\[([^\]]+)\]/g) || []) {
    const parts = raw.match(/link:([^\[]+)\[([^\]]+)\]/)
    if (parts) addLink(parts[1], parts[2])
  }
  // Bare URLs (skip nostr identifiers — handled by extractNostrLinks)
  for (const url of content.match(URL_REGEX) || []) {
    if (!this.isNostrUrl(url)) addLink(url, url)
  }
  return links
}
/**
 * Extract unique hashtag names (without the '#') from the content.
 *
 * Consistency fix: uses the same `\B#` anchor as processHashtags, so a hash
 * embedded in a word or URL ("example.com#anchor") is ignored; the previous
 * pattern matched '#' anywhere and leaked URL fragments in as hashtags.
 */
private extractHashtags(content: string): string[] {
  const hashtags: string[] = []
  const seenTags = new Set<string>()
  // '#tag' at start of string or after a non-word character only
  const hashtagMatches = content.match(/\B#([a-zA-Z0-9_]+)/g) || []
  hashtagMatches.forEach(_match => {
    const tag = _match.substring(1) // strip the leading '#'
    if (!seenTags.has(tag)) {
      hashtags.push(tag)
      seenTags.add(tag)
    }
  })
  return hashtags
}
/**
 * Extract Nostr entity references, both `nostr:`-prefixed and bare bech32
 * identifiers, de-duplicated by id.
 *
 * Bug fix: the bare-identifier scan now requires a known prefix followed by
 * the bech32 "1" separator. The previous pattern matched ANY 7+ character
 * alphanumeric word, so ordinary words like "notebook" were reported as
 * 'note' references. The nostr:-prefixed loop also de-duplicates now.
 */
private extractNostrLinks(content: string): Array<{ type: 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note'; id: string; text: string }> {
  const nostrLinks: Array<{ type: 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note'; id: string; text: string }> = []
  // nostr:-prefixed references
  const nostrMatches = content.match(/nostr:([a-z0-9]+[a-z0-9]{6,})/g) || []
  nostrMatches.forEach(_match => {
    const id = _match.substring(6) // strip 'nostr:'
    const type = this.getNostrType(id)
    if (type && !nostrLinks.some(link => link.id === id)) {
      nostrLinks.push({ type, id, text: _match })
    }
  })
  // Bare identifiers: known prefix + '1' separator + data part
  const rawNostrMatches = content.match(/\b(?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,}\b/g) || []
  rawNostrMatches.forEach(_match => {
    const type = this.getNostrType(_match)
    if (type && !nostrLinks.some(link => link.id === _match)) {
      nostrLinks.push({ type, id: _match, text: _match })
    }
  })
  return nostrLinks
}
/**
 * True when the URL points to a different host than the current page, or
 * cannot be parsed as an absolute URL at all.
 * NOTE(review): relies on `window`, so this assumes a browser context.
 */
private isExternalUrl(url: string): boolean {
  try {
    return new URL(url).hostname !== window.location.hostname
  } catch {
    // Relative or malformed URLs can't be compared; treat them as external.
    return true
  }
}
/**
 * True when the string is a nostr: URI or a bare Nostr bech32 identifier.
 */
private isNostrUrl(url: string): boolean {
  if (url.startsWith('nostr:')) return true
  return this.getNostrType(url) !== null
}
/**
 * Determine what a highlight event points at, from its tags.
 *
 * Selection: a tag explicitly marked 'source' wins outright; otherwise
 * 'e' (event id) tags beat 'a' (addressable coordinate) tags, which beat
 * 'r' (plain URL) tags. Returns at most one source, with a bech32 encoding
 * where one exists.
 */
private extractHighlightSources(event: Event): Array<{ type: 'event' | 'addressable' | 'url'; value: string; bech32: string }> {
const sources: Array<{ type: 'event' | 'addressable' | 'url'; value: string; bech32: string }> = []
// Check for 'source' marker first (highest priority); the marker position
// differs between tag shapes, hence checking both index 2 and 3
let sourceTag: string[] | undefined
for (const tag of event.tags) {
if (tag[2] === 'source' || tag[3] === 'source') {
sourceTag = tag
break
}
}
// If no 'source' marker found, process tags in priority order: e > a > r
if (!sourceTag) {
for (const tag of event.tags) {
// Give 'e' tags highest priority
// NOTE(review): no break here, so with multiple 'e' tags the LAST one
// wins; the 'r' branch likewise re-assigns on every later 'r' tag —
// confirm last-match-wins is intended.
if (tag[0] === 'e') {
sourceTag = tag
continue
}
// Give 'a' tags second priority (but don't override 'e' tags)
if (tag[0] === 'a' && (!sourceTag || sourceTag[0] !== 'e')) {
sourceTag = tag
continue
}
// Give 'r' tags lowest priority
if (tag[0] === 'r' && (!sourceTag || sourceTag[0] === 'r')) {
sourceTag = tag
continue
}
}
}
// Process the selected source tag
if (sourceTag) {
if (sourceTag[0] === 'e') {
sources.push({
type: 'event',
value: sourceTag[1],
// Event id -> note1... encoding
bech32: nip19.noteEncode(sourceTag[1])
})
} else if (sourceTag[0] === 'a') {
// 'a' tag value is "kind:pubkey:identifier"; optional relay hint at [2]
const [kind, pubkey, identifier] = sourceTag[1].split(':')
const relay = sourceTag[2]
sources.push({
type: 'addressable',
value: sourceTag[1],
bech32: nip19.naddrEncode({
kind: parseInt(kind),
pubkey,
identifier: identifier || '',
relays: relay ? [relay] : []
})
})
} else if (sourceTag[0] === 'r') {
// Plain URL source; there is no bech32 form, so the URL is reused
sources.push({
type: 'url',
value: sourceTag[1],
bech32: sourceTag[1]
})
}
}
return sources
}
/**
 * Map a bech32 identifier to its Nostr entity type by prefix, or null when
 * no known prefix matches.
 */
private getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
  // Prefixes are mutually non-overlapping for valid input; order preserved
  // from the original implementation.
  const prefixes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'] as const
  for (const prefix of prefixes) {
    if (id.startsWith(prefix)) return prefix
  }
  return null
}
/**
 * Detect LaTeX math: inline ($...$ or \(...\)) or block ($$...$$ or \[...\]).
 */
private hasMathContent(content: string): boolean {
  const inlinePattern = /\$[^$]+\$|\\\([^)]+\\\)/
  const blockPattern = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/
  return inlinePattern.test(content) || blockPattern.test(content)
}
/**
 * Parse one logical field of a Nostr event (content, or a tag-backed field)
 * through the full content pipeline. Returns an inert empty ParsedContent
 * when the field is absent.
 */
async parseEventField(
event: Event,
field: 'content' | 'title' | 'summary' | 'description',
options: Omit<ParseOptions, 'eventKind' | 'field'> = {}
): Promise<ParsedContent> {
  const content = this.getFieldContent(event, field)
  if (content) {
    return this.parseContent(content, { ...options, eventKind: event.kind, field }, event)
  }
  // Empty field: skip the pipeline entirely
  return {
    html: '',
    markupType: 'plain-text',
    cssClasses: getMarkupClasses('plain-text'),
    hasMath: false,
    media: [],
    links: [],
    hashtags: [],
    nostrLinks: [],
    highlightSources: []
  }
}
/**
 * Read a logical field from a Nostr event: 'content' comes from the event
 * body, the others from the first matching tag (empty string when missing).
 */
private getFieldContent(event: Event, field: 'content' | 'title' | 'summary' | 'description'): string {
switch (field) {
case 'content':
return event.content
case 'title':
return event.tags.find(tag => tag[0] === 'title')?.[1] || ''
case 'summary':
return event.tags.find(tag => tag[0] === 'summary')?.[1] || ''
case 'description':
// NOTE(review): this reads the 'd' (identifier) tag, not a
// 'description' tag — confirm that is intended for this app.
return event.tags.find(tag => tag[0] === 'd')?.[1] || ''
default:
return ''
}
}
/**
 * Post-process rendered HTML with utility classes for inline formatting
 * spans and normalize highlight.js class names on pre/code elements.
 */
private addStylingClasses(html: string): string {
  return html
    // strikethrough / subscript / superscript spans get their sizing classes
    .replace(/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>')
    .replace(/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>')
    .replace(/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>')
    // normalize highlight.js class names emitted by AsciiDoctor
    .replace(/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">')
    .replace(/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">')
}
/**
 * Strip raw Table-of-Contents text that AsciiDoctor sometimes leaves in the
 * body: headings/paragraphs like "Table of Contents (5)" and the stray
 * "Assumptions ... [n=0]" artifact.
 */
private hideRawTocText(html: string): string {
  const rawTocPatterns: RegExp[] = [
    // ToC rendered as a heading
    /<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi,
    // ToC rendered as a plain paragraph
    /<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi,
    // Leftover "Assumptions ... [n=0]" artifact
    /<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi
  ]
  let cleaned = html
  for (const pattern of rawTocPatterns) {
    cleaned = cleaned.replace(pattern, '')
  }
  return cleaned
}
}
// Export singleton instance
export const contentParserService = new ContentParserService()
export default contentParserService