/**
 * Comprehensive content parsing service for all Nostr content fields.
 * Supports AsciiDoc, Advanced Markdown, Basic Markdown, and LaTeX.
 */
import { detectMarkupType, getMarkupClasses, MarkupType } from '@/lib/markup-detection'
import { Event, kinds, nip19 } from 'nostr-tools'
import { getImetaInfosFromEvent } from '@/lib/event'
import { URL_REGEX, ExtendedKind } from '@/constants'
import { TImetaInfo } from '@/types'

export interface ParsedContent {
  html: string
  markupType: MarkupType
  cssClasses: string
  hasMath: boolean
  media: TImetaInfo[]
  links: Array<{ url: string; text: string; isExternal: boolean }>
  hashtags: string[]
  nostrLinks: Array<{ type: 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note'; id: string; text: string }>
  highlightSources: Array<{ type: 'event' | 'addressable' | 'url'; value: string; bech32: string }>
}

export interface ParseOptions {
  eventKind?: number
  field?: 'content' | 'title' | 'summary' | 'description'
  maxWidth?: string
  enableMath?: boolean
  enableSyntaxHighlighting?: boolean
}

class ContentParserService {
  private asciidoctor: any = null
  private isAsciidoctorLoaded = false

  /**
   * Initialize AsciiDoctor (lazy loading)
   */
  private async loadAsciidoctor() {
    if (this.isAsciidoctorLoaded) return this.asciidoctor

    try {
      const Asciidoctor = await import('@asciidoctor/core')
      this.asciidoctor = Asciidoctor.default()
      this.isAsciidoctorLoaded = true
      return this.asciidoctor
    } catch (error) {
      console.warn('Failed to load AsciiDoctor:', error)
      return null
    }
  }

  /**
   * Parse content with the appropriate markup processor
   */
  async parseContent(
    content: string,
    options: ParseOptions = {},
    event?: Event
  ): Promise<ParsedContent> {
    const { eventKind, enableMath = true, enableSyntaxHighlighting = true } = options

    // Detect markup type
    const markupType = detectMarkupType(content, eventKind)
    const cssClasses = getMarkupClasses(markupType)

    // Extract all content elements.
    // For article-type events, don't extract media: it should be rendered inline.
    const isArticleType =
      eventKind === kinds.LongFormArticle ||
      eventKind === ExtendedKind.WIKI_ARTICLE ||
      eventKind === ExtendedKind.PUBLICATION ||
      eventKind === ExtendedKind.PUBLICATION_CONTENT
    const media = isArticleType ? [] : this.extractAllMedia(content, event)
    const links = this.extractLinks(content)
    const hashtags = this.extractHashtags(content)
    const nostrLinks = this.extractNostrLinks(content)
    const highlightSources = event ? this.extractHighlightSources(event) : []

    // Check for LaTeX math
    const hasMath = enableMath && this.hasMathContent(content)

    let html = ''
    try {
      // Convert everything to AsciiDoc format and process as AsciiDoc
      const asciidocContent = this.convertToAsciidoc(content, markupType)
      html = await this.parseAsciidoc(asciidocContent, { enableMath, enableSyntaxHighlighting })
    } catch (error) {
      console.error('Content parsing error:', error)
      // Fall back to plain text
      html = this.parsePlainText(content)
    }

    return {
      html,
      markupType: 'asciidoc', // content was normalized to AsciiDoc above
      cssClasses,
      hasMath,
      media,
      links,
      hashtags,
      nostrLinks,
      highlightSources
    }
  }
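
  // Illustrative result for parseContent('hello #nostr world') (a sketch; the
  // exact HTML wrapper markup comes from Asciidoctor and the link converters):
  //   {
  //     html: '...<p>hello <a ...>#nostr</a> world</p>...',
  //     markupType: 'asciidoc',
  //     hashtags: ['nostr'],
  //     ...
  //   }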

  /**
   * Parse AsciiDoc content
   */
  private async parseAsciidoc(
    content: string,
    options: { enableMath: boolean; enableSyntaxHighlighting: boolean }
  ): Promise<string> {
    const asciidoctor = await this.loadAsciidoctor()
    if (!asciidoctor) {
      return this.parsePlainText(content)
    }

    // Check if the content starts with a level 3+ header (=== or deeper).
    // The Asciidoctor article doctype requires a level 1 (=) or level 2 (==) section
    // before a level 3 (===) one. If the content starts at level 3+, use the book
    // doctype, which allows sections at any level.
    const firstHeaderMatch = content.match(/^(={1,6})\s+/m)
    let doctype: 'article' | 'book' = 'article'
    if (firstHeaderMatch) {
      const firstHeaderLevel = firstHeaderMatch[1].length
      // If the first header is level 3 or deeper, use the book doctype,
      // which allows sections at any level without requiring a strict hierarchy
      if (firstHeaderLevel >= 3) {
        doctype = 'book'
      }
    }

    try {
      const result = asciidoctor.convert(content, {
        safe: 'safe',
        backend: 'html5',
        doctype: doctype,
        attributes: {
          'showtitle': true,
          'sectanchors': true,
          'sectlinks': true,
          'toc': 'left',
          'toclevels': 6,
          'toc-title': 'Table of Contents',
          'source-highlighter': options.enableSyntaxHighlighting ? 'highlight.js' : 'none',
          'stem': options.enableMath ? 'latexmath' : 'none',
          'data-uri': true,
          'imagesdir': '',
          'linkcss': false,
          'stylesheet': '',
          'stylesdir': '',
          'prewrap': true,
          'sectnums': false,
          'sectnumlevels': 6,
          'experimental': true,
          'compat-mode': false,
          'attribute-missing': 'warn',
          'attribute-undefined': 'warn',
          'skip-front-matter': true,
          'source-indent': 0,
          'indent': 0,
          'tabsize': 2,
          'tabwidth': 2,
          'hardbreaks': false,
          'paragraph-rewrite': 'normal',
          'sectids': true,
          'idprefix': '',
          'idseparator': '-',
          'sectidprefix': '',
          'sectidseparator': '-'
        }
      })
      const htmlString = typeof result === 'string' ? result : result.toString()

      // Debug: log the AsciiDoc HTML output for troubleshooting
      if (process.env.NODE_ENV === 'development') {
        console.log('AsciiDoc HTML output:', htmlString.substring(0, 1000) + '...')
      }

      // Process wikilinks in the HTML output
      const processedHtml = this.processWikilinksInHtml(htmlString)

      // Process images: add max-width styling and prepare for the carousel
      const imagesProcessedHtml = this.processImagesInHtml(processedHtml)

      // Clean up any leftover markdown syntax
      const cleanedHtml = this.cleanupMarkdown(imagesProcessedHtml)

      // Add proper CSS classes for styling
      const styledHtml = this.addStylingClasses(cleanedHtml)

      // Hide any raw AsciiDoc ToC text that might appear in the content
      return this.hideRawTocText(styledHtml)
    } catch (error) {
      console.error('AsciiDoc parsing error:', error)
      return this.parsePlainText(content)
    }
  }
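
  // For example: content whose first header is "=== Notes" starts at level 3, so it
  // is converted with doctype 'book'; content starting with "== Notes" keeps 'article'.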

  /**
   * Convert content to AsciiDoc format based on the markup type
   */
  private convertToAsciidoc(content: string, markupType: string): string {
    let asciidoc = ''

    switch (markupType) {
      case 'asciidoc':
        // For AsciiDoc content, ensure proper formatting.
        // Convert escaped newlines to actual newlines
        asciidoc = content.replace(/\\n/g, '\n')
        // Ensure headers are on their own lines with proper spacing.
        // AsciiDoc requires a blank line before a header that follows other content.
        // Fix pattern: non-empty line + newline + header, without a blank line between
        asciidoc = asciidoc.replace(/(\S[^\n]*)\n(={1,6}\s+[^\n]+)/g, (_match, before, header) => {
          // Add a blank line before the header if it follows non-empty content
          return `${before}\n\n${header}`
        })
        break
      case 'advanced-markdown':
      case 'basic-markdown':
        asciidoc = this.convertMarkdownToAsciidoc(content)
        break
      case 'plain-text':
      default:
        asciidoc = this.convertPlainTextToAsciidoc(content)
        break
    }

    // Process wikilinks for all content types
    let result = this.processWikilinks(asciidoc)

    // Process nostr: addresses - convert them to proper AsciiDoc format
    result = this.processNostrAddresses(result)

    // Process hashtags - convert them to proper AsciiDoc format
    result = this.processHashtags(result)

    // Debug: log the converted AsciiDoc for troubleshooting
    if (process.env.NODE_ENV === 'development') {
      console.log('Converted AsciiDoc:', result)
    }

    return result
  }
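
  // For example, the header-spacing fix above turns "Some text\n== Section" into
  // "Some text\n\n== Section", so Asciidoctor recognizes the section title.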

  /**
   * Convert Markdown to AsciiDoc format
   */
  private convertMarkdownToAsciidoc(content: string): string {
    // Preprocess: convert escaped newlines to actual newlines
    let asciidoc = content.replace(/\\n/g, '\n')

    // Preprocess: keep inline code followed by a parenthetical properly spaced,
    // e.g. `sqlite` (database)
    asciidoc = asciidoc.replace(/`([^`\n]+)`\s*\(([^)]+)\)/g, '`$1` ($2)')

    // Fix spacing issues where text runs together
    asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`([a-zA-Z0-9])/g, '$1 `$2` $3')
    asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`\s*\(/g, '$1 `$2` (')
    asciidoc = asciidoc.replace(/\)`([^`\n]+)`([a-zA-Z0-9])/g, ') `$1` $2')

    // Fix specific pattern: text)text -> text) text
    asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2')

    // Fix specific pattern: text== -> text ==
    asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 ==')

    // Handle nostr: addresses - preserve them as-is for now; they are processed later.
    // (A deliberate no-op replace that documents the intent and prevents them from
    // being converted to AsciiDoc link syntax here.)
    asciidoc = asciidoc.replace(/nostr:([a-z0-9]+)/g, 'nostr:$1')

    // Convert headers - process in order from most specific to least specific
    asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======')
    asciidoc = asciidoc.replace(/^#{5}\s+(.+)$/gm, '===== $1 =====')
    asciidoc = asciidoc.replace(/^#{4}\s+(.+)$/gm, '==== $1 ====')
    asciidoc = asciidoc.replace(/^#{3}\s+(.+)$/gm, '=== $1 ===')
    asciidoc = asciidoc.replace(/^#{2}\s+(.+)$/gm, '== $1 ==')
    asciidoc = asciidoc.replace(/^#{1}\s+(.+)$/gm, '= $1 =')

    // Normalize markdown-style "== Header ==" lines to AsciiDoc
    asciidoc = asciidoc.replace(/^==\s+(.+?)\s+==$/gm, '== $1 ==')
    // Also handle inline == headers that might appear in the middle of text
    asciidoc = asciidoc.replace(/\s==\s+([^=]+?)\s+==\s/g, ' == $1 == ')

    // Convert emphasis. Bold goes through a placeholder first so the
    // single-asterisk italic pass below cannot clobber the converted bold markers.
    asciidoc = asciidoc.replace(/\*\*(.+?)\*\*/g, '\u0000$1\u0000') // Bold **text**
    asciidoc = asciidoc.replace(/__(.+?)__/g, '\u0000$1\u0000') // Bold __text__
    asciidoc = asciidoc.replace(/\*(.+?)\*/g, '_$1_') // Italic *text*
    asciidoc = asciidoc.replace(/_(.+?)_/g, '_$1_') // Italic _text_ (already AsciiDoc syntax)
    asciidoc = asciidoc.replace(/\u0000(.+?)\u0000/g, '*$1*') // Restore bold as AsciiDoc *text*
    asciidoc = asciidoc.replace(/~~(.+?)~~/g, '[line-through]#$1#') // Strikethrough
    asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#') // Subscript
    asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#') // Superscript

    // Convert code blocks - use precise matching to avoid capturing regular text
    asciidoc = asciidoc.replace(/```(\w+)?\n([\s\S]*?)\n```/g, (_match, lang, code) => {
      // Ensure we don't capture too much content and that it looks like actual code
      const trimmedCode = code.trim()
      if (trimmedCode.length === 0) return ''

      // Check whether this looks like actual code (programming syntax patterns)
      const hasCodePatterns =
        /[{}();=<>]|function|class|import|export|def |if |for |while |return |const |let |var |public |private |static |console\.log/.test(
          trimmedCode
        )

      // Additional checks for common non-code patterns
      const isLikelyText = /^[A-Za-z\s.,!?\-'"]+$/.test(trimmedCode) && trimmedCode.length > 50
      const hasTooManySpaces = (trimmedCode.match(/\s{3,}/g) || []).length > 3
      const hasMarkdownPatterns = /^#{1,6}\s|^\*\s|^\d+\.\s|^>\s|^\|.*\|/.test(trimmedCode)

      // If it doesn't look like code, has too many spaces, or looks like markdown,
      // leave it as regular text
      if (
        (!hasCodePatterns && trimmedCode.length > 100) ||
        isLikelyText ||
        hasTooManySpaces ||
        hasMarkdownPatterns
      ) {
        return _match // Return the original markdown
      }

      return `[source${lang ? ',' + lang : ''}]\n----\n${trimmedCode}\n----`
    })
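
    // Illustrative behavior of the fence heuristic above:
    //   "```js\nconsole.log(1)\n```"  ->  "[source,js]\n----\nconsole.log(1)\n----"
    //   A fence holding only long prose sentences is returned unchanged.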

    asciidoc = asciidoc.replace(/`([^`]+)`/g, '`$1`') // Inline code (already AsciiDoc syntax)

    // Handle LaTeX math in inline code - preserve the $...$ syntax
    asciidoc = asciidoc.replace(/`\$([^$]+)\$`/g, '`$\\$1\\$$`')

    // Convert images - use proper AsciiDoc image syntax
    asciidoc = asciidoc.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, 'image::$2[$1,width=100%]')
    // Also normalize the image::url[alt,width=100%] form that may already be in the
    // content, so it is properly formatted for AsciiDoc
    asciidoc = asciidoc.replace(/image::([^\[]+)\[([^\]]+),width=100%\]/g, 'image::$1[$2,width=100%]')

    // Convert links
    asciidoc = asciidoc.replace(/\[([^\]]+)\]\(([^)]+)\)/g, 'link:$2[$1]')

    // Convert horizontal rules
    asciidoc = asciidoc.replace(/^---$/gm, '\n---\n')

    // Convert unordered lists
    asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2')
    asciidoc = asciidoc.replace(/^(\s*)-\s+(.+)$/gm, '$1* $2')
    asciidoc = asciidoc.replace(/^(\s*)\+\s+(.+)$/gm, '$1* $2')

    // Convert ordered lists
    asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2')

    // Convert blockquotes - handle multiline blockquotes, with separate attribution
    asciidoc = asciidoc.replace(/^(>\s+.+(?:\n>\s+.+)*)/gm, (match) => {
      // Remove '>' and the optional following space from each line
      const lines = match.split('\n').map((line) => line.replace(/^>\s*/, ''))

      // Default to the whole quote; trimmed below if an attribution line is found
      let quoteBodyLines: string[] = lines
      let attributionLine: string | undefined

      // Find the last line that looks like an attribution (starts with '—' or '--')
      for (let i = lines.length - 1; i >= 0; i--) {
        const line = lines[i].trim()
        if (line.startsWith('—') || line.startsWith('--')) {
          attributionLine = line
          // Everything before the attribution is the quote body
          quoteBodyLines = lines.slice(0, i)
          break
        }
      }

      const quoteContent = quoteBodyLines.filter((l) => l.trim() !== '').join('\n').trim()

      if (attributionLine) {
        // Remove the leading '—' or '--' from the attribution line
        const cleanedAttribution = attributionLine.replace(/^[—-]+/, '').trim()
        let author = ''
        let source = ''

        // Try to find a link:url[text] pattern (already converted from markdown links).
        // Example: "George Bernard Shaw, link:https://www.goodreads.com/work/quotes/376394[Man and Superman]"
        const linkMatch = cleanedAttribution.match(/^(.*?),?\s*link:([^[\]]+)\[([^\]]+)\]$/)
        if (linkMatch) {
          author = linkMatch[1].trim()
          // Use the AsciiDoc link format directly in the source attribute
          source = `link:${linkMatch[2].trim()}[${linkMatch[3].trim()}]`
        } else {
          // If no link, assume the whole thing is "author" or "author, sourceText"
          const parts = cleanedAttribution.split(',').map((p) => p.trim())
          author = parts[0]
          if (parts.length > 1) {
            source = parts.slice(1).join(', ').trim()
          }
        }

        // AsciiDoc blockquote with attribution: [quote, author, source]
        return `[quote, ${author}, ${source}]\n____\n${quoteContent}\n____`
      } else {
        // If no attribution line is found, render a regular AsciiDoc blockquote
        return `____\n${quoteContent}\n____`
      }
    })
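
    // For example, the attribution handling above turns
    //   > Some quoted line.
    //   > -- George Bernard Shaw, link:https://www.goodreads.com/work/quotes/376394[Man and Superman]
    // into
    //   [quote, George Bernard Shaw, link:https://www.goodreads.com/work/quotes/376394[Man and Superman]]
    //   ____
    //   Some quoted line.
    //   ____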

    // Convert lists (second pass, same rules as above)
    asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2') // Unordered lists
    asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2') // Ordered lists

    // Convert links (second pass)
    asciidoc = asciidoc.replace(/\[([^\]]+)\]\(([^)]+)\)/g, 'link:$2[$1]')

    // Convert images (second pass)
    asciidoc = asciidoc.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, 'image::$2[$1]')

    // Convert tables (basic support) - handle markdown table rows
    asciidoc = asciidoc.replace(/^\|(.+)\|$/gm, (match, rowContent) => {
      // Check that this is a table row (not just a single cell)
      const cells = rowContent.split('|').map((cell: string) => cell.trim()).filter((cell: string) => cell)
      if (cells.length > 1) {
        return '|' + rowContent + '|'
      }
      return match
    })

    // Fix table rendering by ensuring proper AsciiDoc table format
    asciidoc = asciidoc.replace(/(\|.*\|[\r\n]+\|[\s\-\|]*[\r\n]+(\|.*\|[\r\n]+)*)/g, (match) => {
      const lines = match.trim().split('\n').filter((line) => line.trim())
      if (lines.length < 2) return match

      const headerRow = lines[0]
      const separatorRow = lines[1]
      const dataRows = lines.slice(2)

      // Check that it is actually a table (the separator row contains dashes)
      if (!separatorRow.includes('-')) return match

      // Convert to proper AsciiDoc table format
      let tableAsciidoc = '[cols="1,1"]\n|===\n'
      tableAsciidoc += headerRow + '\n'
      dataRows.forEach((row) => {
        tableAsciidoc += row + '\n'
      })
      tableAsciidoc += '|==='
      return tableAsciidoc
    })

    // Convert horizontal rules
    asciidoc = asciidoc.replace(/^---$/gm, "'''")

    // Convert footnotes - handle both references and definitions for auto-numbering
    const footnoteDefinitions: { [id: string]: string } = {}
    let tempAsciidoc = asciidoc

    // First, extract all footnote definitions and remove them from the content.
    // This regex captures "[^id]: text", including multi-line content
    tempAsciidoc = tempAsciidoc.replace(
      /^\[\^([^\]]+)\]:\s*([\s\S]*?)(?=\n\[\^|\n---|\n##|\n###|\n####|\n#####|\n######|$)/gm,
      (_, id, text) => {
        footnoteDefinitions[id] = text.trim()
        return '' // Remove the definition line from the content
      }
    )

    // Then replace all footnote references [^id] with AsciiDoc's auto-numbered
    // footnote syntax, using the extracted definitions
    asciidoc = tempAsciidoc.replace(/\[\^([^\]]+)\]/g, (match, id) => {
      if (footnoteDefinitions[id]) {
        return `footnote:[${footnoteDefinitions[id]}]`
      }
      return match // If the definition is not found, leave the reference as-is
    })

    return asciidoc
  }
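
  // For example, the footnote pass in convertMarkdownToAsciidoc turns
  //   "A claim.[^1]" plus a "[^1]: Supporting note." definition line
  // into
  //   "A claim.footnote:[Supporting note.]"
  // and Asciidoctor numbers the footnotes automatically.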

  /**
   * Process nostr: addresses in content
   */
  private processNostrAddresses(content: string): string {
    let processed = content

    // Convert nostr: addresses to AsciiDoc link format.
    // This regex matches nostr: followed by a plausible bech32 string
    processed = processed.replace(/nostr:([a-z0-9]+[a-z0-9]{6,})/g, (_match, bech32Id) => {
      // Create an AsciiDoc link with the nostr: prefix
      return `link:nostr:${bech32Id}[${bech32Id}]`
    })

    return processed
  }

  /**
   * Process hashtags in content
   */
  private processHashtags(content: string): string {
    let processed = content

    // Convert hashtags to an AsciiDoc macro: #hashtag -> hashtag:tag[#tag].
    // The \B boundary avoids matching # inside URLs, code blocks, etc.
    processed = processed.replace(/\B#([a-zA-Z0-9_]+)/g, (_match, hashtag) => {
      // Normalize the hashtag to lowercase for consistency
      const normalizedHashtag = hashtag.toLowerCase()
      return `hashtag:${normalizedHashtag}[#${hashtag}]`
    })

    return processed
  }

  /**
   * Process wikilinks in content (both standard and bookstr macro)
   */
  private processWikilinks(content: string): string {
    let processed = content

    // Process bookstr macro wikilinks: [[book:...]], where ... can be any book type and reference
    processed = processed.replace(/\[\[book:([^\]]+)\]\]/g, (_match, bookContent) => {
      const cleanContent = bookContent.trim()
      const dTag = this.normalizeDtag(cleanContent)
      return `wikilink:${dTag}[${cleanContent}]`
    })

    // Process standard wikilinks: [[Target Page]] or [[target page|see this]]
    processed = processed.replace(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g, (_match, target, displayText) => {
      const cleanTarget = target.trim()
      const cleanDisplay = displayText ? displayText.trim() : cleanTarget
      const dTag = this.normalizeDtag(cleanTarget)
      return `wikilink:${dTag}[${cleanDisplay}]`
    })

    return processed
  }

  /**
   * Normalize text to d-tag format (lowercase, non-alphanumeric runs to dashes)
   */
  private normalizeDtag(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+|-+$/g, '')
  }
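
  // For example: [[Target Page|see this]] becomes wikilink:target-page[see this],
  // since normalizeDtag('Target Page') === 'target-page'.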

  /**
   * Process wikilinks and nostr links in HTML output.
   * The exact classes below are presentation details; client-side renderers key
   * on the data-* attributes.
   */
  private processWikilinksInHtml(html: string): string {
    let processed = html

    // Convert hashtag macros to HTML links with green styling
    processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
      return `<a class="text-green-600 cursor-pointer" data-hashtag="${normalizedHashtag}">${displayText}</a>`
    })

    // Convert wikilink:dtag[display] format to HTML with data attributes
    processed = processed.replace(/wikilink:([^[]+)\[([^\]]+)\]/g, (_match, dTag, displayText) => {
      return `<a class="wikilink cursor-pointer underline" data-wikilink-dtag="${dTag}">${displayText}</a>`
    })

    // Convert nostr: links to proper embedded components
    processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
      const nostrType = this.getNostrType(bech32Id)

      if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
        // Render as an embedded event placeholder, hydrated client-side
        return `<div class="embedded-nostr-event" data-nostr-bech32="${bech32Id}">Loading embedded event...</div>`
      } else if (nostrType === 'npub' || nostrType === 'nprofile') {
        // Render as a user handle, hydrated client-side
        return `<span class="nostr-user-handle" data-nostr-bech32="${bech32Id}">@${displayText}</span>`
      } else {
        // Fallback to a regular link
        return `<a href="nostr:${bech32Id}">${displayText}</a>`
      }
    })

    return processed
  }

  /**
   * Process images in HTML output: add max-width styling and data attributes for the carousel
   */
  private processImagesInHtml(html: string): string {
    let processed = html

    // Extract all image URLs for the carousel
    const imageUrls: string[] = []
    const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi
    let match
    while ((match = imageUrlRegex.exec(html)) !== null) {
      const url = match[1]
      if (url && !imageUrls.includes(url)) {
        imageUrls.push(url)
      }
    }

    // Process each img tag: add max-width styling and data attributes
    processed = processed.replace(/<img([^>]+)>/gi, (imgTag, attributes) => {
      // Extract the src attribute
      const srcMatch = attributes.match(/src=["']([^"']+)["']/i)
      if (!srcMatch) return imgTag

      const src = srcMatch[1]
      const currentIndex = imageUrls.indexOf(src)

      // Add/update the class for max-width
      let updatedAttributes = attributes
      if (updatedAttributes.match(/class=["']/i)) {
        updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
          // Remove existing max-w classes and add our max-w-[400px]
          const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim()
          const newClasses = cleanedClasses
            ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
            : 'max-w-[400px] object-contain cursor-zoom-in'
          return `class="${newClasses}"`
        })
      } else {
        updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`
      }

      // Add data attributes for the carousel
      updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '&quot;')}"`

      return `<img${updatedAttributes}>`
    })

    return processed
  }

  /**
   * Convert plain text to AsciiDoc format
   */
  private convertPlainTextToAsciidoc(content: string): string {
    // Preserve paragraph breaks, then mark remaining single newlines
    // as AsciiDoc hard line breaks (' +')
    return content
      .replace(/\n\n/g, '\u0000')
      .replace(/\n/g, ' +\n')
      .replace(/\u0000/g, '\n\n')
  }

  /**
   * Parse plain text content
   */
  private parsePlainText(content: string): string {
    // Convert line breaks to HTML
    return content
      .replace(/\n\n/g, '</p><p>')
      .replace(/\n/g, '<br>')
      .replace(/^/, '<p>')
      .replace(/$/, '</p>')
  }
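
  // For example: parsePlainText('line one\nline two\n\npara two') yields
  // '<p>line one<br>line two</p><p>para two</p>'.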

  /**
   * Clean up leftover markdown syntax after AsciiDoc processing
   */
  private cleanupMarkdown(html: string): string {
    let cleaned = html

    // Clean up markdown image syntax: ![alt](url)
    cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
      const altText = alt || ''
      return `<img src="${url}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`
    })

    // Clean up markdown link syntax: [text](url)
    cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
      // Skip if it is already rendered as an HTML link
      if (cleaned.includes(`href="${url}"`)) {
        return _match
      }
      return `<a href="${url}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${text}</a>`
    })

    // Fix broken anchor markup that is being rendered as text
    cleaned = cleaned.replace(
      /" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">([^<]+) <svg[^>]*><path[^>]*><\/path><\/svg><\/a>/g,
      (_match, text) => {
        return `" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${text}</a>`
      }
    )

    // Fix broken image markup
    cleaned = cleaned.replace(/" alt="([^"]*)" class="max-w-\[400px\] object-contain my-0" \/>/g, (_match, alt) => {
      return `" alt="${alt}" class="max-w-[400px] object-contain my-0" />`
    })

    // Clean up markdown table syntax
    cleaned = this.cleanupMarkdownTables(cleaned)

    return cleaned
  }

  /**
   * Clean up markdown tables
   */
  private cleanupMarkdownTables(html: string): string {
    const tableRegex = /(\|.*\|[\r\n]+\|[\s\-\|]*[\r\n]+(\|.*\|[\r\n]+)*)/g

    return html.replace(tableRegex, (match) => {
      const lines = match.trim().split('\n').filter((line) => line.trim())
      if (lines.length < 2) return match

      const headerRow = lines[0]
      const separatorRow = lines[1]
      const dataRows = lines.slice(2)

      // Check that it is actually a table (the separator row contains dashes)
      if (!separatorRow.includes('-')) return match

      const headers = headerRow.split('|').map((cell) => cell.trim()).filter((cell) => cell)
      const rows = dataRows.map((row) =>
        row.split('|').map((cell) => cell.trim()).filter((cell) => cell)
      )

      let tableHtml = '<table class="border-collapse border border-gray-300">\n'

      // Header
      tableHtml += '  <thead>\n    <tr>\n'
      headers.forEach((header) => {
        tableHtml += `      <th class="border border-gray-300 px-4 py-2">${header}</th>\n`
      })
      tableHtml += '    </tr>\n  </thead>\n'

      // Body
      tableHtml += '  <tbody>\n'
      rows.forEach((row) => {
        tableHtml += '    <tr>\n'
        row.forEach((cell) => {
          tableHtml += `      <td class="border border-gray-300 px-4 py-2">${cell}</td>\n`
        })
        tableHtml += '    </tr>\n'
      })
      tableHtml += '  </tbody>\n'
      tableHtml += '</table>'

      return tableHtml
    })
  }

  /**
   * Extract all media from content and event
   */
  private extractAllMedia(content: string, event?: Event): TImetaInfo[] {
    const media: TImetaInfo[] = []
    const seenUrls = new Set<string>()

    // 1. Extract from imeta tags if an event is provided
    if (event) {
      const imetaMedia = getImetaInfosFromEvent(event)
      imetaMedia.forEach((item) => {
        if (!seenUrls.has(item.url)) {
          media.push(item)
          seenUrls.add(item.url)
        }
      })
    }

    // 2. Extract from markdown images: ![alt](url)
    const imageMatches = content.match(/!\[[^\]]*\]\(([^)]+)\)/g) || []
    imageMatches.forEach((match) => {
      const url = match.match(/!\[[^\]]*\]\(([^)]+)\)/)?.[1]
      if (url && !seenUrls.has(url)) {
        const isVideo = /\.(mp4|webm|ogg)$/i.test(url)
        media.push({ url, pubkey: event?.pubkey || '', m: isVideo ? 'video/*' : 'image/*' })
        seenUrls.add(url)
      }
    })

    // 3. Extract from AsciiDoc images: image::url[alt,width]
    const asciidocImageMatches = content.match(/image::([^\[]+)\[/g) || []
    asciidocImageMatches.forEach((match) => {
      const url = match.match(/image::([^\[]+)\[/)?.[1]
      if (url && !seenUrls.has(url)) {
        const isVideo = /\.(mp4|webm|ogg)$/i.test(url)
        media.push({ url, pubkey: event?.pubkey || '', m: isVideo ? 'video/*' : 'image/*' })
        seenUrls.add(url)
      }
    })

    // 4. Extract raw URLs from content
    const rawUrls = content.match(URL_REGEX) || []
    rawUrls.forEach((url) => {
      if (!seenUrls.has(url)) {
        const isImage = /\.(jpeg|jpg|png|gif|webp|svg)$/i.test(url)
        const isVideo = /\.(mp4|webm|ogg)$/i.test(url)
        if (isImage || isVideo) {
          media.push({ url, pubkey: event?.pubkey || '', m: isVideo ? 'video/*' : 'image/*' })
          seenUrls.add(url)
        }
      }
    })

    return media
  }
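
  // For example: extractAllMedia('see image::https://example.com/a.png[pic]')
  // yields [{ url: 'https://example.com/a.png', pubkey: '', m: 'image/*' }].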

  /**
   * Extract all links from content
   */
  private extractLinks(content: string): Array<{ url: string; text: string; isExternal: boolean }> {
    const links: Array<{ url: string; text: string; isExternal: boolean }> = []
    const seenUrls = new Set<string>()

    // Extract markdown links: [text](url)
    const markdownLinks = content.match(/\[([^\]]+)\]\(([^)]+)\)/g) || []
    markdownLinks.forEach((_match) => {
      const linkMatch = _match.match(/\[([^\]]+)\]\(([^)]+)\)/)
      if (linkMatch) {
        const [, text, url] = linkMatch
        if (!seenUrls.has(url)) {
          links.push({ url, text, isExternal: this.isExternalUrl(url) })
          seenUrls.add(url)
        }
      }
    })

    // Extract AsciiDoc links: link:url[text]
    const asciidocLinks = content.match(/link:([^\[]+)\[([^\]]+)\]/g) || []
    asciidocLinks.forEach((_match) => {
      const linkMatch = _match.match(/link:([^\[]+)\[([^\]]+)\]/)
      if (linkMatch) {
        const [, url, text] = linkMatch
        if (!seenUrls.has(url)) {
          links.push({ url, text, isExternal: this.isExternalUrl(url) })
          seenUrls.add(url)
        }
      }
    })

    // Extract raw URLs
    const rawUrls = content.match(URL_REGEX) || []
    rawUrls.forEach((url) => {
      if (!seenUrls.has(url) && !this.isNostrUrl(url)) {
        links.push({ url, text: url, isExternal: this.isExternalUrl(url) })
        seenUrls.add(url)
      }
    })

    return links
  }

  /**
   * Extract hashtags from content
   */
  private extractHashtags(content: string): string[] {
    const hashtags: string[] = []
    const seenTags = new Set<string>()

    // Extract hashtags: #hashtag
    const hashtagMatches = content.match(/#([a-zA-Z0-9_]+)/g) || []
    hashtagMatches.forEach((_match) => {
      const tag = _match.substring(1) // Remove the leading '#'
      if (!seenTags.has(tag)) {
        hashtags.push(tag)
        seenTags.add(tag)
      }
    })

    return hashtags
  }

  /**
   * Extract Nostr links from content
   */
  private extractNostrLinks(
    content: string
  ): Array<{ type: 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note'; id: string; text: string }> {
    const nostrLinks: Array<{ type: 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note'; id: string; text: string }> = []

    // Extract nostr: prefixed links
    const nostrMatches = content.match(/nostr:([a-z0-9]+[a-z0-9]{6,})/g) || []
    nostrMatches.forEach((_match) => {
      const id = _match.substring(6) // Remove the 'nostr:' prefix
      const type = this.getNostrType(id)
      if (type) {
        nostrLinks.push({ type, id, text: _match })
      }
    })

    // Extract raw nostr identifiers
    const rawNostrMatches = content.match(/([a-z0-9]+[a-z0-9]{6,})/g) || []
    rawNostrMatches.forEach((_match) => {
      const type = this.getNostrType(_match)
      if (type && !nostrLinks.some((link) => link.id === _match)) {
        nostrLinks.push({ type, id: _match, text: _match })
      }
    })

    return nostrLinks
  }

  /**
   * Check if a URL is external
   */
  private isExternalUrl(url: string): boolean {
    try {
      const urlObj = new URL(url)
      return urlObj.hostname !== window.location.hostname
    } catch {
      return true
    }
  }

  /**
   * Check if a URL is a Nostr URL
   */
  private isNostrUrl(url: string): boolean {
    return url.startsWith('nostr:') || this.getNostrType(url) !== null
  }

  /**
   * Extract highlight sources from event tags
   */
  private extractHighlightSources(
    event: Event
  ): Array<{ type: 'event' | 'addressable' | 'url'; value: string; bech32: string }> {
    const sources: Array<{ type: 'event' | 'addressable' | 'url'; value: string; bech32: string }> = []

    // Check for a 'source' marker first (highest priority)
    let sourceTag: string[] | undefined
    for (const tag of event.tags) {
      if (tag[2] === 'source' || tag[3] === 'source') {
        sourceTag = tag
        break
      }
    }

    // If no 'source' marker is found, pick a tag in priority order: e > a > r
    if (!sourceTag) {
      for (const tag of event.tags) {
        // 'e' tags have the highest priority
        if (tag[0] === 'e') {
          sourceTag = tag
          continue
        }
        // 'a' tags have second priority (but don't override 'e' tags)
        if (tag[0] === 'a' && (!sourceTag || sourceTag[0] !== 'e')) {
          sourceTag = tag
          continue
        }
        // 'r' tags have the lowest priority
        if (tag[0] === 'r' && (!sourceTag || sourceTag[0] === 'r')) {
          sourceTag = tag
          continue
        }
      }
    }

    // Process the selected source tag
    if (sourceTag) {
      if (sourceTag[0] === 'e') {
        sources.push({
          type: 'event',
          value: sourceTag[1],
          bech32: nip19.noteEncode(sourceTag[1])
        })
      } else if (sourceTag[0] === 'a') {
        const [kind, pubkey, identifier] = sourceTag[1].split(':')
        const relay = sourceTag[2]
        sources.push({
          type: 'addressable',
          value: sourceTag[1],
          bech32: nip19.naddrEncode({
            kind: parseInt(kind),
            pubkey,
            identifier: identifier || '',
            relays: relay ? [relay] : []
          })
        })
      } else if (sourceTag[0] === 'r') {
        sources.push({
          type: 'url',
          value: sourceTag[1],
          bech32: sourceTag[1]
        })
      }
    }

    return sources
  }
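
  // For example: an event tagged [['r', 'https://example.com'], ['e', '<hex event id>']]
  // yields a single 'event' source, because 'e' outranks 'a', which outranks 'r',
  // and an explicit 'source' marker outranks them all.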

  /**
   * Get the Nostr identifier type
   */
  private getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
    if (id.startsWith('npub')) return 'npub'
    if (id.startsWith('nprofile')) return 'nprofile'
    if (id.startsWith('nevent')) return 'nevent'
    if (id.startsWith('naddr')) return 'naddr'
    if (id.startsWith('note')) return 'note'
    return null
  }

  /**
   * Check if content has LaTeX math
   */
  private hasMathContent(content: string): boolean {
    // Check for inline math: $...$ or \(...\)
    const inlineMath = /\$[^$]+\$|\\\([^)]+\\\)/.test(content)
    // Check for block math: $$...$$ or \[...\]
    const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content)
    return inlineMath || blockMath
  }

  /**
   * Parse content for a specific Nostr event field
   */
  async parseEventField(
    event: Event,
    field: 'content' | 'title' | 'summary' | 'description',
    options: Omit<ParseOptions, 'eventKind' | 'field'> = {}
  ): Promise<ParsedContent> {
    const content = this.getFieldContent(event, field)
    if (!content) {
      return {
        html: '',
        markupType: 'plain-text',
        cssClasses: getMarkupClasses('plain-text'),
        hasMath: false,
        media: [],
        links: [],
        hashtags: [],
        nostrLinks: [],
        highlightSources: []
      }
    }

    return this.parseContent(content, { ...options, eventKind: event.kind, field }, event)
  }

  /**
   * Get content from a specific event field
   */
  private getFieldContent(event: Event, field: 'content' | 'title' | 'summary' | 'description'): string {
    switch (field) {
      case 'content':
        return event.content
      case 'title':
        return event.tags.find((tag) => tag[0] === 'title')?.[1] || ''
      case 'summary':
        return event.tags.find((tag) => tag[0] === 'summary')?.[1] || ''
      case 'description':
        return event.tags.find((tag) => tag[0] === 'd')?.[1] || ''
      default:
        return ''
    }
  }

  /**
   * Add proper CSS classes for styling.
   * The utility classes below are presentation details; the structural mapping is
   * role span -> semantic element.
   */
  private addStylingClasses(html: string): string {
    let styled = html

    // Map AsciiDoc role spans to semantic HTML elements
    styled = styled.replace(/<span class="line-through">([^<]+)<\/span>/g, '<s>$1</s>') // Strikethrough
    styled = styled.replace(/<span class="subscript">([^<]+)<\/span>/g, '<sub>$1</sub>') // Subscript
    styled = styled.replace(/<span class="superscript">([^<]+)<\/span>/g, '<sup>$1</sup>') // Superscript

    // Add code highlighting classes
    styled = styled.replace(/<pre class="highlight">/g, '<pre class="highlight overflow-x-auto rounded bg-gray-100 p-4">')
    styled = styled.replace(/<code>/g, '<code class="font-mono text-sm">')
    
    return styled
  }

  /**
   * Hide raw AsciiDoc ToC text that might appear in the content
   */
  private hideRawTocText(html: string): string {
    // Hide any raw ToC text that might be generated by AsciiDoc
    // This includes patterns like "# Table of Contents (5)" and plain text lists
    let cleaned = html

    // Hide raw ToC headings and content
    cleaned = cleaned.replace(
      /<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi,
      ''
    )

    // Hide raw ToC lists that might appear as plain text
    cleaned = cleaned.replace(
      /<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi,
      ''
    )

    // Hide any remaining raw ToC text patterns
    cleaned = cleaned.replace(
      /<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi,
      ''
    )

    return cleaned
  }
}

// Export singleton instance
export const contentParserService = new ContentParserService()
export default contentParserService
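
// Illustrative usage (a sketch: `event`, the import path, and the render target
// are assumptions, not part of this module):
//
//   import contentParserService from '@/services/content-parser'
//
//   const parsed = await contentParserService.parseEventField(event, 'content')
//   articleElement.innerHTML = parsed.html
//   if (parsed.hasMath) {
//     // hand off to the app's KaTeX/MathJax pass, if any
//   }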