commit d689b1b050c4d910892918290de7bf06c7ced4e1 Author: Silberengel Date: Tue Mar 3 14:53:19 2026 +0100 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9071e5b --- /dev/null +++ b/.gitignore @@ -0,0 +1,33 @@ +# Binaries +*.exe +*.exe~ +*.dll +*.so +*.dylib +gc-parser + +# Test binary +*.test + +# Output +*.out + +# Go workspace file +go.work + +# Node.js +node_modules/ +package-lock.json +dist/ +*.log + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db diff --git a/README.md b/README.md new file mode 100644 index 0000000..d3388e9 --- /dev/null +++ b/README.md @@ -0,0 +1,212 @@ +# GC Parser + +A super-parser for Nostr event content that handles multiple content formats including AsciiDoc, Markdown, code syntax highlighting, LaTeX, musical notation, and `nostr:` prefixed addresses. + +Built with TypeScript/JavaScript using: +- **asciidoctor.js** for AsciiDoc processing +- **marked** for Markdown processing +- **highlight.js** for code syntax highlighting + +## Features + +- **AsciiDoc Processing**: Full AsciiDoc to HTML conversion with table of contents support +- **Markdown Processing**: Markdown to HTML conversion with GFM support +- **Code Syntax Highlighting**: Automatic syntax highlighting for code blocks using highlight.js +- **LaTeX Math**: Support for inline and block LaTeX math expressions (compatible with MathJax/KaTeX) +- **Musical Notation**: Support for ABC notation, LilyPond, chord notation, and MusicXML +- **Nostr Addresses**: Automatic processing of `nostr:` prefixed addresses (naddr, nevent, note, npub, nprofile) +- **Link Rewriting**: Automatic rewriting of wikilinks and nostr addresses to proper URLs +- **HTML Sanitization**: Built-in XSS protection + +## Installation + +```bash +npm install gc-parser +``` + +## Usage + +### Basic Example + +```typescript +import { Parser, defaultOptions } from 'gc-parser'; + +// Create parser with default options +const opts = defaultOptions(); +opts.linkBaseURL = 'https://example.com'; + +const parser = new Parser(opts); + +// Process content +const content = `# Hello World + +This is **markdown** content with a nostr:npub1... address.`; + +const result = await parser.process(content); +console.log(result.content); +console.log('Has LaTeX:', result.hasLaTeX); +console.log('Has Musical Notation:', result.hasMusicalNotation); +``` + +### Advanced Configuration + +```typescript +import { Parser } from 'gc-parser'; + +const parser = new Parser({ + linkBaseURL: 'https://example.com', + enableAsciiDoc: true, + enableMarkdown: true, + enableCodeHighlighting: true, + enableLaTeX: true, + enableMusicalNotation: true, + enableNostrAddresses: true, +}); + +const result = await parser.process(content); +``` + +### Processing AsciiDoc + +```typescript +const content = `= Document Title + +== Section + +This is AsciiDoc content with a [[wikilink]] and nostr:naddr1...`; + +const result = await parser.process(content); +// result.content contains the HTML +// result.tableOfContents contains the extracted TOC +``` + +### Processing Markdown + +```typescript +const content = `# Markdown Document + +This is **bold** and *italic* text. + +\`\`\`go +func main() { + fmt.Println("Hello") +} +\`\`\` +`; + +const result = await parser.process(content); +``` + +### LaTeX Math + +The parser automatically detects and processes LaTeX math expressions: + +- Inline math: `$E = mc^2$` or `\(E = mc^2\)` +- Block math: `$$\int_{-\infty}^{\infty} e^{-x^2} dx = \sqrt{\pi}$$` or `\[...\]` + +The output is compatible with MathJax or KaTeX. Include one of these libraries in your HTML: + +```html + + + + + + + +``` + +### Musical Notation + +The parser supports multiple musical notation formats: + +- **ABC Notation**: Automatically detected and wrapped for ABC.js +- **LilyPond**: Detected and wrapped for LilyPond rendering +- **Chord Notation**: Inline chords like `[C]`, `[Am]`, `[F#m7]` +- **MusicXML**: XML-based notation + +Example: +``` +X:1 +K:C +C D E F | G A B c +``` + +### Nostr Addresses + +The parser automatically processes `nostr:` prefixed addresses: + +- `nostr:naddr1...` - Parameterized replaceable events +- `nostr:nevent1...` - Event references +- `nostr:note1...` - Note IDs +- `nostr:npub1...` - Public keys +- `nostr:nprofile1...` - Profile references + +These are automatically converted to links if `linkBaseURL` is set. + +## Integration with gitcitadel-online + +This parser is designed to replace the content processing logic in `gitcitadel-online`. + +### Migration Example + +**Before (in gitcitadel-online):** +```go +// Old way - calling Node.js via exec +result, err := g.asciidocProc.Process(wiki.Content) +html := result.Content +``` + +**After (using gc-parser):** +```go +// New way - import the JavaScript/TypeScript module +// You can call it via Node.js exec or use a Go bridge +const { Parser } = require('gc-parser'); +const parser = new Parser({ linkBaseURL: 'https://example.com' }); +const result = await parser.process(content); +``` + +Or use it directly in a Node.js script that gitcitadel-online can call: + +```javascript +// process-content.js +const { Parser } = require('gc-parser'); + +const parser = new Parser({ + linkBaseURL: process.env.LINK_BASE_URL || '', +}); + +const content = process.argv[2] || ''; +parser.process(content).then(result => { + console.log(JSON.stringify(result)); +}).catch(err => { + console.error(err); + process.exit(1); +}); +``` + +## Requirements + +- Node.js 18+ +- TypeScript 5.3+ (for development) + +## Development + +```bash +# Install dependencies +npm install + +# Build TypeScript +npm run build + +# Run tests +npm test +``` + +## License + +MIT + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. diff --git a/example.js b/example.js new file mode 100644 index 0000000..e4408a2 --- /dev/null +++ b/example.js @@ -0,0 +1,55 @@ +#!/usr/bin/env node + +/** + * Example usage of gc-parser + * This can be called from Go or used directly in Node.js + */ + +const { Parser, defaultOptions } = require('./dist/index.js'); + +async function main() { + // Create parser with default options + const opts = defaultOptions(); + opts.linkBaseURL = process.env.LINK_BASE_URL || 'https://example.com'; + + const parser = new Parser(opts); + + // Get content from command line argument or stdin + let content = ''; + if (process.argv[2]) { + content = process.argv[2]; + } else { + // Read from stdin + const readline = require('readline'); + const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + terminal: false + }); + + for await (const line of rl) { + content += line + '\n'; + } + } + + if (!content) { + console.error('No content provided'); + process.exit(1); + } + + try { + const result = await parser.process(content); + + // Output as JSON for easy parsing + console.log(JSON.stringify(result, null, 2)); + } catch (error) { + console.error('Error processing content:', error); + process.exit(1); + } +} + +if (require.main === module) { + main(); +} + +module.exports = { main }; diff --git a/package.json b/package.json new file mode 100644 index 0000000..70c1703 --- /dev/null +++ b/package.json @@ -0,0 +1,35 @@ +{ + "name": "gc-parser", + "version": "1.0.0", + "description": "Super-parser for Nostr event content supporting AsciiDoc, Markdown, code syntax highlighting, LaTeX, musical notation, and nostr: addresses", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "scripts": { + "build": "tsc", + "test": "jest", + "prepublishOnly": "npm run build" + }, + "keywords": [ + "nostr", + "parser", + "asciidoc", + "markdown", + "syntax-highlighting", + "latex", + "music" + ], + "author": "", + "license": "MIT", + "dependencies": { + "@asciidoctor/core": "^3.0.4", + "highlight.js": "^11.10.0", + "marked": "^12.0.0" + }, + "devDependencies": { + "@types/node": "^20.11.0", + "typescript": "^5.3.3", + "jest": "^29.7.0", + "@types/jest": "^29.5.11", + "@types/highlight.js": "^10.1.0" + } +} diff --git a/src/detector.ts b/src/detector.ts new file mode 100644 index 0000000..631ff53 --- /dev/null +++ b/src/detector.ts @@ -0,0 +1,55 @@ +import { ContentFormat } from './types'; + +/** + * Detects the content format based on content patterns + */ +export function detectFormat(content: string): ContentFormat { + // Check for AsciiDoc indicators + const asciidocIndicators = [ + '= ', // Title + '== ', // Section + '=== ', // Subsection + 'include::', // Include directive + 'image::', // Image block + '[source', // Source block + '----', // Listing block + '....', // Literal block + '|===', // Table + ':', // Attribute (common in AsciiDoc) + ]; + + let asciidocScore = 0; + for (const indicator of asciidocIndicators) { + if (content.includes(indicator)) { + asciidocScore++; + } + } + + // Check for Markdown indicators + const markdownIndicators = [ + '# ', // Heading + '## ', // Subheading + '```', // Code block + '**', // Bold + '*', // Italic or list + '- ', // List item + '![', // Image + '[', // Link + ]; + + let markdownScore = 0; + for (const indicator of markdownIndicators) { + if (content.includes(indicator)) { + markdownScore++; + } + } + + // Determine format based on scores + if (asciidocScore > markdownScore && asciidocScore >= 2) { + return ContentFormat.AsciiDoc; + } else if (markdownScore > 0) { + return ContentFormat.Markdown; + } + + return ContentFormat.Plain; +} diff --git a/src/index.ts b/src/index.ts new file mode 100644 index 0000000..cddc130 --- /dev/null +++ b/src/index.ts @@ -0,0 +1,2 @@ +export * from './parser'; +export * from './types'; diff --git a/src/parser.ts b/src/parser.ts new file mode 100644 index 0000000..13e54a6 --- /dev/null +++ b/src/parser.ts @@ -0,0 +1,116 @@ +import { ParserOptions, ProcessResult, ContentFormat } from './types'; +import { processAsciiDoc } from './processors/asciidoc'; +import { processMarkdown } from './processors/markdown'; +import { processPlainText } from './processors/plain'; +import { processNostrAddresses } from './processors/nostr'; +import { detectFormat } from './detector'; +import { processLaTeX, hasLaTeX } from './processors/latex'; +import { processMusicalNotation, hasMusicalNotation } from './processors/music'; +import { ensureCodeHighlighting } from './processors/code'; + +/** + * Default parser options + */ +export function defaultOptions(): ParserOptions { + return { + linkBaseURL: '', + enableAsciiDoc: true, + enableMarkdown: true, + enableCodeHighlighting: true, + enableLaTeX: true, + enableMusicalNotation: true, + enableNostrAddresses: true, + }; +} + +/** + * Main parser for Nostr event content + * Handles multiple content formats: AsciiDoc, Markdown, code syntax, + * LaTeX, musical notation, and nostr: prefixed addresses + */ +export class Parser { + private options: Required; + + constructor(options: ParserOptions = {}) { + const defaults = defaultOptions(); + this.options = { + linkBaseURL: options.linkBaseURL ?? defaults.linkBaseURL ?? '', + enableAsciiDoc: options.enableAsciiDoc ?? defaults.enableAsciiDoc ?? true, + enableMarkdown: options.enableMarkdown ?? defaults.enableMarkdown ?? true, + enableCodeHighlighting: options.enableCodeHighlighting ?? defaults.enableCodeHighlighting ?? true, + enableLaTeX: options.enableLaTeX ?? defaults.enableLaTeX ?? true, + enableMusicalNotation: options.enableMusicalNotation ?? defaults.enableMusicalNotation ?? true, + enableNostrAddresses: options.enableNostrAddresses ?? defaults.enableNostrAddresses ?? true, + }; + } + + /** + * Process Nostr event content and return HTML + * Automatically detects the content format and processes accordingly + */ + async process(content: string): Promise { + // First, process nostr: addresses (if enabled) + if (this.options.enableNostrAddresses) { + content = processNostrAddresses(content, this.options.linkBaseURL); + } + + // Detect content format + const format = detectFormat(content); + + let result: ProcessResult; + + switch (format) { + case ContentFormat.AsciiDoc: + if (this.options.enableAsciiDoc) { + result = await processAsciiDoc(content, this.options.linkBaseURL); + } else if (this.options.enableMarkdown) { + // Fallback to markdown if AsciiDoc is disabled + result = await processMarkdown(content, this.options.linkBaseURL); + } else { + result = processPlainText(content); + } + break; + case ContentFormat.Markdown: + if (this.options.enableMarkdown) { + result = await processMarkdown(content, this.options.linkBaseURL); + } else { + // Fallback to plain text + result = processPlainText(content); + } + break; + default: + // Plain text or mixed content + result = processPlainText(content); + } + + // Post-process: handle LaTeX and musical notation in the HTML + if (this.options.enableLaTeX) { + result.hasLaTeX = hasLaTeX(result.content); + if (result.hasLaTeX) { + result.content = processLaTeX(result.content); + } + } + + if (this.options.enableMusicalNotation) { + result.hasMusicalNotation = hasMusicalNotation(result.content); + if (result.hasMusicalNotation) { + result.content = processMusicalNotation(result.content); + } + } + + // Ensure code highlighting is applied if enabled + if (this.options.enableCodeHighlighting) { + result.content = ensureCodeHighlighting(result.content); + } + + return result; + } +} + +/** + * Convenience function to process content with default options + */ +export async function process(content: string, options?: ParserOptions): Promise { + const parser = new Parser(options); + return parser.process(content); +} diff --git a/src/processors/asciidoc-links.ts b/src/processors/asciidoc-links.ts new file mode 100644 index 0000000..a9aa9ea --- /dev/null +++ b/src/processors/asciidoc-links.ts @@ -0,0 +1,66 @@ +/** + * Normalizes a d tag according to NIP-54 rules + */ +export function normalizeDTag(dTag: string): string { + // Convert to lowercase + let normalized = dTag.toLowerCase(); + + // Convert whitespace to hyphens + normalized = normalized.replace(/\s+/g, '-'); + + // Remove punctuation and symbols (keep alphanumeric, hyphens, and non-ASCII) + normalized = normalized.replace(/[^a-z0-9\-\u0080-\uFFFF]/g, ''); + + // Collapse multiple consecutive hyphens + normalized = normalized.replace(/-+/g, '-'); + + // Remove leading and trailing hyphens + normalized = normalized.replace(/^-+|-+$/g, ''); + + return normalized; +} + +/** + * Rewrites wikilinks and nostr: links in AsciiDoc content + */ +export function rewriteAsciiDocLinks(content: string, linkBaseURL: string): string { + // Rewrite wikilinks: [[target]] or [[target|display text]] + // Format: [[target]] -> link:url[display] + const wikilinkRegex = /\[\[([^\]]+)\]\]/g; + content = content.replace(wikilinkRegex, (match, inner) => { + let target: string; + let display: string; + + if (inner.includes('|')) { + const parts = inner.split('|', 2); + target = parts[0].trim(); + display = parts[1].trim(); + } else { + target = inner.trim(); + display = target; + } + + // Normalize the d tag + const normalized = normalizeDTag(target); + + // Create the link + if (linkBaseURL) { + const url = `${linkBaseURL}/events?d=${normalized}`; + return `link:${url}[${display}]`; + } + return `link:#${normalized}[${display}]`; + }); + + // Rewrite nostr: links: nostr:naddr1... or nostr:nevent1... + // Format: nostr:naddr1... -> link:url[nostr:naddr1...] + const nostrLinkRegex = /nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+)/g; + content = content.replace(nostrLinkRegex, (match, nostrID) => { + if (linkBaseURL) { + const url = `${linkBaseURL}/events?id=${nostrID}`; + return `link:${url}[${match}]`; + } + return match; + }); + + return content; +} diff --git a/src/processors/asciidoc.ts b/src/processors/asciidoc.ts new file mode 100644 index 0000000..7a545eb --- /dev/null +++ b/src/processors/asciidoc.ts @@ -0,0 +1,49 @@ +import asciidoctor from '@asciidoctor/core'; +import { ProcessResult } from '../types'; +import { rewriteAsciiDocLinks } from './asciidoc-links'; +import { extractTOC, sanitizeHTML, processLinks } from './html-utils'; + +const asciidoctorInstance = asciidoctor(); + +/** + * Processes AsciiDoc content to HTML + */ +export async function processAsciiDoc(content: string, linkBaseURL: string): Promise { + // Rewrite links in AsciiDoc content + const processedContent = rewriteAsciiDocLinks(content, linkBaseURL); + + // Convert AsciiDoc to HTML + const html = asciidoctorInstance.convert(processedContent, { + safe: 'safe', + backend: 'html5', + doctype: 'article', + attributes: { + showtitle: true, + icons: 'font', + sectanchors: true, + sectlinks: true, + toc: 'left', + toclevels: 3, + }, + }) as string; + + // Extract table of contents from HTML + const { toc, contentWithoutTOC } = extractTOC(html); + + // Sanitize HTML to prevent XSS + const sanitized = sanitizeHTML(contentWithoutTOC); + + // Process links: make external links open in new tab, local links in same tab + const processed = processLinks(sanitized, linkBaseURL); + + // Also sanitize and process links in TOC + const tocSanitized = sanitizeHTML(toc); + const tocProcessed = processLinks(tocSanitized, linkBaseURL); + + return { + content: processed, + tableOfContents: tocProcessed, + hasLaTeX: false, + hasMusicalNotation: false, + }; +} diff --git a/src/processors/code.ts b/src/processors/code.ts new file mode 100644 index 0000000..7d76303 --- /dev/null +++ b/src/processors/code.ts @@ -0,0 +1,52 @@ +import hljs from 'highlight.js'; + +/** + * Ensures code blocks have syntax highlighting using highlight.js + */ +export function ensureCodeHighlighting(html: string): string { + // Pattern to match code blocks:
...
or
...
+ const codeBlockRegex = /
]*>(.*?)<\/code><\/pre>/gs;
+
+  return html.replace(codeBlockRegex, (match, lang, code) => {
+    // Unescape HTML entities in code
+    const unescapedCode = unescapeHTML(code);
+
+    // Highlight the code
+    try {
+      let highlighted: hljs.HighlightResult;
+
+      if (lang) {
+        // Try to get the language
+        const language = hljs.getLanguage(lang);
+        if (language) {
+          highlighted = hljs.highlight(unescapedCode, { language: lang });
+        } else {
+          // Try auto-detection
+          highlighted = hljs.highlightAuto(unescapedCode);
+        }
+      } else {
+        // Auto-detect language
+        highlighted = hljs.highlightAuto(unescapedCode);
+      }
+
+      // Return highlighted code with proper classes
+      const langClass = highlighted.language ? ` class="language-${highlighted.language}"` : '';
+      return `
${highlighted.value}
`; + } catch (error) { + // If highlighting fails, return original + return match; + } + }); +} + +/** + * Unescapes HTML entities + */ +function unescapeHTML(text: string): string { + return text + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(/'/g, "'"); +} diff --git a/src/processors/html-utils.ts b/src/processors/html-utils.ts new file mode 100644 index 0000000..2e56cba --- /dev/null +++ b/src/processors/html-utils.ts @@ -0,0 +1,170 @@ +/** + * Extracts the table of contents from AsciiDoc HTML output + * Returns the TOC HTML and the content HTML without the TOC + */ +export function extractTOC(html: string): { toc: string; contentWithoutTOC: string } { + // AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc" + let tocContent = ''; + let contentWithoutTOC = html; + + // Find the start of the TOC div - try multiple patterns + const tocStartPatterns = [ + /]*>/i, + /]*>/i, + /]*>/i, + /]*>/i, + ]; + + let tocStartIdx = -1; + let tocStartTag = ''; + + for (const pattern of tocStartPatterns) { + const match = html.match(pattern); + if (match && match.index !== undefined) { + tocStartIdx = match.index; + tocStartTag = match[0]; + break; + } + } + + if (tocStartIdx === -1) { + // No TOC found + return { toc: '', contentWithoutTOC: html }; + } + + // Find the matching closing tag by counting div tags + const searchStart = tocStartIdx + tocStartTag.length; + let depth = 1; + let i = searchStart; + + while (i < html.length && depth > 0) { + // Look for opening or closing div/nav tags + if (i + 4 < html.length && html.substring(i, i + 4) === '', i); + if (closeIdx === -1) break; + i = closeIdx + 1; + } else { + // Opening tag - find the end + const closeIdx = html.indexOf('>', i); + if (closeIdx === -1) break; + // Check if it's self-closing + if (html[closeIdx - 1] !== '/') { + depth++; + } + i = closeIdx + 1; + } + } else if (i + 5 < html.length && html.substring(i, i + 5) === '', i); + if (closeIdx === -1) break; + i = closeIdx + 1; + } else if (i + 5 < html.length && html.substring(i, i + 5) === '', i); + if (closeIdx === -1) break; + i = closeIdx + 1; + } else { + i++; + } + } + + if (depth === 0) { + // Found the matching closing tag + const tocEndIdx = i; + // Extract the TOC content (inner HTML) + const tocFullHTML = html.substring(tocStartIdx, tocEndIdx); + // Extract just the inner content (without the outer div tags) + let innerStart = tocStartTag.length; + let innerEnd = tocFullHTML.length; + // Find the last or + if (tocFullHTML.endsWith('')) { + innerEnd -= 6; + } else if (tocFullHTML.endsWith('')) { + innerEnd -= 7; + } + tocContent = tocFullHTML.substring(innerStart, innerEnd).trim(); + + // Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title) + tocContent = tocContent.replace(/]*>.*?<\/div>\s*/gis, ''); + tocContent = tocContent.trim(); + + // Remove the TOC from the content + contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx); + } + + return { toc: tocContent, contentWithoutTOC }; +} + +/** + * Performs basic HTML sanitization to prevent XSS + */ +export function sanitizeHTML(html: string): string { + // Remove script tags and their content + html = html.replace(/]*>.*?<\/script>/gis, ''); + + // Remove event handlers (onclick, onerror, etc.) + html = html.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, ''); + + // Remove javascript: protocol in links + html = html.replace(/javascript:/gi, ''); + + // Remove data: URLs that could be dangerous + html = html.replace(/data:\s*text\/html/gi, ''); + + return html; +} + +/** + * Processes HTML links to add target="_blank" to external links + */ +export function processLinks(html: string, linkBaseURL: string): string { + // Extract domain from linkBaseURL for comparison + let linkBaseDomain = ''; + if (linkBaseURL) { + const url = linkBaseURL.replace(/^https?:\/\//, ''); + const parts = url.split('/'); + if (parts.length > 0) { + linkBaseDomain = parts[0]; + } + } + + // Regex to match tags with href attributes + const linkRegex = /]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g; + + return html.replace(linkRegex, (match, before, href, after) => { + // Check if it's an external link (starts with http:// or https://) + const isExternal = href.startsWith('http://') || href.startsWith('https://'); + + if (isExternal) { + // Check if it's pointing to our own domain + if (linkBaseDomain && href.includes(linkBaseDomain)) { + // Same domain - open in same tab (remove any existing target attribute) + return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); + } + + // External link - add target="_blank" and rel="noopener noreferrer" if not already present + if (!match.includes('target=')) { + if (!match.includes('rel=')) { + return match.replace('>', ' target="_blank" rel="noopener noreferrer">'); + } else { + // Update existing rel attribute to include noopener if not present + const updatedMatch = match.replace(/rel\s*=\s*["']([^"']*)["']/gi, (relMatch, relValue) => { + if (!relValue.includes('noopener')) { + return `rel="${relValue} noopener noreferrer"`; + } + return relMatch; + }); + return updatedMatch.replace('>', ' target="_blank">'); + } + } + } else { + // Local/relative link - ensure it opens in same tab (remove target if present) + return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); + } + + return match; + }); +} diff --git a/src/processors/latex.ts b/src/processors/latex.ts new file mode 100644 index 0000000..5b52f3e --- /dev/null +++ b/src/processors/latex.ts @@ -0,0 +1,37 @@ +/** + * Checks if content contains LaTeX math expressions + */ +export function hasLaTeX(content: string): boolean { + // Check for inline math: $...$ or \(...\) + const inlineMathPattern = /\$[^$]+\$|\\\([^)]+\\\)/; + // Check for block math: $$...$$ or \[...\] + const blockMathPattern = /\$\$[^$]+\$\$|\\\[[^\]]+\\\]/; + + return inlineMathPattern.test(content) || blockMathPattern.test(content); +} + +/** + * Processes LaTeX math expressions in HTML content + * Wraps LaTeX expressions in appropriate HTML for rendering with MathJax or KaTeX + */ +export function processLaTeX(html: string): string { + // Process block math: $$...$$ or \[...\] + // Convert to
...
for MathJax/KaTeX + const blockMathPattern = /\$\$([^$]+)\$\$|\\\[([^\]]+)\\\]/gs; + html = html.replace(blockMathPattern, (match, dollarContent, bracketContent) => { + const mathContent = (dollarContent || bracketContent || '').trim(); + // Wrap in appropriate tags for MathJax/KaTeX + return `
\\[${mathContent}\\]
`; + }); + + // Process inline math: $...$ or \(...\) + // Convert to ... for MathJax/KaTeX + const inlineMathPattern = /\$([^$\n]+)\$|\\\(([^)]+)\\\)/g; + html = html.replace(inlineMathPattern, (match, dollarContent, bracketContent) => { + const mathContent = (dollarContent || bracketContent || '').trim(); + // Wrap in appropriate tags for MathJax/KaTeX + return `\\(${mathContent}\\)`; + }); + + return html; +} diff --git a/src/processors/markdown-links.ts b/src/processors/markdown-links.ts new file mode 100644 index 0000000..27e155d --- /dev/null +++ b/src/processors/markdown-links.ts @@ -0,0 +1,49 @@ +import { normalizeDTag } from './asciidoc-links'; + +/** + * Rewrites wikilinks and nostr: links in Markdown content + */ +export function rewriteMarkdownLinks(content: string, linkBaseURL: string): string { + // Rewrite wikilinks: [[target]] or [[target|display text]] + const wikilinkRegex = /\[\[([^\]]+)\]\]/g; + content = content.replace(wikilinkRegex, (match, inner) => { + let target: string; + let display: string; + + if (inner.includes('|')) { + const parts = inner.split('|', 2); + target = parts[0].trim(); + display = parts[1].trim(); + } else { + target = inner.trim(); + display = target; + } + + const normalized = normalizeDTag(target); + + if (linkBaseURL) { + const url = `${linkBaseURL}/events?d=${normalized}`; + return `[${display}](${url})`; + } + return `[${display}](#${normalized})`; + }); + + // Rewrite nostr: links in Markdown + const nostrLinkRegex = /nostr:(naddr1[^\s\]]+|nevent1[^\s\]]+|note1[^\s\]]+|npub1[^\s\]]+|nprofile1[^\s\]]+)/g; + content = content.replace(nostrLinkRegex, (match, nostrID) => { + if (linkBaseURL) { + let url: string; + if (nostrID.startsWith('npub')) { + url = `${linkBaseURL}/profile?pubkey=${nostrID}`; + } else if (nostrID.startsWith('nprofile')) { + url = `${linkBaseURL}/profile?id=${nostrID}`; + } else { + url = `${linkBaseURL}/events?id=${nostrID}`; + } + return `[${match}](${url})`; + } + return match; + }); + + return content; +} diff --git a/src/processors/markdown.ts b/src/processors/markdown.ts new file mode 100644 index 0000000..28e8b75 --- /dev/null +++ b/src/processors/markdown.ts @@ -0,0 +1,36 @@ +import { marked } from 'marked'; +import { ProcessResult } from '../types'; +import { rewriteMarkdownLinks } from './markdown-links'; +import { sanitizeHTML, processLinks } from './html-utils'; + +// Configure marked options +marked.setOptions({ + breaks: true, + gfm: true, + headerIds: true, + mangle: false, +}); + +/** + * Processes Markdown content to HTML + */ +export async function processMarkdown(content: string, linkBaseURL: string): Promise { + // Rewrite links in Markdown content + const processedContent = rewriteMarkdownLinks(content, linkBaseURL); + + // Convert Markdown to HTML + const html = await marked.parse(processedContent) as string; + + // Sanitize HTML to prevent XSS + const sanitized = sanitizeHTML(html); + + // Process links: make external links open in new tab, local links in same tab + const processed = processLinks(sanitized, linkBaseURL); + + return { + content: processed, + tableOfContents: '', + hasLaTeX: false, + hasMusicalNotation: false, + }; +} diff --git a/src/processors/music.ts b/src/processors/music.ts new file mode 100644 index 0000000..5064894 --- /dev/null +++ b/src/processors/music.ts @@ -0,0 +1,72 @@ +/** + * Checks if content contains musical notation + */ +export function hasMusicalNotation(content: string): boolean { + // Check for ABC notation: X:1, K:C, etc. + const abcPattern = /X:\s*\d+|K:\s*[A-G]|M:\s*\d+\/\d+/i; + // Check for LilyPond notation: \relative, \clef, etc. + const lilypondPattern = /\\relative|\\clef|\\key|\\time/; + // Check for MusicXML-like tags: , , etc. + const musicxmlPattern = /||/i; + // Check for simple chord notation: [C], [Am], etc. + const chordPattern = /\[[A-G][#b]?m?[0-9]?\]/; + + return abcPattern.test(content) || + lilypondPattern.test(content) || + musicxmlPattern.test(content) || + chordPattern.test(content); +} + +/** + * Processes musical notation in HTML content + * Wraps musical notation in appropriate HTML for rendering + */ +export function processMusicalNotation(html: string): string { + // Process ABC notation blocks + // ABC notation typically starts with X:1 and contains multiple lines + const abcBlockPattern = /(X:\s*\d+[^\n]*\n(?:[^\n]+\n)*)/gs; + html = html.replace(abcBlockPattern, (match) => { + const abcContent = match.trim(); + // Wrap in a div for ABC.js or similar renderer + return `
${abcContent}
`; + }); + + // Process LilyPond notation blocks + // LilyPond notation is typically in code blocks or between \relative and } + const lilypondPattern = /(\\relative[^}]+})/gs; + html = html.replace(lilypondPattern, (match) => { + const lilypondContent = match.trim(); + // Wrap in a div for LilyPond rendering + return `
${lilypondContent}
`; + }); + + // Process inline chord notation: [C], [Am], [F#m7], etc. + const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g; + html = html.replace(chordPattern, (match, chord) => { + // Wrap in a span for chord rendering + return `[${chord}]`; + }); + + // Process MusicXML-like notation (if present in content) + const musicxmlPattern = /(]*>.*?<\/music>)/gs; + html = html.replace(musicxmlPattern, (match) => { + const musicxmlContent = match.trim(); + // Wrap in a div for MusicXML rendering + return `
${musicxmlContent}
`; + }); + + return html; +} + +/** + * Escapes a string for use in HTML attributes + */ +function escapeForAttr(text: string): string { + return text + .replace(/"/g, '"') + .replace(/'/g, ''') + .replace(//g, '>') + .replace(/\n/g, ' ') + .replace(/\r/g, ''); +} diff --git a/src/processors/nostr.ts b/src/processors/nostr.ts new file mode 100644 index 0000000..5ea176c --- /dev/null +++ b/src/processors/nostr.ts @@ -0,0 +1,28 @@ +/** + * Processes nostr: prefixed addresses + */ +export function processNostrAddresses(content: string, linkBaseURL: string): string { + // Pattern: nostr:naddr1..., nostr:nevent1..., nostr:note1..., nostr:npub1..., nostr:nprofile1... + const nostrPattern = /nostr:([a-z0-9]+[a-z0-9]{1,})/g; + + return content.replace(nostrPattern, (match, nostrID) => { + // If linkBaseURL is set, convert to a link + if (linkBaseURL) { + // Determine the type and create appropriate link + if (nostrID.startsWith('naddr')) { + return `
${match}`; + } else if (nostrID.startsWith('nevent')) { + return `${match}`; + } else if (nostrID.startsWith('note')) { + return `${match}`; + } else if (nostrID.startsWith('npub')) { + return `${match}`; + } else if (nostrID.startsWith('nprofile')) { + return `${match}`; + } + } + + // Return as a span with class for styling + return `${match}`; + }); +} diff --git a/src/processors/plain.ts b/src/processors/plain.ts new file mode 100644 index 0000000..7e466e5 --- /dev/null +++ b/src/processors/plain.ts @@ -0,0 +1,42 @@ +import { ProcessResult } from '../types'; + +/** + * Escapes HTML special characters + */ +function escapeHTML(text: string): string { + return text + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); +} + +/** + * Processes plain text content with basic formatting + */ +export function processPlainText(text: string): ProcessResult { + // Escape HTML + let html = escapeHTML(text); + + // Convert line breaks to
+ html = html.replace(/\n/g, '
\n'); + + // Convert double line breaks to paragraphs + const paragraphs = html.split('
\n
\n'); + const result: string[] = []; + + for (const para of paragraphs) { + const trimmed = para.trim(); + if (trimmed) { + result.push(`

${trimmed}

`); + } + } + + return { + content: result.join('\n'), + tableOfContents: '', + hasLaTeX: false, + hasMusicalNotation: false, + }; +} diff --git a/src/types.ts b/src/types.ts new file mode 100644 index 0000000..3753f1d --- /dev/null +++ b/src/types.ts @@ -0,0 +1,43 @@ +/** + * Options for configuring the parser behavior + */ +export interface ParserOptions { + /** Base URL for rewriting relative links and nostr: addresses */ + linkBaseURL?: string; + /** Enable AsciiDoc processing (default: true) */ + enableAsciiDoc?: boolean; + /** Enable Markdown processing (default: true) */ + enableMarkdown?: boolean; + /** Enable code syntax highlighting (default: true) */ + enableCodeHighlighting?: boolean; + /** Enable LaTeX math rendering (default: true) */ + enableLaTeX?: boolean; + /** Enable musical notation rendering (default: true) */ + enableMusicalNotation?: boolean; + /** Enable nostr: address processing (default: true) */ + enableNostrAddresses?: boolean; +} + +/** + * Result of processing content + */ +export interface ProcessResult { + /** Main processed HTML content */ + content: string; + /** Extracted table of contents (for AsciiDoc) */ + tableOfContents: string; + /** Indicates if LaTeX content was found */ + hasLaTeX: boolean; + /** Indicates if musical notation was found */ + hasMusicalNotation: boolean; +} + +/** + * Detected content format + */ +export enum ContentFormat { + Unknown = 'unknown', + AsciiDoc = 'asciidoc', + Markdown = 'markdown', + Plain = 'plain' +} diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..2f8cd7c --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,19 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "commonjs", + "lib": ["ES2020"], + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist", "**/*.test.ts"] +}