diff --git a/README.md b/README.md index d3388e9..225669c 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,7 @@ A super-parser for Nostr event content that handles multiple content formats including AsciiDoc, Markdown, code syntax highlighting, LaTeX, musical notation, and `nostr:` prefixed addresses. Built with TypeScript/JavaScript using: -- **asciidoctor.js** for AsciiDoc processing -- **marked** for Markdown processing -- **highlight.js** for code syntax highlighting +- **@asciidoctor/core** for AsciiDoc processing (includes Markdown-to-AsciiDoc conversion and highlight.js integration) ## Features diff --git a/src/converters/to-asciidoc.ts b/src/converters/to-asciidoc.ts index 779d864..ca03ae6 100644 --- a/src/converters/to-asciidoc.ts +++ b/src/converters/to-asciidoc.ts @@ -67,8 +67,7 @@ function convertMarkdownToAsciidoc(content: string): string { asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2'); asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 =='); - // Preserve nostr: addresses temporarily - asciidoc = asciidoc.replace(/nostr:([a-z0-9]+)/g, 'nostr:$1'); + // Note: nostr: addresses are processed later in processNostrAddresses // Convert headers asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======'); @@ -89,8 +88,8 @@ function convertMarkdownToAsciidoc(content: string): string { asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript - // Convert code blocks - asciidoc = asciidoc.replace(/```(\w+)?\n([\s\S]*?)\n```/g, (_match, lang, code) => { + // Convert code blocks (handle both \n and \r\n line endings) + asciidoc = asciidoc.replace(/```(\w+)?\r?\n([\s\S]*?)\r?\n```/g, (_match, lang, code) => { const trimmedCode = code.trim(); if (trimmedCode.length === 0) return ''; @@ -211,11 +210,15 @@ function convertMarkdownToAsciidoc(content: string): string { /** * Converts plain text to AsciiDoc format + * Preserves line breaks by converting single newlines to line continuations */ function convertPlainTextToAsciidoc(content: string): string { + // Preserve double newlines (paragraph breaks) + // Convert single newlines to line continuations ( +\n) return content - .replace(/\n\n/g, '\n\n') - .replace(/\n/g, ' +\n'); + .replace(/\r\n/g, '\n') // Normalize line endings + .replace(/\n\n+/g, '\n\n') // Normalize multiple newlines to double + .replace(/([^\n])\n([^\n])/g, '$1 +\n$2'); // Single newlines become line continuations } /** @@ -254,10 +257,13 @@ function processWikilinks(content: string, linkBaseURL: string): string { /** * Processes nostr: addresses * Converts to link:nostr:...[...] format + * Valid bech32 prefixes: npub, nprofile, nevent, naddr, note */ function processNostrAddresses(content: string, linkBaseURL: string): string { - // Match nostr: followed by valid bech32 string - return content.replace(/nostr:([a-z0-9]+[a-z0-9]{6,})/g, (_match, bech32Id) => { + // Match nostr: followed by valid bech32 prefix and identifier + // Bech32 format: prefix + separator (1) + data (at least 6 chars for valid identifiers) + const nostrPattern = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi; + return content.replace(nostrPattern, (_match, bech32Id) => { return `link:nostr:${bech32Id}[${bech32Id}]`; }); } diff --git a/src/detector.ts b/src/detector.ts index e2eb6db..3374e12 100644 --- a/src/detector.ts +++ b/src/detector.ts @@ -27,21 +27,19 @@ export function detectFormat(content: string): ContentFormat { } } - // Check for Markdown indicators + // Check for Markdown indicators (more specific patterns to avoid false positives) const markdownIndicators = [ - '# ', // Heading - '## ', // Subheading - '```', // Code block - '**', // Bold - '*', // Italic or list - '- ', // List item - '![', // Image - '[', // Link + /^#{1,6}\s+/m, // Heading at start of line + /```[\s\S]*?```/, // Code block + /\*\*[^*]+\*\*/, // Bold text + /^[-*+]\s+/m, // List item at start of line + /!\[[^\]]*\]\([^)]+\)/, // Image syntax + /\[[^\]]+\]\([^)]+\)/, // Link syntax ]; let markdownScore = 0; for (const indicator of markdownIndicators) { - if (content.includes(indicator)) { + if (indicator.test(content)) { markdownScore++; } } diff --git a/src/extractors/metadata.ts b/src/extractors/metadata.ts index 35d0911..89009ff 100644 --- a/src/extractors/metadata.ts +++ b/src/extractors/metadata.ts @@ -28,8 +28,8 @@ function extractNostrLinks(content: string): NostrLink[] { const nostrLinks: NostrLink[] = []; const seen = new Set(); - // Extract nostr: prefixed links - const nostrMatches = content.match(/nostr:([a-z0-9]+[a-z0-9]{6,})/g) || []; + // Extract nostr: prefixed links (valid bech32 format) + const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || []; nostrMatches.forEach(match => { const id = match.substring(6); // Remove 'nostr:' const type = getNostrType(id); @@ -79,20 +79,33 @@ function extractWikilinks(content: string): Wikilink[] { /** * Extract hashtags from content + * Excludes hashtags in URLs, code blocks, and inline code */ function extractHashtags(content: string): string[] { const hashtags: string[] = []; const seen = new Set(); - // Extract hashtags: #hashtag - const hashtagMatches = content.match(/#([a-zA-Z0-9_]+)/g) || []; - hashtagMatches.forEach(match => { - const tag = match.substring(1).toLowerCase(); + // Remove code blocks first to avoid matching inside them + const codeBlockPattern = /```[\s\S]*?```/g; + const inlineCodePattern = /`[^`]+`/g; + const urlPattern = /https?:\/\/[^\s<>"']+/g; + + let processedContent = content + .replace(codeBlockPattern, '') // Remove code blocks + .replace(inlineCodePattern, '') // Remove inline code + .replace(urlPattern, ''); // Remove URLs + + // Extract hashtags: #hashtag (word boundary to avoid matching in URLs) + const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g; + let match; + + while ((match = hashtagPattern.exec(processedContent)) !== null) { + const tag = match[1].toLowerCase(); if (!seen.has(tag)) { hashtags.push(tag); seen.add(tag); } - }); + } return hashtags; } @@ -104,39 +117,35 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string const links: Array<{ url: string; text: string; isExternal: boolean }> = []; const seen = new Set(); - // Extract markdown links: [text](url) - const markdownLinks = content.match(/\[([^\]]+)\]\(([^)]+)\)/g) || []; - markdownLinks.forEach(match => { - const linkMatch = match.match(/\[([^\]]+)\]\(([^)]+)\)/); - if (linkMatch) { - const [, text, url] = linkMatch; - if (!seen.has(url) && !isNostrUrl(url)) { - seen.add(url); - links.push({ - url, - text, - isExternal: isExternalUrl(url, linkBaseURL), - }); - } + // Extract markdown links: [text](url) - optimized to avoid double matching + const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g; + let markdownMatch; + while ((markdownMatch = markdownLinkPattern.exec(content)) !== null) { + const [, text, url] = markdownMatch; + if (!seen.has(url) && !isNostrUrl(url)) { + seen.add(url); + links.push({ + url, + text, + isExternal: isExternalUrl(url, linkBaseURL), + }); } - }); + } - // Extract asciidoc links: link:url[text] - const asciidocLinks = content.match(/link:([^\[]+)\[([^\]]+)\]/g) || []; - asciidocLinks.forEach(match => { - const linkMatch = match.match(/link:([^\[]+)\[([^\]]+)\]/); - if (linkMatch) { - const [, url, text] = linkMatch; - if (!seen.has(url) && !isNostrUrl(url)) { - seen.add(url); - links.push({ - url, - text, - isExternal: isExternalUrl(url, linkBaseURL), - }); - } + // Extract asciidoc links: link:url[text] - optimized to avoid double matching + const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g; + let asciidocMatch; + while ((asciidocMatch = asciidocLinkPattern.exec(content)) !== null) { + const [, url, text] = asciidocMatch; + if (!seen.has(url) && !isNostrUrl(url)) { + seen.add(url); + links.push({ + url, + text, + isExternal: isExternalUrl(url, linkBaseURL), + }); } - }); + } // Extract raw URLs (basic pattern) const urlPattern = /https?:\/\/[^\s<>"']+/g; @@ -162,29 +171,31 @@ function extractMedia(content: string): string[] { const media: string[] = []; const seen = new Set(); - // Extract markdown images: ![alt](url) - const imageMatches = content.match(/!\[[^\]]*\]\(([^)]+)\)/g) || []; - imageMatches.forEach(match => { - const url = match.match(/!\[[^\]]*\]\(([^)]+)\)/)?.[1]; + // Extract markdown images: ![alt](url) - optimized to avoid double matching + const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g; + let markdownImageMatch; + while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) { + const url = markdownImageMatch[1]; if (url && !seen.has(url)) { if (isImageUrl(url) || isVideoUrl(url)) { media.push(url); seen.add(url); } } - }); + } - // Extract asciidoc images: image::url[alt] - const asciidocImageMatches = content.match(/image::([^\[]+)\[/g) || []; - asciidocImageMatches.forEach(match => { - const url = match.match(/image::([^\[]+)\[/)?.[1]; + // Extract asciidoc images: image::url[alt] - optimized to avoid double matching + const asciidocImagePattern = /image::([^\[]+)\[/g; + let asciidocImageMatch; + while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) { + const url = asciidocImageMatch[1]; if (url && !seen.has(url)) { if (isImageUrl(url) || isVideoUrl(url)) { media.push(url); seen.add(url); } } - }); + } // Extract raw image/video URLs const urlPattern = /https?:\/\/[^\s<>"']+/g; diff --git a/src/processors/asciidoc.ts b/src/processors/asciidoc.ts index bae2c02..5155214 100644 --- a/src/processors/asciidoc.ts +++ b/src/processors/asciidoc.ts @@ -120,9 +120,18 @@ export async function processAsciidoc( media: [], }; } catch (error) { - // Fallback to plain text + // Fallback to plain text with error logging + const errorMessage = error instanceof Error ? error.message : String(error); + // Use process.stderr.write for Node.js compatibility instead of console.error + if (typeof process !== 'undefined' && process.stderr) { + process.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`); + } + + // Escape HTML in content for safe display + const escapedContent = sanitizeHTML(content); + return { - content: `

${sanitizeHTML(content)}

`, + content: `

${escapedContent}

`, tableOfContents: '', hasLaTeX: false, hasMusicalNotation: false, diff --git a/src/processors/html-postprocess.ts b/src/processors/html-postprocess.ts index 194d1ae..79e1cbd 100644 --- a/src/processors/html-postprocess.ts +++ b/src/processors/html-postprocess.ts @@ -19,7 +19,16 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}): // Convert hashtag links to HTML processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => { - return `${displayText}`; + // URL encode the hashtag to prevent XSS + const encodedHashtag = encodeURIComponent(normalizedHashtag); + // HTML escape the display text + const escapedDisplay = displayText + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); + return `${escapedDisplay}`; }); // Convert wikilink:dtag[display] format to HTML @@ -105,7 +114,7 @@ function processImages(html: string): string { let updatedAttributes = attributes; if (updatedAttributes.match(/class=["']/i)) { - updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => { + updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => { const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim(); const newClasses = cleanedClasses ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in` diff --git a/src/processors/html-utils.ts b/src/processors/html-utils.ts index 2e56cba..10edbfe 100644 --- a/src/processors/html-utils.ts +++ b/src/processors/html-utils.ts @@ -32,14 +32,14 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri return { toc: '', contentWithoutTOC: html }; } - // Find the matching closing tag by counting div tags + // Find the matching closing tag by counting div/nav tags const searchStart = tocStartIdx + tocStartTag.length; let depth = 1; let i = searchStart; while (i < html.length && depth > 0) { // Look for opening or closing div/nav tags - if (i + 4 < html.length && html.substring(i, i + 4) === '', i); if (closeIdx === -1) break; - // Check if it's self-closing - if (html[closeIdx - 1] !== '/') { + // Check if it's self-closing (look for /> before the >) + const tagContent = html.substring(i, closeIdx); + if (!tagContent.endsWith('/')) { depth++; } i = closeIdx + 1; } - } else if (i + 5 < html.length && html.substring(i, i + 5) === '', i); if (closeIdx === -1) break; i = closeIdx + 1; - } else if (i + 5 < html.length && html.substring(i, i + 5) === '', i); if (closeIdx === -1) break; i = closeIdx + 1; + } else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '', i); + if (closeIdx === -1) break; + const tagContent = html.substring(i, closeIdx); + if (!tagContent.endsWith('/')) { + depth++; + } + i = closeIdx + 1; } else { i++; } @@ -119,15 +129,30 @@ export function sanitizeHTML(html: string): string { /** * Processes HTML links to add target="_blank" to external links + * This function is available for use but not currently called automatically. + * It can be used in post-processing if needed. */ export function processLinks(html: string, linkBaseURL: string): string { // Extract domain from linkBaseURL for comparison let linkBaseDomain = ''; if (linkBaseURL) { - const url = linkBaseURL.replace(/^https?:\/\//, ''); - const parts = url.split('/'); - if (parts.length > 0) { - linkBaseDomain = parts[0]; + try { + // Use URL constructor if available (Node.js 10+) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const URLConstructor = (globalThis as any).URL; + if (URLConstructor) { + const url = new URLConstructor(linkBaseURL); + linkBaseDomain = url.hostname; + } else { + throw new Error('URL not available'); + } + } catch { + // Fallback to simple string parsing if URL constructor fails + const url = linkBaseURL.replace(/^https?:\/\//, ''); + const parts = url.split('/'); + if (parts.length > 0) { + linkBaseDomain = parts[0]; + } } } @@ -140,9 +165,25 @@ export function processLinks(html: string, linkBaseURL: string): string { if (isExternal) { // Check if it's pointing to our own domain - if (linkBaseDomain && href.includes(linkBaseDomain)) { - // Same domain - open in same tab (remove any existing target attribute) - return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); + if (linkBaseDomain) { + try { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const URLConstructor = (globalThis as any).URL; + if (URLConstructor) { + const hrefUrl = new URLConstructor(href); + if (hrefUrl.hostname === linkBaseDomain) { + // Same domain - open in same tab (remove any existing target attribute) + return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); + } + } else { + throw new Error('URL not available'); + } + } catch { + // If URL parsing fails, use simple string check + if (href.includes(linkBaseDomain)) { + return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); + } + } } // External link - add target="_blank" and rel="noopener noreferrer" if not already present diff --git a/tsconfig.json b/tsconfig.json index 2f8cd7c..4aed97e 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -3,6 +3,7 @@ "target": "ES2020", "module": "commonjs", "lib": ["ES2020"], + "types": ["node"], "outDir": "./dist", "rootDir": "./src", "strict": true,