bug-fixes

3 months ago · 9708d879b4
8 changed files with 159 additions and 86 deletions
--- a/README.md
+++ b/README.md
@@ -3,9 +3,7 @@
 A super-parser for Nostr event content that handles multiple content formats including AsciiDoc, Markdown, code syntax highlighting, LaTeX, musical notation, and `nostr:` prefixed addresses.

 Built with TypeScript/JavaScript using:
- **asciidoctor.js** for AsciiDoc processing
- **marked** for Markdown processing
- **highlight.js** for code syntax highlighting
+- **@asciidoctor/core** for AsciiDoc processing (includes Markdown-to-AsciiDoc conversion and highlight.js integration)

 ## Features

--- a/src/converters/to-asciidoc.ts
+++ b/src/converters/to-asciidoc.ts
@@ -67,8 +67,7 @@ function convertMarkdownToAsciidoc(content: string): string {
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2');
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 ==');

-  // Preserve nostr: addresses temporarily
-  asciidoc = asciidoc.replace(/nostr:([a-z0-9]+)/g, 'nostr:$1');
+  // Note: nostr: addresses are processed later in processNostrAddresses

  // Convert headers
  asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======');
@@ -89,8 +88,8 @@ function convertMarkdownToAsciidoc(content: string): string {
  asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript
  asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript

-  // Convert code blocks
-  asciidoc = asciidoc.replace(/```(\w+)?\n([\s\S]*?)\n```/g, (_match, lang, code) => {
+  // Convert code blocks (handle both \n and \r\n line endings)
+  asciidoc = asciidoc.replace(/```(\w+)?\r?\n([\s\S]*?)\r?\n```/g, (_match, lang, code) => {
    const trimmedCode = code.trim();
    if (trimmedCode.length === 0) return '';
    
@@ -211,11 +210,15 @@ function convertMarkdownToAsciidoc(content: string): string {

 /**
 * Converts plain text to AsciiDoc format
+ * Preserves line breaks by converting single newlines to AsciiDoc hard line breaks (a trailing " +")
 */
 function convertPlainTextToAsciidoc(content: string): string {
+  // Preserve double newlines (paragraph breaks)
+  // Convert single newlines to AsciiDoc hard line breaks ( +\n)
  return content
-    .replace(/\n\n/g, '\n\n')
-    .replace(/\n/g, ' +\n');
+    .replace(/\r\n/g, '\n') // Normalize line endings
+    .replace(/\n\n+/g, '\n\n') // Normalize multiple newlines to double
+    .replace(/([^\n])\n([^\n])/g, '$1 +\n$2'); // Single newlines become line continuations
 }

 /**
@@ -254,10 +257,13 @@ function processWikilinks(content: string, linkBaseURL: string): string {
 /**
 * Processes nostr: addresses
 * Converts to link:nostr:...[...] format
+ * Valid bech32 prefixes: npub, nprofile, nevent, naddr, note
 */
 function processNostrAddresses(content: string, linkBaseURL: string): string {
-  // Match nostr: followed by valid bech32 string
-  return content.replace(/nostr:([a-z0-9]+[a-z0-9]{6,})/g, (_match, bech32Id) => {
+  // Match nostr: followed by valid bech32 prefix and identifier
+  // Bech32 format: prefix + separator (1) + data (at least 6 chars for valid identifiers)
+  const nostrPattern = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
+  return content.replace(nostrPattern, (_match, bech32Id) => {
    return `link:nostr:${bech32Id}[${bech32Id}]`;
  });
 }
--- a/src/detector.ts
+++ b/src/detector.ts
@@ -27,21 +27,19 @@ export function detectFormat(content: string): ContentFormat {
    }
  }

-  // Check for Markdown indicators
+  // Check for Markdown indicators (more specific patterns to avoid false positives)
  const markdownIndicators = [
-    '# ',           // Heading
-    '## ',          // Subheading
-    '```',          // Code block
-    '**',           // Bold
-    '*',            // Italic or list
-    '- ',           // List item
-    '![',           // Image
-    '[',            // Link
+    /^#{1,6}\s+/m,           // Heading at start of line
+    /```[\s\S]*?```/,        // Code block
+    /\*\*[^*]+\*\*/,         // Bold text
+    /^[-*+]\s+/m,            // List item at start of line
+    /!\[[^\]]*\]\([^)]+\)/,  // Image syntax
+    /\[[^\]]+\]\([^)]+\)/,   // Link syntax
  ];

  let markdownScore = 0;
  for (const indicator of markdownIndicators) {
-    if (content.includes(indicator)) {
+    if (indicator.test(content)) {
      markdownScore++;
    }
  }
--- a/src/extractors/metadata.ts
+++ b/src/extractors/metadata.ts
@@ -28,8 +28,8 @@ function extractNostrLinks(content: string): NostrLink[] {
  const nostrLinks: NostrLink[] = [];
  const seen = new Set<string>();

-  // Extract nostr: prefixed links
-  const nostrMatches = content.match(/nostr:([a-z0-9]+[a-z0-9]{6,})/g) || [];
+  // Extract nostr: prefixed links (valid bech32 format)
+  const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || [];
  nostrMatches.forEach(match => {
    const id = match.substring(6); // Remove 'nostr:'
    const type = getNostrType(id);
@@ -79,20 +79,33 @@ function extractWikilinks(content: string): Wikilink[] {

 /**
 * Extract hashtags from content
+ * Excludes hashtags in URLs, code blocks, and inline code
 */
 function extractHashtags(content: string): string[] {
  const hashtags: string[] = [];
  const seen = new Set<string>();

-  // Extract hashtags: #hashtag
-  const hashtagMatches = content.match(/#([a-zA-Z0-9_]+)/g) || [];
-  hashtagMatches.forEach(match => {
-    const tag = match.substring(1).toLowerCase();
+  // Remove code blocks first to avoid matching inside them
+  const codeBlockPattern = /```[\s\S]*?```/g;
+  const inlineCodePattern = /`[^`]+`/g;
+  const urlPattern = /https?:\/\/[^\s<>"']+/g;
+  
+  let processedContent = content
+    .replace(codeBlockPattern, '') // Remove code blocks
+    .replace(inlineCodePattern, '') // Remove inline code
+    .replace(urlPattern, ''); // Remove URLs
+
+  // Extract hashtags: #hashtag (\B is a non-word-boundary assertion: the '#' must not directly follow a word character, so "foo#bar" is skipped)
+  const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g;
+  let match;
+  
+  while ((match = hashtagPattern.exec(processedContent)) !== null) {
+    const tag = match[1].toLowerCase();
    if (!seen.has(tag)) {
      hashtags.push(tag);
      seen.add(tag);
    }
-  });
+  }

  return hashtags;
 }
@@ -104,12 +117,11 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string
  const links: Array<{ url: string; text: string; isExternal: boolean }> = [];
  const seen = new Set<string>();

-  // Extract markdown links: [text](url)
-  const markdownLinks = content.match(/\[([^\]]+)\]\(([^)]+)\)/g) || [];
-  markdownLinks.forEach(match => {
-    const linkMatch = match.match(/\[([^\]]+)\]\(([^)]+)\)/);
-    if (linkMatch) {
-      const [, text, url] = linkMatch;
+  // Extract markdown links: [text](url) - optimized to avoid double matching
+  const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
+  let markdownMatch;
+  while ((markdownMatch = markdownLinkPattern.exec(content)) !== null) {
+    const [, text, url] = markdownMatch;
    if (!seen.has(url) && !isNostrUrl(url)) {
      seen.add(url);
      links.push({
@@ -119,14 +131,12 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string
      });
    }
  }
-  });

-  // Extract asciidoc links: link:url[text]
-  const asciidocLinks = content.match(/link:([^\[]+)\[([^\]]+)\]/g) || [];
-  asciidocLinks.forEach(match => {
-    const linkMatch = match.match(/link:([^\[]+)\[([^\]]+)\]/);
-    if (linkMatch) {
-      const [, url, text] = linkMatch;
+  // Extract asciidoc links: link:url[text] - optimized to avoid double matching
+  const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g;
+  let asciidocMatch;
+  while ((asciidocMatch = asciidocLinkPattern.exec(content)) !== null) {
+    const [, url, text] = asciidocMatch;
    if (!seen.has(url) && !isNostrUrl(url)) {
      seen.add(url);
      links.push({
@@ -136,7 +146,6 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string
      });
    }
  }
-  });

  // Extract raw URLs (basic pattern)
  const urlPattern = /https?:\/\/[^\s<>"']+/g;
@@ -162,29 +171,31 @@ function extractMedia(content: string): string[] {
  const media: string[] = [];
  const seen = new Set<string>();

-  // Extract markdown images: ![alt](url)
-  const imageMatches = content.match(/!\[[^\]]*\]\(([^)]+)\)/g) || [];
-  imageMatches.forEach(match => {
-    const url = match.match(/!\[[^\]]*\]\(([^)]+)\)/)?.[1];
+  // Extract markdown images: ![alt](url) - optimized to avoid double matching
+  const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g;
+  let markdownImageMatch;
+  while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) {
+    const url = markdownImageMatch[1];
    if (url && !seen.has(url)) {
      if (isImageUrl(url) || isVideoUrl(url)) {
        media.push(url);
        seen.add(url);
      }
    }
-  });
+  }

-  // Extract asciidoc images: image::url[alt]
-  const asciidocImageMatches = content.match(/image::([^\[]+)\[/g) || [];
-  asciidocImageMatches.forEach(match => {
-    const url = match.match(/image::([^\[]+)\[/)?.[1];
+  // Extract asciidoc images: image::url[alt] - optimized to avoid double matching
+  const asciidocImagePattern = /image::([^\[]+)\[/g;
+  let asciidocImageMatch;
+  while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) {
+    const url = asciidocImageMatch[1];
    if (url && !seen.has(url)) {
      if (isImageUrl(url) || isVideoUrl(url)) {
        media.push(url);
        seen.add(url);
      }
    }
-  });
+  }

  // Extract raw image/video URLs
  const urlPattern = /https?:\/\/[^\s<>"']+/g;
--- a/src/processors/asciidoc.ts
+++ b/src/processors/asciidoc.ts
@@ -120,9 +120,18 @@ export async function processAsciidoc(
      media: [],
    };
  } catch (error) {
-    // Fallback to plain text
+    // Fallback to plain text with error logging
+    const errorMessage = error instanceof Error ? error.message : String(error);
+    // Use process.stderr.write for Node.js compatibility instead of console.error
+    if (typeof process !== 'undefined' && process.stderr) {
+      process.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
+    }
+    
+    // Escape HTML in content for safe display
+    const escapedContent = sanitizeHTML(content);
+    
    return {
-      content: `<p>${sanitizeHTML(content)}</p>`,
+      content: `<p>${escapedContent}</p>`,
      tableOfContents: '',
      hasLaTeX: false,
      hasMusicalNotation: false,
--- a/src/processors/html-postprocess.ts
+++ b/src/processors/html-postprocess.ts
@@ -19,7 +19,16 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}):

  // Convert hashtag links to HTML
  processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
-    return `<a href="/notes?t=${normalizedHashtag}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${displayText}</a>`;
+    // URL encode the hashtag to prevent XSS
+    const encodedHashtag = encodeURIComponent(normalizedHashtag);
+    // HTML escape the display text
+    const escapedDisplay = displayText
+      .replace(/&/g, '&amp;')
+      .replace(/</g, '&lt;')
+      .replace(/>/g, '&gt;')
+      .replace(/"/g, '&quot;')
+      .replace(/'/g, '&#39;');
+    return `<a href="/notes?t=${encodedHashtag}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${escapedDisplay}</a>`;
  });

  // Convert wikilink:dtag[display] format to HTML
@@ -105,7 +114,7 @@ function processImages(html: string): string {
    let updatedAttributes = attributes;
    
    if (updatedAttributes.match(/class=["']/i)) {
-      updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => {
+      updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
        const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
        const newClasses = cleanedClasses 
          ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
--- a/src/processors/html-utils.ts
+++ b/src/processors/html-utils.ts
@@ -32,14 +32,14 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri
    return { toc: '', contentWithoutTOC: html };
  }

-  // Find the matching closing tag by counting div tags
+  // Find the matching closing tag by counting div/nav tags
  const searchStart = tocStartIdx + tocStartTag.length;
  let depth = 1;
  let i = searchStart;

  while (i < html.length && depth > 0) {
    // Look for opening or closing div/nav tags
-    if (i + 4 < html.length && html.substring(i, i + 4) === '<div') {
+    if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<div') {
      // Check if it's a closing tag
      if (i + 5 < html.length && html[i + 4] === '/') {
        depth--;
@@ -47,25 +47,35 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri
        if (closeIdx === -1) break;
        i = closeIdx + 1;
      } else {
-        // Opening tag - find the end
+        // Opening tag - find the end (handle attributes and self-closing)
        const closeIdx = html.indexOf('>', i);
        if (closeIdx === -1) break;
-        // Check if it's self-closing
-        if (html[closeIdx - 1] !== '/') {
+        // Check if it's self-closing (the tag text ends with '/' immediately before the closing '>')
+        const tagContent = html.substring(i, closeIdx);
+        if (!tagContent.endsWith('/')) {
          depth++;
        }
        i = closeIdx + 1;
      }
-    } else if (i + 5 < html.length && html.substring(i, i + 5) === '</div') {
+    } else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</div') {
      depth--;
      const closeIdx = html.indexOf('>', i);
      if (closeIdx === -1) break;
      i = closeIdx + 1;
-    } else if (i + 5 < html.length && html.substring(i, i + 5) === '</nav') {
+    } else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</nav') {
      depth--;
      const closeIdx = html.indexOf('>', i);
      if (closeIdx === -1) break;
      i = closeIdx + 1;
+    } else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<nav') {
+      // Handle opening nav tags
+      const closeIdx = html.indexOf('>', i);
+      if (closeIdx === -1) break;
+      const tagContent = html.substring(i, closeIdx);
+      if (!tagContent.endsWith('/')) {
+        depth++;
+      }
+      i = closeIdx + 1;
    } else {
      i++;
    }
@@ -119,17 +129,32 @@ export function sanitizeHTML(html: string): string {

 /**
 * Processes HTML links to add target="_blank" to external links
+ * This function is available for use but not currently called automatically.
+ * It can be used in post-processing if needed.
 */
 export function processLinks(html: string, linkBaseURL: string): string {
  // Extract domain from linkBaseURL for comparison
  let linkBaseDomain = '';
  if (linkBaseURL) {
+    try {
+      // Use URL constructor if available (Node.js 10+)
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const URLConstructor = (globalThis as any).URL;
+      if (URLConstructor) {
+        const url = new URLConstructor(linkBaseURL);
+        linkBaseDomain = url.hostname;
+      } else {
+        throw new Error('URL not available');
+      }
+    } catch {
+      // Fallback to simple string parsing if URL constructor fails
      const url = linkBaseURL.replace(/^https?:\/\//, '');
      const parts = url.split('/');
      if (parts.length > 0) {
        linkBaseDomain = parts[0];
      }
    }
+  }

  // Regex to match <a> tags with href attributes
  const linkRegex = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g;
@@ -140,10 +165,26 @@ export function processLinks(html: string, linkBaseURL: string): string {

    if (isExternal) {
      // Check if it's pointing to our own domain
-      if (linkBaseDomain && href.includes(linkBaseDomain)) {
+      if (linkBaseDomain) {
+        try {
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+          const URLConstructor = (globalThis as any).URL;
+          if (URLConstructor) {
+            const hrefUrl = new URLConstructor(href);
+            if (hrefUrl.hostname === linkBaseDomain) {
              // Same domain - open in same tab (remove any existing target attribute)
              return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
            }
+          } else {
+            throw new Error('URL not available');
+          }
+        } catch {
+          // If URL parsing fails, use simple string check
+          if (href.includes(linkBaseDomain)) {
+            return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
+          }
+        }
+      }

      // External link - add target="_blank" and rel="noopener noreferrer" if not already present
      if (!match.includes('target=')) {
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -3,6 +3,7 @@
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020"],
+    "types": ["node"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,