Browse Source

bug-fixes

master
Silberengel 2 weeks ago
parent
commit
9708d879b4
  1. 4
      README.md
  2. 22
      src/converters/to-asciidoc.ts
  3. 18
      src/detector.ts
  4. 105
      src/extractors/metadata.ts
  5. 13
      src/processors/asciidoc.ts
  6. 13
      src/processors/html-postprocess.ts
  7. 69
      src/processors/html-utils.ts
  8. 1
      tsconfig.json

4
README.md

@ -3,9 +3,7 @@
A super-parser for Nostr event content that handles multiple content formats including AsciiDoc, Markdown, code syntax highlighting, LaTeX, musical notation, and `nostr:` prefixed addresses. A super-parser for Nostr event content that handles multiple content formats including AsciiDoc, Markdown, code syntax highlighting, LaTeX, musical notation, and `nostr:` prefixed addresses.
Built with TypeScript/JavaScript using: Built with TypeScript/JavaScript using:
- **asciidoctor.js** for AsciiDoc processing - **@asciidoctor/core** for AsciiDoc processing (includes Markdown-to-AsciiDoc conversion and highlight.js integration)
- **marked** for Markdown processing
- **highlight.js** for code syntax highlighting
## Features ## Features

22
src/converters/to-asciidoc.ts

@ -67,8 +67,7 @@ function convertMarkdownToAsciidoc(content: string): string {
asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2'); asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 =='); asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 ==');
// Preserve nostr: addresses temporarily // Note: nostr: addresses are processed later in processNostrAddresses
asciidoc = asciidoc.replace(/nostr:([a-z0-9]+)/g, 'nostr:$1');
// Convert headers // Convert headers
asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======'); asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======');
@ -89,8 +88,8 @@ function convertMarkdownToAsciidoc(content: string): string {
asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript
asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript
// Convert code blocks // Convert code blocks (handle both \n and \r\n line endings)
asciidoc = asciidoc.replace(/```(\w+)?\n([\s\S]*?)\n```/g, (_match, lang, code) => { asciidoc = asciidoc.replace(/```(\w+)?\r?\n([\s\S]*?)\r?\n```/g, (_match, lang, code) => {
const trimmedCode = code.trim(); const trimmedCode = code.trim();
if (trimmedCode.length === 0) return ''; if (trimmedCode.length === 0) return '';
@ -211,11 +210,15 @@ function convertMarkdownToAsciidoc(content: string): string {
/** /**
* Converts plain text to AsciiDoc format * Converts plain text to AsciiDoc format
* Preserves line breaks by converting single newlines to line continuations
*/ */
function convertPlainTextToAsciidoc(content: string): string { function convertPlainTextToAsciidoc(content: string): string {
// Preserve double newlines (paragraph breaks)
// Convert single newlines to line continuations ( +\n)
return content return content
.replace(/\n\n/g, '\n\n') .replace(/\r\n/g, '\n') // Normalize line endings
.replace(/\n/g, ' +\n'); .replace(/\n\n+/g, '\n\n') // Normalize multiple newlines to double
.replace(/([^\n])\n([^\n])/g, '$1 +\n$2'); // Single newlines become line continuations
} }
/** /**
@ -254,10 +257,13 @@ function processWikilinks(content: string, linkBaseURL: string): string {
/** /**
* Processes nostr: addresses * Processes nostr: addresses
* Converts to link:nostr:...[...] format * Converts to link:nostr:...[...] format
* Valid bech32 prefixes: npub, nprofile, nevent, naddr, note
*/ */
function processNostrAddresses(content: string, linkBaseURL: string): string { function processNostrAddresses(content: string, linkBaseURL: string): string {
// Match nostr: followed by valid bech32 string // Match nostr: followed by valid bech32 prefix and identifier
return content.replace(/nostr:([a-z0-9]+[a-z0-9]{6,})/g, (_match, bech32Id) => { // Bech32 format: prefix + separator (1) + data (at least 6 chars for valid identifiers)
const nostrPattern = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
return content.replace(nostrPattern, (_match, bech32Id) => {
return `link:nostr:${bech32Id}[${bech32Id}]`; return `link:nostr:${bech32Id}[${bech32Id}]`;
}); });
} }

18
src/detector.ts

@ -27,21 +27,19 @@ export function detectFormat(content: string): ContentFormat {
} }
} }
// Check for Markdown indicators // Check for Markdown indicators (more specific patterns to avoid false positives)
const markdownIndicators = [ const markdownIndicators = [
'# ', // Heading /^#{1,6}\s+/m, // Heading at start of line
'## ', // Subheading /```[\s\S]*?```/, // Code block
'```', // Code block /\*\*[^*]+\*\*/, // Bold text
'**', // Bold /^[-*+]\s+/m, // List item at start of line
'*', // Italic or list /!\[[^\]]*\]\([^)]+\)/, // Image syntax
'- ', // List item /\[[^\]]+\]\([^)]+\)/, // Link syntax
'![', // Image
'[', // Link
]; ];
let markdownScore = 0; let markdownScore = 0;
for (const indicator of markdownIndicators) { for (const indicator of markdownIndicators) {
if (content.includes(indicator)) { if (indicator.test(content)) {
markdownScore++; markdownScore++;
} }
} }

105
src/extractors/metadata.ts

@ -28,8 +28,8 @@ function extractNostrLinks(content: string): NostrLink[] {
const nostrLinks: NostrLink[] = []; const nostrLinks: NostrLink[] = [];
const seen = new Set<string>(); const seen = new Set<string>();
// Extract nostr: prefixed links // Extract nostr: prefixed links (valid bech32 format)
const nostrMatches = content.match(/nostr:([a-z0-9]+[a-z0-9]{6,})/g) || []; const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || [];
nostrMatches.forEach(match => { nostrMatches.forEach(match => {
const id = match.substring(6); // Remove 'nostr:' const id = match.substring(6); // Remove 'nostr:'
const type = getNostrType(id); const type = getNostrType(id);
@ -79,20 +79,33 @@ function extractWikilinks(content: string): Wikilink[] {
/** /**
* Extract hashtags from content * Extract hashtags from content
* Excludes hashtags in URLs, code blocks, and inline code
*/ */
function extractHashtags(content: string): string[] { function extractHashtags(content: string): string[] {
const hashtags: string[] = []; const hashtags: string[] = [];
const seen = new Set<string>(); const seen = new Set<string>();
// Extract hashtags: #hashtag // Remove code blocks first to avoid matching inside them
const hashtagMatches = content.match(/#([a-zA-Z0-9_]+)/g) || []; const codeBlockPattern = /```[\s\S]*?```/g;
hashtagMatches.forEach(match => { const inlineCodePattern = /`[^`]+`/g;
const tag = match.substring(1).toLowerCase(); const urlPattern = /https?:\/\/[^\s<>"']+/g;
let processedContent = content
.replace(codeBlockPattern, '') // Remove code blocks
.replace(inlineCodePattern, '') // Remove inline code
.replace(urlPattern, ''); // Remove URLs
// Extract hashtags: #hashtag (\B requires that '#' is not preceded by a word character, so fragments like foo#bar are skipped)
const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g;
let match;
while ((match = hashtagPattern.exec(processedContent)) !== null) {
const tag = match[1].toLowerCase();
if (!seen.has(tag)) { if (!seen.has(tag)) {
hashtags.push(tag); hashtags.push(tag);
seen.add(tag); seen.add(tag);
} }
}); }
return hashtags; return hashtags;
} }
@ -104,39 +117,35 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string
const links: Array<{ url: string; text: string; isExternal: boolean }> = []; const links: Array<{ url: string; text: string; isExternal: boolean }> = [];
const seen = new Set<string>(); const seen = new Set<string>();
// Extract markdown links: [text](url) // Extract markdown links: [text](url) - optimized to avoid double matching
const markdownLinks = content.match(/\[([^\]]+)\]\(([^)]+)\)/g) || []; const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
markdownLinks.forEach(match => { let markdownMatch;
const linkMatch = match.match(/\[([^\]]+)\]\(([^)]+)\)/); while ((markdownMatch = markdownLinkPattern.exec(content)) !== null) {
if (linkMatch) { const [, text, url] = markdownMatch;
const [, text, url] = linkMatch; if (!seen.has(url) && !isNostrUrl(url)) {
if (!seen.has(url) && !isNostrUrl(url)) { seen.add(url);
seen.add(url); links.push({
links.push({ url,
url, text,
text, isExternal: isExternalUrl(url, linkBaseURL),
isExternal: isExternalUrl(url, linkBaseURL), });
});
}
} }
}); }
// Extract asciidoc links: link:url[text] // Extract asciidoc links: link:url[text] - optimized to avoid double matching
const asciidocLinks = content.match(/link:([^\[]+)\[([^\]]+)\]/g) || []; const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g;
asciidocLinks.forEach(match => { let asciidocMatch;
const linkMatch = match.match(/link:([^\[]+)\[([^\]]+)\]/); while ((asciidocMatch = asciidocLinkPattern.exec(content)) !== null) {
if (linkMatch) { const [, url, text] = asciidocMatch;
const [, url, text] = linkMatch; if (!seen.has(url) && !isNostrUrl(url)) {
if (!seen.has(url) && !isNostrUrl(url)) { seen.add(url);
seen.add(url); links.push({
links.push({ url,
url, text,
text, isExternal: isExternalUrl(url, linkBaseURL),
isExternal: isExternalUrl(url, linkBaseURL), });
});
}
} }
}); }
// Extract raw URLs (basic pattern) // Extract raw URLs (basic pattern)
const urlPattern = /https?:\/\/[^\s<>"']+/g; const urlPattern = /https?:\/\/[^\s<>"']+/g;
@ -162,29 +171,31 @@ function extractMedia(content: string): string[] {
const media: string[] = []; const media: string[] = [];
const seen = new Set<string>(); const seen = new Set<string>();
// Extract markdown images: ![alt](url) // Extract markdown images: ![alt](url) - optimized to avoid double matching
const imageMatches = content.match(/!\[[^\]]*\]\(([^)]+)\)/g) || []; const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g;
imageMatches.forEach(match => { let markdownImageMatch;
const url = match.match(/!\[[^\]]*\]\(([^)]+)\)/)?.[1]; while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) {
const url = markdownImageMatch[1];
if (url && !seen.has(url)) { if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) { if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url); media.push(url);
seen.add(url); seen.add(url);
} }
} }
}); }
// Extract asciidoc images: image::url[alt] // Extract asciidoc images: image::url[alt] - optimized to avoid double matching
const asciidocImageMatches = content.match(/image::([^\[]+)\[/g) || []; const asciidocImagePattern = /image::([^\[]+)\[/g;
asciidocImageMatches.forEach(match => { let asciidocImageMatch;
const url = match.match(/image::([^\[]+)\[/)?.[1]; while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) {
const url = asciidocImageMatch[1];
if (url && !seen.has(url)) { if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) { if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url); media.push(url);
seen.add(url); seen.add(url);
} }
} }
}); }
// Extract raw image/video URLs // Extract raw image/video URLs
const urlPattern = /https?:\/\/[^\s<>"']+/g; const urlPattern = /https?:\/\/[^\s<>"']+/g;

13
src/processors/asciidoc.ts

@ -120,9 +120,18 @@ export async function processAsciidoc(
media: [], media: [],
}; };
} catch (error) { } catch (error) {
// Fallback to plain text // Fallback to plain text with error logging
const errorMessage = error instanceof Error ? error.message : String(error);
// Use process.stderr.write for Node.js compatibility instead of console.error
if (typeof process !== 'undefined' && process.stderr) {
process.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
}
// Escape HTML in content for safe display
const escapedContent = sanitizeHTML(content);
return { return {
content: `<p>${sanitizeHTML(content)}</p>`, content: `<p>${escapedContent}</p>`,
tableOfContents: '', tableOfContents: '',
hasLaTeX: false, hasLaTeX: false,
hasMusicalNotation: false, hasMusicalNotation: false,

13
src/processors/html-postprocess.ts

@ -19,7 +19,16 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}):
// Convert hashtag links to HTML // Convert hashtag links to HTML
processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => { processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
return `<a href="/notes?t=${normalizedHashtag}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${displayText}</a>`; // URL encode the hashtag to prevent XSS
const encodedHashtag = encodeURIComponent(normalizedHashtag);
// HTML escape the display text
const escapedDisplay = displayText
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
return `<a href="/notes?t=${encodedHashtag}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${escapedDisplay}</a>`;
}); });
// Convert wikilink:dtag[display] format to HTML // Convert wikilink:dtag[display] format to HTML
@ -105,7 +114,7 @@ function processImages(html: string): string {
let updatedAttributes = attributes; let updatedAttributes = attributes;
if (updatedAttributes.match(/class=["']/i)) { if (updatedAttributes.match(/class=["']/i)) {
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => { updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim(); const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
const newClasses = cleanedClasses const newClasses = cleanedClasses
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in` ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`

69
src/processors/html-utils.ts

@ -32,14 +32,14 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri
return { toc: '', contentWithoutTOC: html }; return { toc: '', contentWithoutTOC: html };
} }
// Find the matching closing tag by counting div tags // Find the matching closing tag by counting div/nav tags
const searchStart = tocStartIdx + tocStartTag.length; const searchStart = tocStartIdx + tocStartTag.length;
let depth = 1; let depth = 1;
let i = searchStart; let i = searchStart;
while (i < html.length && depth > 0) { while (i < html.length && depth > 0) {
// Look for opening or closing div/nav tags // Look for opening or closing div/nav tags
if (i + 4 < html.length && html.substring(i, i + 4) === '<div') { if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<div') {
// Check if it's a closing tag // Check if it's a closing tag
if (i + 5 < html.length && html[i + 4] === '/') { if (i + 5 < html.length && html[i + 4] === '/') {
depth--; depth--;
@ -47,25 +47,35 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri
if (closeIdx === -1) break; if (closeIdx === -1) break;
i = closeIdx + 1; i = closeIdx + 1;
} else { } else {
// Opening tag - find the end // Opening tag - find the end (handle attributes and self-closing)
const closeIdx = html.indexOf('>', i); const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break; if (closeIdx === -1) break;
// Check if it's self-closing // Check if it's self-closing (a '/' immediately before the closing '>')
if (html[closeIdx - 1] !== '/') { const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++; depth++;
} }
i = closeIdx + 1; i = closeIdx + 1;
} }
} else if (i + 5 < html.length && html.substring(i, i + 5) === '</div') { } else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</div') {
depth--; depth--;
const closeIdx = html.indexOf('>', i); const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break; if (closeIdx === -1) break;
i = closeIdx + 1; i = closeIdx + 1;
} else if (i + 5 < html.length && html.substring(i, i + 5) === '</nav') { } else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</nav') {
depth--; depth--;
const closeIdx = html.indexOf('>', i); const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break; if (closeIdx === -1) break;
i = closeIdx + 1; i = closeIdx + 1;
} else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<nav') {
// Handle opening nav tags
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++;
}
i = closeIdx + 1;
} else { } else {
i++; i++;
} }
@ -119,15 +129,30 @@ export function sanitizeHTML(html: string): string {
/** /**
* Processes HTML links to add target="_blank" to external links * Processes HTML links to add target="_blank" to external links
* This function is available for use but not currently called automatically.
* It can be used in post-processing if needed.
*/ */
export function processLinks(html: string, linkBaseURL: string): string { export function processLinks(html: string, linkBaseURL: string): string {
// Extract domain from linkBaseURL for comparison // Extract domain from linkBaseURL for comparison
let linkBaseDomain = ''; let linkBaseDomain = '';
if (linkBaseURL) { if (linkBaseURL) {
const url = linkBaseURL.replace(/^https?:\/\//, ''); try {
const parts = url.split('/'); // Use URL constructor if available (Node.js 10+)
if (parts.length > 0) { // eslint-disable-next-line @typescript-eslint/no-explicit-any
linkBaseDomain = parts[0]; const URLConstructor = (globalThis as any).URL;
if (URLConstructor) {
const url = new URLConstructor(linkBaseURL);
linkBaseDomain = url.hostname;
} else {
throw new Error('URL not available');
}
} catch {
// Fallback to simple string parsing if URL constructor fails
const url = linkBaseURL.replace(/^https?:\/\//, '');
const parts = url.split('/');
if (parts.length > 0) {
linkBaseDomain = parts[0];
}
} }
} }
@ -140,9 +165,25 @@ export function processLinks(html: string, linkBaseURL: string): string {
if (isExternal) { if (isExternal) {
// Check if it's pointing to our own domain // Check if it's pointing to our own domain
if (linkBaseDomain && href.includes(linkBaseDomain)) { if (linkBaseDomain) {
// Same domain - open in same tab (remove any existing target attribute) try {
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); // eslint-disable-next-line @typescript-eslint/no-explicit-any
const URLConstructor = (globalThis as any).URL;
if (URLConstructor) {
const hrefUrl = new URLConstructor(href);
if (hrefUrl.hostname === linkBaseDomain) {
// Same domain - open in same tab (remove any existing target attribute)
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
} else {
throw new Error('URL not available');
}
} catch {
// If URL parsing fails, use simple string check
if (href.includes(linkBaseDomain)) {
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
}
} }
// External link - add target="_blank" and rel="noopener noreferrer" if not already present // External link - add target="_blank" and rel="noopener noreferrer" if not already present

1
tsconfig.json

@ -3,6 +3,7 @@
"target": "ES2020", "target": "ES2020",
"module": "commonjs", "module": "commonjs",
"lib": ["ES2020"], "lib": ["ES2020"],
"types": ["node"],
"outDir": "./dist", "outDir": "./dist",
"rootDir": "./src", "rootDir": "./src",
"strict": true, "strict": true,

Loading…
Cancel
Save