Browse Source

bug-fixes

master
Silberengel 2 weeks ago
parent
commit
9708d879b4
  1. 4
      README.md
  2. 22
      src/converters/to-asciidoc.ts
  3. 18
      src/detector.ts
  4. 73
      src/extractors/metadata.ts
  5. 13
      src/processors/asciidoc.ts
  6. 13
      src/processors/html-postprocess.ts
  7. 57
      src/processors/html-utils.ts
  8. 1
      tsconfig.json

4
README.md

@ -3,9 +3,7 @@ @@ -3,9 +3,7 @@
A super-parser for Nostr event content that handles multiple content formats including AsciiDoc, Markdown, code syntax highlighting, LaTeX, musical notation, and `nostr:` prefixed addresses.
Built with TypeScript/JavaScript using:
- **asciidoctor.js** for AsciiDoc processing
- **marked** for Markdown processing
- **highlight.js** for code syntax highlighting
- **@asciidoctor/core** for AsciiDoc processing (includes Markdown-to-AsciiDoc conversion and highlight.js integration)
## Features

22
src/converters/to-asciidoc.ts

@ -67,8 +67,7 @@ function convertMarkdownToAsciidoc(content: string): string { @@ -67,8 +67,7 @@ function convertMarkdownToAsciidoc(content: string): string {
asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 ==');
// Preserve nostr: addresses temporarily
asciidoc = asciidoc.replace(/nostr:([a-z0-9]+)/g, 'nostr:$1');
// Note: nostr: addresses are processed later in processNostrAddresses
// Convert headers
asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======');
@ -89,8 +88,8 @@ function convertMarkdownToAsciidoc(content: string): string { @@ -89,8 +88,8 @@ function convertMarkdownToAsciidoc(content: string): string {
asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript
asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript
// Convert code blocks
asciidoc = asciidoc.replace(/```(\w+)?\n([\s\S]*?)\n```/g, (_match, lang, code) => {
// Convert code blocks (handle both \n and \r\n line endings)
asciidoc = asciidoc.replace(/```(\w+)?\r?\n([\s\S]*?)\r?\n```/g, (_match, lang, code) => {
const trimmedCode = code.trim();
if (trimmedCode.length === 0) return '';
@ -211,11 +210,15 @@ function convertMarkdownToAsciidoc(content: string): string { @@ -211,11 +210,15 @@ function convertMarkdownToAsciidoc(content: string): string {
/**
 * Converts plain text to AsciiDoc format.
 *
 * Windows line endings are normalized to `\n`, any run of two or more
 * newlines collapses to a single paragraph break (`\n\n`), and every remaining
 * single newline that sits between two non-newline characters becomes an
 * AsciiDoc hard line break (` +` placed before the newline).
 *
 * @param content - Raw plain-text content.
 * @returns The content with paragraph breaks preserved and single line breaks
 *          rewritten as AsciiDoc line continuations.
 */
function convertPlainTextToAsciidoc(content: string): string {
  return content
    .replace(/\r\n/g, '\n') // Normalize line endings
    .replace(/\n\n+/g, '\n\n') // Normalize multiple newlines to double
    // The following character is only asserted (lookahead), not consumed, so it
    // can still serve as the leading character of the next match. Consuming it
    // would skip every second break in runs of one-character lines ("a\nb\nc").
    .replace(/([^\n])\n(?=[^\n])/g, '$1 +\n');
}
/**
@ -254,10 +257,13 @@ function processWikilinks(content: string, linkBaseURL: string): string { @@ -254,10 +257,13 @@ function processWikilinks(content: string, linkBaseURL: string): string {
/**
 * Processes nostr: addresses.
 *
 * Rewrites each bare `nostr:<bech32>` reference as an AsciiDoc link macro of
 * the form `link:nostr:<bech32>[<bech32>]`.
 * Recognized bech32 prefixes: npub, nprofile, nevent, naddr, note.
 *
 * References that are already the target of a `link:` macro are left alone, so
 * running the function over already-processed content does not produce
 * `link:link:nostr:...`.
 *
 * @param content - AsciiDoc content that may contain nostr: addresses.
 * @param linkBaseURL - Accepted for signature parity with callers; not read by
 *                      this function.
 * @returns The content with bare nostr: addresses converted to link macros.
 */
function processNostrAddresses(content: string, linkBaseURL: string): string {
  // Bech32 shape: human-readable prefix + separator "1" + data part
  // (at least 6 characters). The lookbehind skips addresses that are
  // already wrapped in a link: macro.
  const nostrPattern = /(?<!link:)nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
  return content.replace(
    nostrPattern,
    (_match: string, bech32Id: string) => `link:nostr:${bech32Id}[${bech32Id}]`,
  );
}

18
src/detector.ts

@ -27,21 +27,19 @@ export function detectFormat(content: string): ContentFormat { @@ -27,21 +27,19 @@ export function detectFormat(content: string): ContentFormat {
}
}
// Check for Markdown indicators
// Check for Markdown indicators (more specific patterns to avoid false positives)
const markdownIndicators = [
'# ', // Heading
'## ', // Subheading
'```', // Code block
'**', // Bold
'*', // Italic or list
'- ', // List item
'![', // Image
'[', // Link
/^#{1,6}\s+/m, // Heading at start of line
/```[\s\S]*?```/, // Code block
/\*\*[^*]+\*\*/, // Bold text
/^[-*+]\s+/m, // List item at start of line
/!\[[^\]]*\]\([^)]+\)/, // Image syntax
/\[[^\]]+\]\([^)]+\)/, // Link syntax
];
let markdownScore = 0;
for (const indicator of markdownIndicators) {
if (content.includes(indicator)) {
if (indicator.test(content)) {
markdownScore++;
}
}

73
src/extractors/metadata.ts

@ -28,8 +28,8 @@ function extractNostrLinks(content: string): NostrLink[] { @@ -28,8 +28,8 @@ function extractNostrLinks(content: string): NostrLink[] {
const nostrLinks: NostrLink[] = [];
const seen = new Set<string>();
// Extract nostr: prefixed links
const nostrMatches = content.match(/nostr:([a-z0-9]+[a-z0-9]{6,})/g) || [];
// Extract nostr: prefixed links (valid bech32 format)
const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || [];
nostrMatches.forEach(match => {
const id = match.substring(6); // Remove 'nostr:'
const type = getNostrType(id);
@ -79,20 +79,33 @@ function extractWikilinks(content: string): Wikilink[] { @@ -79,20 +79,33 @@ function extractWikilinks(content: string): Wikilink[] {
/**
 * Extract hashtags from content.
 *
 * Fenced code blocks, inline code spans and http(s) URLs are blanked out
 * before scanning, so `#` fragments inside them are not reported. Tags are
 * lower-cased and de-duplicated; the result keeps first-occurrence order.
 *
 * @param content - Raw event content.
 * @returns Unique lower-cased hashtag names without the leading `#`.
 */
function extractHashtags(content: string): string[] {
  // Each stripped span is replaced by a space rather than removed outright:
  // deleting it would glue the surrounding text together and could either
  // extend a tag ("#tag`x`more" -> "#tagmore") or hide one ("a`x`#b" -> "a#b").
  const searchable = content
    .replace(/```[\s\S]*?```/g, ' ') // Fenced code blocks
    .replace(/`[^`]+`/g, ' ') // Inline code
    .replace(/https?:\/\/[^\s<>"']+/g, ' '); // URLs (including #fragments)

  // \B is a NON-word-boundary: "#" must not directly follow a word character,
  // which rejects things like "foo#bar" while accepting "#tag" after
  // whitespace, punctuation or at the start of the text.
  const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g;

  // A Set iterates in insertion order, giving de-duplication for free.
  const seen = new Set<string>();
  for (const match of searchable.matchAll(hashtagPattern)) {
    const tag = match[1];
    if (tag !== undefined) {
      seen.add(tag.toLowerCase());
    }
  }
  return [...seen];
}
@ -104,12 +117,11 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string @@ -104,12 +117,11 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string
const links: Array<{ url: string; text: string; isExternal: boolean }> = [];
const seen = new Set<string>();
// Extract markdown links: [text](url)
const markdownLinks = content.match(/\[([^\]]+)\]\(([^)]+)\)/g) || [];
markdownLinks.forEach(match => {
const linkMatch = match.match(/\[([^\]]+)\]\(([^)]+)\)/);
if (linkMatch) {
const [, text, url] = linkMatch;
// Extract markdown links: [text](url) - optimized to avoid double matching
const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
let markdownMatch;
while ((markdownMatch = markdownLinkPattern.exec(content)) !== null) {
const [, text, url] = markdownMatch;
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
@ -119,14 +131,12 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string @@ -119,14 +131,12 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string
});
}
}
});
// Extract asciidoc links: link:url[text]
const asciidocLinks = content.match(/link:([^\[]+)\[([^\]]+)\]/g) || [];
asciidocLinks.forEach(match => {
const linkMatch = match.match(/link:([^\[]+)\[([^\]]+)\]/);
if (linkMatch) {
const [, url, text] = linkMatch;
// Extract asciidoc links: link:url[text] - optimized to avoid double matching
const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g;
let asciidocMatch;
while ((asciidocMatch = asciidocLinkPattern.exec(content)) !== null) {
const [, url, text] = asciidocMatch;
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
@ -136,7 +146,6 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string @@ -136,7 +146,6 @@ function extractLinks(content: string, linkBaseURL: string): Array<{ url: string
});
}
}
});
// Extract raw URLs (basic pattern)
const urlPattern = /https?:\/\/[^\s<>"']+/g;
@ -162,29 +171,31 @@ function extractMedia(content: string): string[] { @@ -162,29 +171,31 @@ function extractMedia(content: string): string[] {
const media: string[] = [];
const seen = new Set<string>();
// Extract markdown images: ![alt](url)
const imageMatches = content.match(/!\[[^\]]*\]\(([^)]+)\)/g) || [];
imageMatches.forEach(match => {
const url = match.match(/!\[[^\]]*\]\(([^)]+)\)/)?.[1];
// Extract markdown images: ![alt](url) - optimized to avoid double matching
const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g;
let markdownImageMatch;
while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) {
const url = markdownImageMatch[1];
if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url);
seen.add(url);
}
}
});
}
// Extract asciidoc images: image::url[alt]
const asciidocImageMatches = content.match(/image::([^\[]+)\[/g) || [];
asciidocImageMatches.forEach(match => {
const url = match.match(/image::([^\[]+)\[/)?.[1];
// Extract asciidoc images: image::url[alt] - optimized to avoid double matching
const asciidocImagePattern = /image::([^\[]+)\[/g;
let asciidocImageMatch;
while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) {
const url = asciidocImageMatch[1];
if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url);
seen.add(url);
}
}
});
}
// Extract raw image/video URLs
const urlPattern = /https?:\/\/[^\s<>"']+/g;

13
src/processors/asciidoc.ts

@ -120,9 +120,18 @@ export async function processAsciidoc( @@ -120,9 +120,18 @@ export async function processAsciidoc(
media: [],
};
} catch (error) {
// Fallback to plain text
// Fallback to plain text with error logging
const errorMessage = error instanceof Error ? error.message : String(error);
// Use process.stderr.write for Node.js compatibility instead of console.error
if (typeof process !== 'undefined' && process.stderr) {
process.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
}
// Escape HTML in content for safe display
const escapedContent = sanitizeHTML(content);
return {
content: `<p>${sanitizeHTML(content)}</p>`,
content: `<p>${escapedContent}</p>`,
tableOfContents: '',
hasLaTeX: false,
hasMusicalNotation: false,

13
src/processors/html-postprocess.ts

@ -19,7 +19,16 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}): @@ -19,7 +19,16 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}):
// Convert hashtag links to HTML
processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
return `<a href="/notes?t=${normalizedHashtag}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${displayText}</a>`;
// URL encode the hashtag to prevent XSS
const encodedHashtag = encodeURIComponent(normalizedHashtag);
// HTML escape the display text
const escapedDisplay = displayText
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
return `<a href="/notes?t=${encodedHashtag}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${escapedDisplay}</a>`;
});
// Convert wikilink:dtag[display] format to HTML
@ -105,7 +114,7 @@ function processImages(html: string): string { @@ -105,7 +114,7 @@ function processImages(html: string): string {
let updatedAttributes = attributes;
if (updatedAttributes.match(/class=["']/i)) {
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => {
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
const newClasses = cleanedClasses
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`

57
src/processors/html-utils.ts

@ -32,14 +32,14 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri @@ -32,14 +32,14 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri
return { toc: '', contentWithoutTOC: html };
}
// Find the matching closing tag by counting div tags
// Find the matching closing tag by counting div/nav tags
const searchStart = tocStartIdx + tocStartTag.length;
let depth = 1;
let i = searchStart;
while (i < html.length && depth > 0) {
// Look for opening or closing div/nav tags
if (i + 4 < html.length && html.substring(i, i + 4) === '<div') {
if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<div') {
// Check if it's a closing tag
if (i + 5 < html.length && html[i + 4] === '/') {
depth--;
@ -47,25 +47,35 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri @@ -47,25 +47,35 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri
if (closeIdx === -1) break;
i = closeIdx + 1;
} else {
// Opening tag - find the end
// Opening tag - find the end (handle attributes and self-closing)
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
// Check if it's self-closing
if (html[closeIdx - 1] !== '/') {
// Check if it's self-closing (look for /> before the >)
const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++;
}
i = closeIdx + 1;
}
} else if (i + 5 < html.length && html.substring(i, i + 5) === '</div') {
} else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</div') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
i = closeIdx + 1;
} else if (i + 5 < html.length && html.substring(i, i + 5) === '</nav') {
} else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</nav') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
i = closeIdx + 1;
} else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<nav') {
// Handle opening nav tags
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1) break;
const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++;
}
i = closeIdx + 1;
} else {
i++;
}
@ -119,17 +129,32 @@ export function sanitizeHTML(html: string): string { @@ -119,17 +129,32 @@ export function sanitizeHTML(html: string): string {
/**
* Processes HTML links to add target="_blank" to external links
* This function is available for use but not currently called automatically.
* It can be used in post-processing if needed.
*/
export function processLinks(html: string, linkBaseURL: string): string {
// Extract domain from linkBaseURL for comparison
let linkBaseDomain = '';
if (linkBaseURL) {
try {
// Use URL constructor if available (Node.js 10+)
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const URLConstructor = (globalThis as any).URL;
if (URLConstructor) {
const url = new URLConstructor(linkBaseURL);
linkBaseDomain = url.hostname;
} else {
throw new Error('URL not available');
}
} catch {
// Fallback to simple string parsing if URL constructor fails
const url = linkBaseURL.replace(/^https?:\/\//, '');
const parts = url.split('/');
if (parts.length > 0) {
linkBaseDomain = parts[0];
}
}
}
// Regex to match <a> tags with href attributes
const linkRegex = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g;
@ -140,10 +165,26 @@ export function processLinks(html: string, linkBaseURL: string): string { @@ -140,10 +165,26 @@ export function processLinks(html: string, linkBaseURL: string): string {
if (isExternal) {
// Check if it's pointing to our own domain
if (linkBaseDomain && href.includes(linkBaseDomain)) {
if (linkBaseDomain) {
try {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const URLConstructor = (globalThis as any).URL;
if (URLConstructor) {
const hrefUrl = new URLConstructor(href);
if (hrefUrl.hostname === linkBaseDomain) {
// Same domain - open in same tab (remove any existing target attribute)
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
} else {
throw new Error('URL not available');
}
} catch {
// If URL parsing fails, use simple string check
if (href.includes(linkBaseDomain)) {
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
}
}
// External link - add target="_blank" and rel="noopener noreferrer" if not already present
if (!match.includes('target=')) {

1
tsconfig.json

@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
"target": "ES2020",
"module": "commonjs",
"lib": ["ES2020"],
"types": ["node"],
"outDir": "./dist",
"rootDir": "./src",
"strict": true,

Loading…
Cancel
Save