45 changed files with 1687 additions and 16369 deletions
@ -1,72 +0,0 @@
@@ -1,72 +0,0 @@
|
||||
|
||||
|
||||
This is a test unordered list with mixed bullets: |
||||
|
||||
* First item with a number 2. in it |
||||
* Second item |
||||
* Third item |
||||
* Indented item |
||||
* Indented item |
||||
* Fourth item |
||||
|
||||
|
||||
Another unordered list: |
||||
|
||||
* 1st item |
||||
* 2nd item |
||||
* third item containing _italic_ text |
||||
* indented item |
||||
* second indented item |
||||
* fourth item |
||||
|
||||
|
||||
This is a test ordered list with indented items: |
||||
|
||||
. First item |
||||
. Second item |
||||
. Third item |
||||
. Indented item |
||||
. Indented item |
||||
. Fourth item |
||||
|
||||
|
||||
Ordered list where everything has the same number: |
||||
|
||||
. First item |
||||
. Second item |
||||
. Third item |
||||
. Fourth item |
||||
|
||||
|
||||
Ordered list that is wrongly numbered: |
||||
|
||||
. First item |
||||
. Second item |
||||
. Third item |
||||
. Fourth item |
||||
|
||||
|
||||
This is a mixed list with indented items: |
||||
|
||||
. First item |
||||
. Second item |
||||
. Third item |
||||
|
||||
* Indented item |
||||
* Indented item |
||||
|
||||
. Fourth item |
||||
|
||||
|
||||
This is another mixed list with indented items: |
||||
|
||||
* First item |
||||
* Second item |
||||
* Third item |
||||
|
||||
. Indented item |
||||
. Indented item |
||||
|
||||
* Fourth item |
||||
|
||||
|
||||
@ -1,27 +0,0 @@
@@ -1,27 +0,0 @@
|
||||
import { convertToAsciidoc } from './src/converters/to-asciidoc'; |
||||
import { detectFormat } from './src/detector'; |
||||
import * as fs from 'fs'; |
||||
import * as path from 'path'; |
||||
|
||||
// Read just the list section from markdown test doc
|
||||
const markdownContent = fs.readFileSync( |
||||
path.join(__dirname, 'markdown_testdoc.md'), |
||||
'utf-8' |
||||
); |
||||
|
||||
// Extract just the list sections
|
||||
const listSection = markdownContent.split('## Bullet list')[1]?.split('##')[0] || markdownContent; |
||||
|
||||
console.log('=== ORIGINAL MARKDOWN ==='); |
||||
console.log(listSection); |
||||
console.log('\n=== DETECTED FORMAT ==='); |
||||
const format = detectFormat(listSection); |
||||
console.log(format); |
||||
|
||||
console.log('\n=== CONVERTED ASCIIDOC ==='); |
||||
const asciidoc = convertToAsciidoc(listSection, format, '', {}); |
||||
console.log(asciidoc); |
||||
|
||||
// Write to file for inspection
|
||||
fs.writeFileSync(path.join(__dirname, 'debug-asciidoc-output.adoc'), asciidoc); |
||||
console.log('\n=== Written to debug-asciidoc-output.adoc ==='); |
||||
@ -1,55 +0,0 @@
@@ -1,55 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/** |
||||
* Example usage of gc-parser |
||||
* This can be called from Go or used directly in Node.js |
||||
*/ |
||||
|
||||
const { Parser, defaultOptions } = require('./dist/index.js'); |
||||
|
||||
async function main() { |
||||
// Create parser with default options
|
||||
const opts = defaultOptions(); |
||||
opts.linkBaseURL = process.env.LINK_BASE_URL || 'https://example.com'; |
||||
|
||||
const parser = new Parser(opts); |
||||
|
||||
// Get content from command line argument or stdin
|
||||
let content = ''; |
||||
if (process.argv[2]) { |
||||
content = process.argv[2]; |
||||
} else { |
||||
// Read from stdin
|
||||
const readline = require('readline'); |
||||
const rl = readline.createInterface({ |
||||
input: process.stdin, |
||||
output: process.stdout, |
||||
terminal: false |
||||
}); |
||||
|
||||
for await (const line of rl) { |
||||
content += line + '\n'; |
||||
} |
||||
} |
||||
|
||||
if (!content) { |
||||
console.error('No content provided'); |
||||
process.exit(1); |
||||
} |
||||
|
||||
try { |
||||
const result = await parser.process(content); |
||||
|
||||
// Output as JSON for easy parsing
|
||||
console.log(JSON.stringify(result, null, 2)); |
||||
} catch (error) { |
||||
console.error('Error processing content:', error); |
||||
process.exit(1); |
||||
} |
||||
} |
||||
|
||||
if (require.main === module) { |
||||
main(); |
||||
} |
||||
|
||||
module.exports = { main }; |
||||
@ -1,2 +0,0 @@
@@ -1,2 +0,0 @@
|
||||
export {}; |
||||
//# sourceMappingURL=generate-test-report.d.ts.map
|
||||
@ -1 +0,0 @@
@@ -1 +0,0 @@
|
||||
{"version":3,"file":"generate-test-report.d.ts","sourceRoot":"","sources":["generate-test-report.ts"],"names":[],"mappings":""} |
||||
@ -1,91 +0,0 @@
@@ -1,91 +0,0 @@
|
||||
"use strict"; |
||||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { |
||||
if (k2 === undefined) k2 = k; |
||||
var desc = Object.getOwnPropertyDescriptor(m, k); |
||||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { |
||||
desc = { enumerable: true, get: function() { return m[k]; } }; |
||||
} |
||||
Object.defineProperty(o, k2, desc); |
||||
}) : (function(o, m, k, k2) { |
||||
if (k2 === undefined) k2 = k; |
||||
o[k2] = m[k]; |
||||
})); |
||||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { |
||||
Object.defineProperty(o, "default", { enumerable: true, value: v }); |
||||
}) : function(o, v) { |
||||
o["default"] = v; |
||||
}); |
||||
var __importStar = (this && this.__importStar) || (function () { |
||||
var ownKeys = function(o) { |
||||
ownKeys = Object.getOwnPropertyNames || function (o) { |
||||
var ar = []; |
||||
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; |
||||
return ar; |
||||
}; |
||||
return ownKeys(o); |
||||
}; |
||||
return function (mod) { |
||||
if (mod && mod.__esModule) return mod; |
||||
var result = {}; |
||||
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); |
||||
__setModuleDefault(result, mod); |
||||
return result; |
||||
}; |
||||
})(); |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
const parser_1 = require("./src/parser"); |
||||
const report_generator_1 = require("./src/utils/report-generator"); |
||||
const fs = __importStar(require("fs")); |
||||
const path = __importStar(require("path")); |
||||
/** |
||||
* Standalone script to generate HTML test report |
||||
* Run with: npm run test:report |
||||
*/ |
||||
async function main() { |
||||
console.log('📝 Generating test report...\n'); |
||||
// Initialize parser
|
||||
const parser = new parser_1.Parser({ |
||||
linkBaseURL: 'https://example.com', |
||||
wikilinkUrl: '/events?d={dtag}', |
||||
hashtagUrl: '/notes?t={topic}', |
||||
}); |
||||
// Read test documents
|
||||
const markdownPath = path.join(__dirname, 'markdown_testdoc.md'); |
||||
const asciidocPath = path.join(__dirname, 'asciidoc_testdoc.adoc'); |
||||
if (!fs.existsSync(markdownPath)) { |
||||
console.error(`❌ Error: ${markdownPath} not found`); |
||||
process.exit(1); |
||||
} |
||||
if (!fs.existsSync(asciidocPath)) { |
||||
console.error(`❌ Error: ${asciidocPath} not found`); |
||||
process.exit(1); |
||||
} |
||||
const markdownContent = fs.readFileSync(markdownPath, 'utf-8'); |
||||
const asciidocContent = fs.readFileSync(asciidocPath, 'utf-8'); |
||||
console.log('📄 Parsing markdown document...'); |
||||
const markdownResult = await parser.process(markdownContent); |
||||
console.log('📄 Parsing asciidoc document...'); |
||||
const asciidocResult = await parser.process(asciidocContent); |
||||
console.log('🎨 Generating HTML report...'); |
||||
const htmlReport = (0, report_generator_1.generateHTMLReport)({ |
||||
markdown: { |
||||
original: markdownContent, |
||||
result: markdownResult, |
||||
}, |
||||
asciidoc: { |
||||
original: asciidocContent, |
||||
result: asciidocResult, |
||||
}, |
||||
}); |
||||
// Write HTML report to file
|
||||
const reportPath = path.join(__dirname, 'test-report.html'); |
||||
fs.writeFileSync(reportPath, htmlReport, 'utf-8'); |
||||
console.log(`\n✅ Test report generated: ${reportPath}`); |
||||
console.log(` Open this file in your browser to view the results.\n`); |
||||
} |
||||
// Run the script
|
||||
main().catch((error) => { |
||||
console.error('❌ Error generating test report:', error); |
||||
process.exit(1); |
||||
}); |
||||
//# sourceMappingURL=generate-test-report.js.map
|
||||
@ -1 +0,0 @@
@@ -1 +0,0 @@
|
||||
{"version":3,"file":"generate-test-report.js","sourceRoot":"","sources":["generate-test-report.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,yCAAsC;AACtC,mEAA8E;AAC9E,uCAAyB;AACzB,2CAA6B;AAE7B;;;GAGG;AAEH,KAAK,UAAU,IAAI;IACjB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;IAE9C,oBAAoB;IACpB,MAAM,MAAM,GAAG,IAAI,eAAM,CAAC;QACxB,WAAW,EAAE,qBAAqB;QAClC,WAAW,EAAE,kBAAkB;QAC/B,UAAU,EAAE,kBAAkB;KAC/B,CAAC,CAAC;IAEH,sBAAsB;IACtB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,qBAAqB,CAAC,CAAC;IACjE,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,uBAAuB,CAAC,CAAC;IAEnE,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,YAAY,YAAY,YAAY,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,YAAY,YAAY,YAAY,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,eAAe,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,MAAM,eAAe,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAE/D,OAAO,CAAC,GAAG,CAAC,iCAAiC,CAAC,CAAC;IAC/C,MAAM,cAAc,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7D,OAAO,CAAC,GAAG,CAAC,iCAAiC,CAAC,CAAC;IAC/C,MAAM,cAAc,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7D,OAAO,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC;IAC5C,MAAM,UAAU,GAAG,IAAA,qCAAkB,EAAC;QACpC,QAAQ,EAAE;YACR,QAAQ,EAAE,eAAe;YACzB,MAAM,EAAE,cAAc;SACvB;QACD,QAAQ,EAAE;YACR,QAAQ,EAAE,eAAe;YACzB,MAAM,EAAE,cAAc;SACvB;KACF,CAAC,CAAC;IAEH,4BAA4B;IAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,kBAAkB,CAAC,CAAC;IAC5D,EAAE,CAAC,aAAa,CAAC,UAAU,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC;IAElD,OAAO,CAAC,GAAG,CAAC,8BAA8B,UAAU,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;AAC1E,CAAC;AAED,iBAAiB;AACjB,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACrB,OAAO,CAAC,KAAK,CAAC,iCAAiC,EAAE,KAAK,CAAC,CAAC;IACxD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"} |
||||
@ -1,71 +0,0 @@
@@ -1,71 +0,0 @@
|
||||
// Import from source files - this script should be run with ts-node or similar
|
||||
// from the project root, not from dist/
|
||||
import { Parser } from './src/parser'; |
||||
import { generateHTMLReport } from './src/utils/report-generator'; |
||||
import * as fs from 'fs'; |
||||
import * as path from 'path'; |
||||
|
||||
/** |
||||
* Standalone script to generate HTML test report |
||||
* Run with: npm run test:report |
||||
*/ |
||||
|
||||
async function main() { |
||||
console.log('📝 Generating test report...\n'); |
||||
|
||||
// Initialize parser
|
||||
const parser = new Parser({ |
||||
linkBaseURL: 'https://example.com', |
||||
wikilinkUrl: '/events?d={dtag}', |
||||
hashtagUrl: '/notes?t={topic}', |
||||
}); |
||||
|
||||
// Read test documents from project root
|
||||
const baseDir = __dirname.includes('dist') ? path.join(__dirname, '..') : __dirname; |
||||
const markdownPath = path.join(baseDir, 'markdown_testdoc.md'); |
||||
const asciidocPath = path.join(baseDir, 'asciidoc_testdoc.adoc'); |
||||
|
||||
if (!fs.existsSync(markdownPath)) { |
||||
console.error(`❌ Error: ${markdownPath} not found`); |
||||
process.exit(1); |
||||
} |
||||
|
||||
if (!fs.existsSync(asciidocPath)) { |
||||
console.error(`❌ Error: ${asciidocPath} not found`); |
||||
process.exit(1); |
||||
} |
||||
|
||||
const markdownContent = fs.readFileSync(markdownPath, 'utf-8'); |
||||
const asciidocContent = fs.readFileSync(asciidocPath, 'utf-8'); |
||||
|
||||
console.log('📄 Parsing markdown document...'); |
||||
const markdownResult = await parser.process(markdownContent); |
||||
|
||||
console.log('📄 Parsing asciidoc document...'); |
||||
const asciidocResult = await parser.process(asciidocContent); |
||||
|
||||
console.log('🎨 Generating HTML report...'); |
||||
const htmlReport = generateHTMLReport({ |
||||
markdown: { |
||||
original: markdownContent, |
||||
result: markdownResult, |
||||
}, |
||||
asciidoc: { |
||||
original: asciidocContent, |
||||
result: asciidocResult, |
||||
}, |
||||
}); |
||||
|
||||
// Write HTML report to file (adjust path based on where script is run from)
|
||||
const reportPath = path.join(baseDir, 'test-report.html'); |
||||
fs.writeFileSync(reportPath, htmlReport, 'utf-8'); |
||||
|
||||
console.log(`\n✅ Test report generated: ${reportPath}`); |
||||
console.log(` Open this file in your browser to view the results.\n`); |
||||
} |
||||
|
||||
// Run the script
|
||||
main().catch((error) => { |
||||
console.error('❌ Error generating test report:', error); |
||||
process.exit(1); |
||||
}); |
||||
@ -1,24 +1,23 @@
@@ -1,24 +1,23 @@
|
||||
module.exports = { |
||||
preset: 'ts-jest', |
||||
testEnvironment: 'node', |
||||
roots: ['<rootDir>'], |
||||
testMatch: ['**/*.test.ts'], |
||||
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'], |
||||
collectCoverageFrom: [ |
||||
'src/**/*.ts', |
||||
'!src/**/*.d.ts', |
||||
], |
||||
roots: ['<rootDir>/src'], |
||||
testMatch: ['**/__tests__/**/*.test.ts', '**/?(*.)+(spec|test).ts'], |
||||
testPathIgnorePatterns: ['/node_modules/', '/dist/', 'asciidoc.test.ts'], |
||||
transform: { |
||||
'^.+\\.ts$': ['ts-jest', { |
||||
tsconfig: 'tsconfig.test.json', |
||||
tsconfig: { |
||||
esModuleInterop: true, |
||||
}, |
||||
}], |
||||
'^.+\\.js$': 'babel-jest', |
||||
}, |
||||
// Don't transform AsciiDoctor packages - they use Opal runtime which breaks with Jest transformation
|
||||
// AsciiDoctor uses CommonJS and Opal runtime, so we need to exclude it from transformation
|
||||
// The pattern matches paths to ignore (not transform)
|
||||
transformIgnorePatterns: [ |
||||
'node_modules/(?!(@asciidoctor)/)', |
||||
moduleFileExtensions: ['ts', 'js', 'json'], |
||||
moduleNameMapper: { |
||||
'^marked$': '<rootDir>/node_modules/marked/lib/marked.umd.js', |
||||
}, |
||||
collectCoverageFrom: [ |
||||
'src/**/*.ts', |
||||
'!src/**/*.d.ts', |
||||
], |
||||
// Ensure CommonJS modules are handled correctly
|
||||
moduleNameMapper: {}, |
||||
}; |
||||
|
||||
@ -0,0 +1,353 @@
@@ -0,0 +1,353 @@
|
||||
import { Parser } from '../parser'; |
||||
import { readFileSync, writeFileSync, mkdirSync } from 'fs'; |
||||
import { join } from 'path'; |
||||
|
||||
/** |
||||
* Simple test runner for AsciiDoc tests (separate from Jest due to Opal compatibility issues) |
||||
*/ |
||||
async function runAsciiDocTests() { |
||||
console.log('Running AsciiDoc tests...\n'); |
||||
|
||||
const asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8'); |
||||
const parser = new Parser({ |
||||
linkBaseURL: 'https://example.com', |
||||
enableNostrAddresses: true, |
||||
wikilinkUrl: '/events?d={dtag}', |
||||
hashtagUrl: '/hashtag/{topic}' |
||||
}); |
||||
|
||||
let passed = 0; |
||||
let failed = 0; |
||||
const failures: string[] = []; |
||||
|
||||
const testPromises: Promise<void>[] = []; |
||||
|
||||
function test(name: string, fn: () => void | Promise<void>) { |
||||
const testPromise = (async () => { |
||||
try { |
||||
const result = fn(); |
||||
if (result instanceof Promise) { |
||||
await result; |
||||
} |
||||
passed++; |
||||
console.log(`✓ ${name}`); |
||||
} catch (error: any) { |
||||
failed++; |
||||
failures.push(`${name}: ${error.message}`); |
||||
console.error(`✗ ${name}: ${error.message}`); |
||||
} |
||||
})(); |
||||
testPromises.push(testPromise); |
||||
} |
||||
|
||||
function expect(actual: any) { |
||||
return { |
||||
toBeDefined: () => { |
||||
if (actual === undefined || actual === null) { |
||||
throw new Error(`Expected value to be defined, but got ${actual}`); |
||||
} |
||||
}, |
||||
toBe: (expected: any) => { |
||||
if (actual !== expected) { |
||||
throw new Error(`Expected ${expected}, but got ${actual}`); |
||||
} |
||||
}, |
||||
toContain: (substring: string) => { |
||||
if (typeof actual === 'string' && !actual.includes(substring)) { |
||||
throw new Error(`Expected string to contain "${substring}"`); |
||||
} |
||||
}, |
||||
toMatch: (regex: RegExp) => { |
||||
if (typeof actual === 'string' && !regex.test(actual)) { |
||||
throw new Error(`Expected string to match ${regex}`); |
||||
} |
||||
}, |
||||
toHaveProperty: (prop: string) => { |
||||
if (!(prop in actual)) { |
||||
throw new Error(`Expected object to have property "${prop}"`); |
||||
} |
||||
}, |
||||
toBeGreaterThan: (value: number) => { |
||||
if (typeof actual !== 'number' || actual <= value) { |
||||
throw new Error(`Expected ${actual} to be greater than ${value}`); |
||||
} |
||||
}, |
||||
length: { |
||||
toBeGreaterThan: (value: number) => { |
||||
if (!Array.isArray(actual) || actual.length <= value) { |
||||
throw new Error(`Expected array length to be greater than ${value}, but got ${actual.length}`); |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
} |
||||
|
||||
// Run tests
|
||||
const result = await parser.process(asciidocContent); |
||||
|
||||
// Write HTML output to file for inspection
|
||||
const outputDir = join(__dirname, '../../test-output'); |
||||
try { |
||||
mkdirSync(outputDir, { recursive: true }); |
||||
} catch (e) { |
||||
// Directory might already exist
|
||||
} |
||||
|
||||
const htmlOutput = `<!DOCTYPE html>
|
||||
<html lang="en"> |
||||
<head> |
||||
<meta charset="UTF-8"> |
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
||||
<meta name="referrer" content="strict-origin-when-cross-origin"> |
||||
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'unsafe-inline' 'unsafe-eval' https://www.youtube.com https://s.ytimg.com https://www.gstatic.com https://*.googlevideo.com; frame-src https://www.youtube.com https://youtube.com https://open.spotify.com https://*.googlevideo.com; style-src 'unsafe-inline'; img-src 'self' data: https:; media-src 'self' https:; connect-src https:; child-src https://www.youtube.com https://youtube.com;"> |
||||
<title>AsciiDoc Test Output</title> |
||||
<style> |
||||
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; } |
||||
.hashtag { color: #1da1f2; font-weight: 500; } |
||||
.wikilink { color: #0066cc; text-decoration: underline; } |
||||
.nostr-link { color: #8b5cf6; text-decoration: underline; } |
||||
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; } |
||||
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; } |
||||
.line-through { text-decoration: line-through; } |
||||
.highlight { background-color: #ffeb3b; padding: 2px 4px; border-radius: 3px; } |
||||
.bare-image { max-width: 100%; width: auto; height: auto; margin: 10px 0; display: block; } |
||||
.bare-video, .bare-audio { width: 100%; max-width: 800px; margin: 10px 0; display: block; } |
||||
.youtube-embed, .spotify-embed { max-width: 100%; margin: 10px 0; border-radius: 8px; display: block; } |
||||
.youtube-embed { width: 100%; max-width: 640px; height: auto; aspect-ratio: 16/9; border: 0; display: block; } |
||||
.spotify-embed { width: 100%; max-width: 800px; } |
||||
/* Table styles */ |
||||
table { border-collapse: collapse; width: 100%; margin: 1em 0; } |
||||
table thead { background-color: #f2f2f2; } |
||||
table th { font-weight: bold; padding: 8px; border: 1px solid #ddd; background-color: #f2f2f2; } |
||||
table td { padding: 8px; border: 1px solid #ddd; } |
||||
/* Alignment classes - AsciiDoc uses halign-* and valign-* classes */ |
||||
.halign-left { text-align: left !important; } |
||||
.halign-center { text-align: center !important; } |
||||
.halign-right { text-align: right !important; } |
||||
.valign-top { vertical-align: top !important; } |
||||
.valign-middle { vertical-align: middle !important; } |
||||
.valign-bottom { vertical-align: bottom !important; } |
||||
/* Also handle tableblock classes */ |
||||
.tableblock.halign-left { text-align: left !important; } |
||||
.tableblock.halign-center { text-align: center !important; } |
||||
.tableblock.halign-right { text-align: right !important; } |
||||
.tableblock.valign-top { vertical-align: top !important; } |
||||
.tableblock.valign-middle { vertical-align: middle !important; } |
||||
.tableblock.valign-bottom { vertical-align: bottom !important; } |
||||
/* Task list styles */ |
||||
.checklist { list-style: none; padding-left: 0; } |
||||
.checklist li { padding-left: 1.5em; position: relative; margin: 0.5em 0; } |
||||
.checklist li i.fa-check-square-o::before { content: "☑ "; font-style: normal; font-family: sans-serif; } |
||||
.checklist li i.fa-square-o::before { content: "☐ "; font-style: normal; font-family: sans-serif; } |
||||
.checklist li i { position: absolute; left: 0; font-style: normal; } |
||||
/* Fallback if Font Awesome doesn't load */ |
||||
.checklist li i.fa-check-square-o { display: inline-block; width: 1em; } |
||||
.checklist li i.fa-check-square-o:before { content: "☑"; } |
||||
.checklist li i.fa-square-o { display: inline-block; width: 1em; } |
||||
.checklist li i.fa-square-o:before { content: "☐"; } |
||||
/* AsciiDoc specific styles */ |
||||
.sect1, .sect2, .sect3, .sect4, .sect5 { margin-top: 1.5em; margin-bottom: 1em; } |
||||
.paragraph { margin: 1em 0; } |
||||
table { border-collapse: collapse; width: 100%; margin: 1em 0; } |
||||
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; } |
||||
table th { background-color: #f2f2f2; } |
||||
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; } |
||||
</style> |
||||
</head> |
||||
<body> |
||||
<h1>AsciiDoc Test Document - Parsed Output</h1> |
||||
<hr> |
||||
${result.content} |
||||
<hr> |
||||
<h2>Metadata</h2> |
||||
<pre>${JSON.stringify({ |
||||
hasLaTeX: result.hasLaTeX, |
||||
hasMusicalNotation: result.hasMusicalNotation, |
||||
nostrLinks: result.nostrLinks, |
||||
wikilinks: result.wikilinks, |
||||
hashtags: result.hashtags, |
||||
links: result.links, |
||||
media: result.media |
||||
}, null, 2)}</pre> |
||||
</body> |
||||
</html>`;
|
||||
|
||||
const outputPath = join(outputDir, 'asciidoc-output.html'); |
||||
writeFileSync(outputPath, htmlOutput, 'utf-8'); |
||||
console.log(`\n📄 HTML output written to: ${outputPath}\n`); |
||||
|
||||
test('should parse AsciiDoc content', () => { |
||||
expect(result).toBeDefined(); |
||||
expect(result.content).toBeDefined(); |
||||
expect(typeof result.content).toBe('string'); |
||||
expect(result.content.length).toBeGreaterThan(0); |
||||
}); |
||||
|
||||
test('should have HTML content', () => { |
||||
expect(result.content).toContain('<'); |
||||
expect(result.content).toContain('>'); |
||||
}); |
||||
|
||||
test('should extract table of contents', () => { |
||||
expect(result.tableOfContents).toBeDefined(); |
||||
expect(typeof result.tableOfContents).toBe('string'); |
||||
}); |
||||
|
||||
test('should detect LaTeX', () => { |
||||
expect(result.hasLaTeX).toBeDefined(); |
||||
expect(typeof result.hasLaTeX).toBe('boolean'); |
||||
expect(result.hasLaTeX).toBe(true); |
||||
}); |
||||
|
||||
test('should detect musical notation', () => { |
||||
expect(result.hasMusicalNotation).toBeDefined(); |
||||
expect(typeof result.hasMusicalNotation).toBe('boolean'); |
||||
expect(result.hasMusicalNotation).toBe(true); |
||||
}); |
||||
|
||||
test('should extract nostr links', () => { |
||||
expect(result.nostrLinks).toBeDefined(); |
||||
expect(Array.isArray(result.nostrLinks)).toBe(true); |
||||
expect(result.nostrLinks.length).toBeGreaterThan(0); |
||||
|
||||
const nostrLink = result.nostrLinks[0]; |
||||
expect(nostrLink).toHaveProperty('type'); |
||||
expect(nostrLink).toHaveProperty('id'); |
||||
expect(nostrLink).toHaveProperty('text'); |
||||
expect(nostrLink).toHaveProperty('bech32'); |
||||
const validTypes = ['npub', 'nprofile', 'nevent', 'naddr', 'note']; |
||||
if (!validTypes.includes(nostrLink.type)) { |
||||
throw new Error(`Invalid nostr type: ${nostrLink.type}`); |
||||
} |
||||
}); |
||||
|
||||
test('should extract wikilinks', () => { |
||||
expect(result.wikilinks).toBeDefined(); |
||||
expect(Array.isArray(result.wikilinks)).toBe(true); |
||||
expect(result.wikilinks.length).toBeGreaterThan(0); |
||||
|
||||
const wikilink = result.wikilinks[0]; |
||||
expect(wikilink).toHaveProperty('dtag'); |
||||
expect(wikilink).toHaveProperty('display'); |
||||
expect(wikilink).toHaveProperty('original'); |
||||
}); |
||||
|
||||
test('should extract hashtags', () => { |
||||
expect(result.hashtags).toBeDefined(); |
||||
expect(Array.isArray(result.hashtags)).toBe(true); |
||||
expect(result.hashtags.length).toBeGreaterThan(0); |
||||
|
||||
result.hashtags.forEach((tag: string) => { |
||||
if (tag.includes('#')) { |
||||
throw new Error(`Hashtag should not include #: ${tag}`); |
||||
} |
||||
}); |
||||
}); |
||||
|
||||
test('should extract regular links', () => { |
||||
expect(result.links).toBeDefined(); |
||||
expect(Array.isArray(result.links)).toBe(true); |
||||
|
||||
if (result.links.length > 0) { |
||||
const link = result.links[0]; |
||||
expect(link).toHaveProperty('url'); |
||||
expect(link).toHaveProperty('text'); |
||||
expect(link).toHaveProperty('isExternal'); |
||||
expect(typeof link.isExternal).toBe('boolean'); |
||||
} |
||||
}); |
||||
|
||||
test('should extract media URLs', () => { |
||||
expect(result.media).toBeDefined(); |
||||
expect(Array.isArray(result.media)).toBe(true); |
||||
}); |
||||
|
||||
test('should process nostr: addresses in HTML', () => { |
||||
const nostrAddresses = result.nostrLinks; |
||||
expect(nostrAddresses.length).toBeGreaterThan(0); |
||||
|
||||
nostrAddresses.forEach((link: any) => { |
||||
if (!result.content.includes(`data-nostr-type="${link.type}"`)) { |
||||
throw new Error(`Missing nostr type attribute for ${link.type}`); |
||||
} |
||||
if (!result.content.includes(`data-nostr-id="${link.bech32}"`)) { |
||||
throw new Error(`Missing nostr id attribute for ${link.bech32}`); |
||||
} |
||||
}); |
||||
}); |
||||
|
||||
test('should process wikilinks in HTML', () => { |
||||
const wikilinks = result.wikilinks; |
||||
expect(wikilinks.length).toBeGreaterThan(0); |
||||
|
||||
wikilinks.forEach((wikilink: any) => { |
||||
if (!result.content.includes(`class="wikilink"`)) { |
||||
throw new Error('Missing wikilink class'); |
||||
} |
||||
if (!result.content.includes(`data-dtag="${wikilink.dtag}"`)) { |
||||
throw new Error(`Missing dtag attribute for ${wikilink.dtag}`); |
||||
} |
||||
}); |
||||
}); |
||||
|
||||
test('should process hashtags in HTML', () => { |
||||
const hashtags = result.hashtags; |
||||
expect(hashtags.length).toBeGreaterThan(0); |
||||
|
||||
hashtags.forEach((tag: string) => { |
||||
if (!result.content.includes(`data-topic="${tag}"`)) { |
||||
throw new Error(`Missing topic attribute for ${tag}`); |
||||
} |
||||
if (!result.content.includes('class="hashtag"')) { |
||||
throw new Error('Missing hashtag class'); |
||||
} |
||||
}); |
||||
}); |
||||
|
||||
test('should contain expected content sections', () => { |
||||
if (!/Bullet list|bullet/i.test(result.content)) { |
||||
throw new Error('Missing bullet list section'); |
||||
} |
||||
if (!/Headers|header/i.test(result.content)) { |
||||
throw new Error('Missing headers section'); |
||||
} |
||||
if (!/Media and Links|media|links/i.test(result.content)) { |
||||
throw new Error('Missing media and links section'); |
||||
} |
||||
}); |
||||
|
||||
test('should return consistent structure', () => { |
||||
expect(result).toHaveProperty('content'); |
||||
expect(result).toHaveProperty('tableOfContents'); |
||||
expect(result).toHaveProperty('hasLaTeX'); |
||||
expect(result).toHaveProperty('hasMusicalNotation'); |
||||
expect(result).toHaveProperty('nostrLinks'); |
||||
expect(result).toHaveProperty('wikilinks'); |
||||
expect(result).toHaveProperty('hashtags'); |
||||
expect(result).toHaveProperty('links'); |
||||
expect(result).toHaveProperty('media'); |
||||
}); |
||||
|
||||
// Wait for all tests to complete
|
||||
await Promise.all(testPromises); |
||||
|
||||
// Print summary
|
||||
console.log(`\n${'='.repeat(50)}`); |
||||
console.log(`Tests passed: ${passed}`); |
||||
console.log(`Tests failed: ${failed}`); |
||||
|
||||
if (failures.length > 0) { |
||||
console.log('\nFailures:'); |
||||
failures.forEach(f => console.error(` - ${f}`)); |
||||
process.exit(1); |
||||
} else { |
||||
console.log('\nAll tests passed!'); |
||||
process.exit(0); |
||||
} |
||||
} |
||||
|
||||
// Run tests
|
||||
runAsciiDocTests().catch(error => { |
||||
console.error('Test runner error:', error); |
||||
process.exit(1); |
||||
}); |
||||
@ -0,0 +1,238 @@
@@ -0,0 +1,238 @@
|
||||
import { Parser } from '../parser'; |
||||
import { readFileSync, writeFileSync, mkdirSync } from 'fs'; |
||||
import { join } from 'path'; |
||||
|
||||
describe('Parser', () => { |
||||
let asciidocContent: string; |
||||
let markdownContent: string; |
||||
|
||||
beforeAll(() => { |
||||
asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8'); |
||||
markdownContent = readFileSync(join(__dirname, '../../markdown_testdoc.md'), 'utf-8'); |
||||
}); |
||||
|
||||
// AsciiDoc tests are run separately using a Node.js script (asciidoc.test.ts)
|
||||
// due to Jest/Opal runtime compatibility issues
|
||||
// Run with: npm run test:asciidoc
|
||||
|
||||
describe('Markdown Test Document', () => { |
||||
let result: any; |
||||
|
||||
beforeAll(async () => { |
||||
const parser = new Parser({ |
||||
linkBaseURL: 'https://example.com', |
||||
enableNostrAddresses: true, |
||||
wikilinkUrl: '/events?d={dtag}', |
||||
hashtagUrl: '/hashtag/{topic}' |
||||
}); |
||||
result = await parser.process(markdownContent); |
||||
|
||||
// Write HTML output to file for inspection
|
||||
const outputDir = join(__dirname, '../../test-output'); |
||||
try { |
||||
mkdirSync(outputDir, { recursive: true }); |
||||
} catch (e) { |
||||
// Directory might already exist
|
||||
} |
||||
|
||||
const htmlOutput = `<!DOCTYPE html>
|
||||
<html lang="en"> |
||||
<head> |
||||
<meta charset="UTF-8"> |
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
||||
<title>Markdown Test Output</title> |
||||
<style> |
||||
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; } |
||||
.hashtag { color: #1da1f2; font-weight: 500; } |
||||
.wikilink { color: #0066cc; text-decoration: underline; } |
||||
.nostr-link { color: #8b5cf6; text-decoration: underline; } |
||||
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; } |
||||
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; } |
||||
.bare-image, .bare-video, .bare-audio { max-width: 100%; margin: 10px 0; } |
||||
.bare-video, .bare-audio { width: 100%; max-width: 600px; } |
||||
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; } |
||||
table { border-collapse: collapse; width: 100%; margin: 1em 0; } |
||||
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; } |
||||
table th { background-color: #f2f2f2; } |
||||
</style> |
||||
</head> |
||||
<body> |
||||
<h1>Markdown Test Document - Parsed Output</h1> |
||||
<hr> |
||||
${result.content} |
||||
<hr> |
||||
<h2>Metadata</h2> |
||||
<pre>${JSON.stringify({ |
||||
frontmatter: result.frontmatter, |
||||
hasLaTeX: result.hasLaTeX, |
||||
hasMusicalNotation: result.hasMusicalNotation, |
||||
nostrLinks: result.nostrLinks, |
||||
wikilinks: result.wikilinks, |
||||
hashtags: result.hashtags, |
||||
links: result.links, |
||||
media: result.media |
||||
}, null, 2)}</pre> |
||||
</body> |
||||
</html>`;
|
||||
|
||||
const outputPath = join(outputDir, 'markdown-output.html'); |
||||
writeFileSync(outputPath, htmlOutput, 'utf-8'); |
||||
// Use console.info to ensure it shows in Jest output
|
||||
console.info(`\n📄 HTML output written to: ${outputPath}\n`); |
||||
}); |
||||
|
||||
it('should parse Markdown content', () => { |
||||
expect(result).toBeDefined(); |
||||
expect(result.content).toBeDefined(); |
||||
expect(typeof result.content).toBe('string'); |
||||
expect(result.content.length).toBeGreaterThan(0); |
||||
}); |
||||
|
||||
it('should have HTML content', () => { |
||||
expect(result.content).toContain('<'); |
||||
expect(result.content).toContain('>'); |
||||
}); |
||||
|
||||
it('should extract frontmatter', () => { |
||||
expect(result.frontmatter).toBeDefined(); |
||||
expect(typeof result.frontmatter).toBe('object'); |
||||
expect(result.frontmatter).toHaveProperty('author'); |
||||
expect(result.frontmatter.author).toBe('James Smith'); |
||||
expect(result.frontmatter).toHaveProperty('summary'); |
||||
expect(result.frontmatter.summary).toBe('This is a summary'); |
||||
}); |
||||
|
||||
it('should detect LaTeX', () => { |
||||
expect(result.hasLaTeX).toBeDefined(); |
||||
expect(typeof result.hasLaTeX).toBe('boolean'); |
||||
// The test doc has LaTeX, so it should be true
|
||||
expect(result.hasLaTeX).toBe(true); |
||||
}); |
||||
|
||||
it('should detect musical notation', () => { |
||||
expect(result.hasMusicalNotation).toBeDefined(); |
||||
expect(typeof result.hasMusicalNotation).toBe('boolean'); |
||||
}); |
||||
|
||||
it('should extract nostr links', () => { |
||||
expect(result.nostrLinks).toBeDefined(); |
||||
expect(Array.isArray(result.nostrLinks)).toBe(true); |
||||
expect(result.nostrLinks.length).toBeGreaterThan(0); |
||||
|
||||
// Check that nostr: addresses are extracted
|
||||
const nostrLink = result.nostrLinks[0]; |
||||
expect(nostrLink).toHaveProperty('type'); |
||||
expect(nostrLink).toHaveProperty('id'); |
||||
expect(nostrLink).toHaveProperty('text'); |
||||
expect(nostrLink).toHaveProperty('bech32'); |
||||
expect(['npub', 'nprofile', 'nevent', 'naddr', 'note']).toContain(nostrLink.type); |
||||
}); |
||||
|
||||
it('should extract wikilinks', () => { |
||||
expect(result.wikilinks).toBeDefined(); |
||||
expect(Array.isArray(result.wikilinks)).toBe(true); |
||||
expect(result.wikilinks.length).toBeGreaterThan(0); |
||||
|
||||
// Check wikilink structure
|
||||
const wikilink = result.wikilinks[0]; |
||||
expect(wikilink).toHaveProperty('dtag'); |
||||
expect(wikilink).toHaveProperty('display'); |
||||
expect(wikilink).toHaveProperty('original'); |
||||
}); |
||||
|
||||
it('should extract hashtags', () => { |
||||
expect(result.hashtags).toBeDefined(); |
||||
expect(Array.isArray(result.hashtags)).toBe(true); |
||||
expect(result.hashtags.length).toBeGreaterThan(0); |
||||
|
||||
// Hashtags should not include the # symbol
|
||||
result.hashtags.forEach((tag: string) => { |
||||
expect(tag).not.toContain('#'); |
||||
}); |
||||
}); |
||||
|
||||
it('should extract regular links', () => { |
||||
expect(result.links).toBeDefined(); |
||||
expect(Array.isArray(result.links)).toBe(true); |
||||
|
||||
if (result.links.length > 0) { |
||||
const link = result.links[0]; |
||||
expect(link).toHaveProperty('url'); |
||||
expect(link).toHaveProperty('text'); |
||||
expect(link).toHaveProperty('isExternal'); |
||||
expect(typeof link.isExternal).toBe('boolean'); |
||||
} |
||||
}); |
||||
|
||||
it('should extract media URLs', () => { |
||||
expect(result.media).toBeDefined(); |
||||
expect(Array.isArray(result.media)).toBe(true); |
||||
}); |
||||
|
||||
it('should process nostr: addresses in HTML', () => { |
||||
// Check that nostr: addresses are converted to links
|
||||
const nostrAddresses = result.nostrLinks; |
||||
expect(nostrAddresses.length).toBeGreaterThan(0); |
||||
|
||||
// Check that HTML contains links for nostr addresses
|
||||
nostrAddresses.forEach((link: any) => { |
||||
expect(result.content).toContain(`data-nostr-type="${link.type}"`); |
||||
expect(result.content).toContain(`data-nostr-id="${link.bech32}"`); |
||||
}); |
||||
}); |
||||
|
||||
it('should process wikilinks in HTML', () => { |
||||
// Check that wikilinks are converted to links
|
||||
const wikilinks = result.wikilinks; |
||||
expect(wikilinks.length).toBeGreaterThan(0); |
||||
|
||||
wikilinks.forEach((wikilink: any) => { |
||||
expect(result.content).toContain(`class="wikilink"`); |
||||
expect(result.content).toContain(`data-dtag="${wikilink.dtag}"`); |
||||
}); |
||||
}); |
||||
|
||||
it('should process hashtags in HTML', () => { |
||||
// Check that hashtags are processed
|
||||
const hashtags = result.hashtags; |
||||
expect(hashtags.length).toBeGreaterThan(0); |
||||
|
||||
hashtags.forEach((tag: string) => { |
||||
expect(result.content).toContain(`data-topic="${tag}"`); |
||||
expect(result.content).toMatch(new RegExp(`class="hashtag"`)); |
||||
}); |
||||
}); |
||||
|
||||
it('should contain expected content sections', () => { |
||||
// Check for some expected content from the test doc
|
||||
expect(result.content).toMatch(/Bullet list|bullet/i); |
||||
expect(result.content).toMatch(/Headers|header/i); |
||||
expect(result.content).toMatch(/Media and Links|media|links/i); |
||||
}); |
||||
|
||||
it('should have empty table of contents for markdown', () => { |
||||
// Markdown doesn't generate TOC by default
|
||||
expect(result.tableOfContents).toBeDefined(); |
||||
expect(typeof result.tableOfContents).toBe('string'); |
||||
}); |
||||
}); |
||||
|
||||
describe('Result structure validation', () => { |
||||
|
||||
it('should return consistent structure for Markdown', async () => { |
||||
const parser = new Parser(); |
||||
const result = await parser.process(markdownContent); |
||||
|
||||
// Check all required fields
|
||||
expect(result).toHaveProperty('content'); |
||||
expect(result).toHaveProperty('tableOfContents'); |
||||
expect(result).toHaveProperty('hasLaTeX'); |
||||
expect(result).toHaveProperty('hasMusicalNotation'); |
||||
expect(result).toHaveProperty('nostrLinks'); |
||||
expect(result).toHaveProperty('wikilinks'); |
||||
expect(result).toHaveProperty('hashtags'); |
||||
expect(result).toHaveProperty('links'); |
||||
expect(result).toHaveProperty('media'); |
||||
}); |
||||
}); |
||||
}); |
||||
@ -1,330 +0,0 @@
@@ -1,330 +0,0 @@
|
||||
import { ContentFormat } from '../types'; |
||||
|
||||
/**
 * Options accepted by `convertToAsciidoc`.
 */
export interface ConvertOptions {
  // When explicitly false, `nostr:` bech32 addresses are left untouched
  // instead of being converted to `link:` macros (the converter checks
  // `!== false`, so omitting the option enables the conversion).
  enableNostrAddresses?: boolean;
}
||||
|
||||
/** |
||||
* Converts content from various formats (Markdown, Wikipedia, Plain) to AsciiDoc |
||||
*
|
||||
* Processing order: |
||||
* 1. Convert special syntax (wikilinks, hashtags, nostr links) to placeholders |
||||
* 2. Process media URLs (YouTube, Spotify, video, audio) |
||||
* 3. Process images (Markdown and bare URLs) |
||||
* 4. Process links (Markdown and bare URLs) |
||||
* 5. Clean URLs (remove tracking parameters) |
||||
*/ |
||||
export function convertToAsciidoc( |
||||
content: string, |
||||
format: ContentFormat, |
||||
linkBaseURL?: string, |
||||
options: ConvertOptions = {} |
||||
): string { |
||||
let processed = content; |
||||
|
||||
// Step 1: Convert special syntax to placeholders (before other processing)
|
||||
processed = convertWikilinks(processed); |
||||
processed = convertHashtags(processed); |
||||
|
||||
if (options.enableNostrAddresses !== false) { |
||||
processed = convertNostrLinks(processed); |
||||
} |
||||
|
||||
// Step 2: Process media URLs (before link processing to avoid conflicts)
|
||||
processed = processMediaUrls(processed); |
||||
|
||||
// Step 3: Process images (before links to avoid conflicts)
|
||||
processed = processImages(processed, format); |
||||
|
||||
// Step 4: Process links (Markdown and bare URLs)
|
||||
processed = processLinks(processed, format); |
||||
|
||||
// Step 5: Convert format-specific syntax
|
||||
if (format === ContentFormat.Markdown) { |
||||
processed = convertMarkdownToAsciidoc(processed); |
||||
} else if (format === ContentFormat.Wikipedia) { |
||||
processed = convertWikipediaToAsciidoc(processed); |
||||
} |
||||
|
||||
return processed; |
||||
} |
||||
|
||||
/** |
||||
* Convert wikilinks [[target]] or [[target|display]] to WIKILINK:dtag|display |
||||
*/ |
||||
function convertWikilinks(content: string): string { |
||||
return content.replace(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g, (_match, target, display) => { |
||||
const dtag = normalizeDtag(target.trim()); |
||||
const displayText = display ? display.trim() : target.trim(); |
||||
return `WIKILINK:${dtag}|${displayText}`; |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Normalize dtag (lowercase, replace spaces with hyphens) |
||||
*/ |
||||
function normalizeDtag(dtag: string): string { |
||||
return dtag.toLowerCase().replace(/\s+/g, '-'); |
||||
} |
||||
|
||||
/** |
||||
* Convert hashtags #topic to hashtag:topic[topic] |
||||
* Skip hashtags in URLs, code blocks, and inline code |
||||
*/ |
||||
function convertHashtags(content: string): string { |
||||
// Protect code blocks
|
||||
const codeBlocks: string[] = []; |
||||
content = content.replace(/```[\s\S]*?```/g, (match) => { |
||||
const placeholder = `__CODEBLOCK_${codeBlocks.length}__`; |
||||
codeBlocks.push(match); |
||||
return placeholder; |
||||
}); |
||||
|
||||
// Protect inline code
|
||||
const inlineCode: string[] = []; |
||||
content = content.replace(/`[^`]+`/g, (match) => { |
||||
const placeholder = `__INLINECODE_${inlineCode.length}__`; |
||||
inlineCode.push(match); |
||||
return placeholder; |
||||
}); |
||||
|
||||
// Convert hashtags (not in URLs)
|
||||
content = content.replace(/(?<!https?:\/\/[^\s]*)#([a-zA-Z0-9_]+)/g, (_match, topic) => { |
||||
const normalized = topic.toLowerCase(); |
||||
return `hashtag:${normalized}[#${topic}]`; |
||||
}); |
||||
|
||||
// Restore inline code
|
||||
inlineCode.forEach((code, index) => { |
||||
content = content.replace(`__INLINECODE_${index}__`, code); |
||||
}); |
||||
|
||||
// Restore code blocks
|
||||
codeBlocks.forEach((block, index) => { |
||||
content = content.replace(`__CODEBLOCK_${index}__`, block); |
||||
}); |
||||
|
||||
return content; |
||||
} |
||||
|
||||
/** |
||||
* Convert nostr: links to link:nostr:...[...] |
||||
*/ |
||||
function convertNostrLinks(content: string): string { |
||||
// Match nostr:npub1..., nostr:note1..., etc.
|
||||
return content.replace(/nostr:([a-z0-9]+[a-z0-9]{50,})/gi, (match, bech32Id) => { |
||||
// Extract display text (first few chars)
|
||||
const display = bech32Id.substring(0, 8) + '...'; |
||||
return `link:nostr:${bech32Id}[${display}]`; |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Process media URLs and convert to MEDIA: placeholders |
||||
*/ |
||||
function processMediaUrls(content: string): string { |
||||
let processed = content; |
||||
|
||||
// YouTube URLs
|
||||
processed = processed.replace( |
||||
/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]+)/g, |
||||
(_match, videoId) => `MEDIA:youtube:${videoId}` |
||||
); |
||||
|
||||
// Spotify URLs
|
||||
processed = processed.replace( |
||||
/(?:https?:\/\/)?(?:open\.)?spotify\.com\/(track|album|playlist|artist|episode|show)\/([a-zA-Z0-9]+)/g, |
||||
(_match, type, id) => `MEDIA:spotify:${type}:${id}` |
||||
); |
||||
|
||||
// Video files
|
||||
processed = processed.replace( |
||||
/(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv))/gi, |
||||
(_match, url) => `MEDIA:video:${url}` |
||||
); |
||||
|
||||
// Audio files
|
||||
processed = processed.replace( |
||||
/(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp3|m4a|wav|flac|aac|opus|wma|ogg))/gi, |
||||
(_match, url) => `MEDIA:audio:${url}` |
||||
); |
||||
|
||||
return processed; |
||||
} |
||||
|
||||
/** |
||||
* Process images (Markdown syntax and bare URLs) |
||||
*/ |
||||
function processImages(content: string, format: ContentFormat): string { |
||||
let processed = content; |
||||
|
||||
// Markdown image syntax: 
|
||||
processed = processed.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => { |
||||
const cleanedUrl = cleanUrl(url); |
||||
const cleanAlt = alt.trim(); |
||||
return `image::${cleanedUrl}[${cleanAlt ? cleanAlt + ',' : ''}width=100%]`; |
||||
}); |
||||
|
||||
// Bare image URLs (only if not already in a link or image tag)
|
||||
if (format === ContentFormat.Markdown || format === ContentFormat.Plain) { |
||||
const imageUrlPattern = /(?<!\]\()(?<!image::)(?<!link:)(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(jpeg|jpg|png|gif|webp|svg))/gi; |
||||
processed = processed.replace(imageUrlPattern, (match, url) => { |
||||
const cleanedUrl = cleanUrl(url); |
||||
return `image::${cleanedUrl}[width=100%]`; |
||||
}); |
||||
} |
||||
|
||||
return processed; |
||||
} |
||||
|
||||
/** |
||||
* Process links (Markdown syntax and bare URLs) |
||||
*/ |
||||
function processLinks(content: string, format: ContentFormat): string { |
||||
let processed = content; |
||||
|
||||
// Markdown link syntax: [text](url)
|
||||
processed = processed.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => { |
||||
// Skip if this is already processed as an image
|
||||
if (text.startsWith('!')) { |
||||
return _match; |
||||
} |
||||
const cleanedUrl = cleanUrl(url); |
||||
return `link:${cleanedUrl}[${text}]`; |
||||
}); |
||||
|
||||
// Bare URLs (only for Markdown and Plain formats)
|
||||
if (format === ContentFormat.Markdown || format === ContentFormat.Plain) { |
||||
processed = processBareUrls(processed); |
||||
} |
||||
|
||||
return processed; |
||||
} |
||||
|
||||
/** |
||||
* Process bare URLs and convert to link: macros |
||||
* Handles http://, https://, www., and wss:// URLs
|
||||
*/ |
||||
function processBareUrls(content: string): string { |
||||
// URL pattern: matches http://, https://, www., and wss://
|
||||
// Negative lookbehind to avoid matching URLs after ":" (e.g., "hyperlink: www.example.com")
|
||||
const urlPattern = /(?<!:\s)(?<!\]\()\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+|wss:\/\/[^\s<>"{}|\\^`\[\]()]+|www\.[^\s<>"{}|\\^`\[\]()]+)/gi; |
||||
|
||||
return content.replace(urlPattern, (match, url) => { |
||||
// Skip if already in a link or image macro
|
||||
if (match.includes('link:') || match.includes('image::')) { |
||||
return match; |
||||
} |
||||
|
||||
let fullUrl = url; |
||||
let displayText = url; |
||||
|
||||
// Handle www. URLs
|
||||
if (url.startsWith('www.')) { |
||||
fullUrl = 'https://' + url; |
||||
displayText = url; |
||||
} |
||||
// Handle wss:// URLs - convert to https:// for the link, but keep wss:// in display
|
||||
else if (url.startsWith('wss://')) { |
||||
fullUrl = url.replace(/^wss:\/\//, 'https://'); |
||||
displayText = url; // Keep wss:// in display text
|
||||
} |
||||
|
||||
// Clean the URL (remove tracking parameters)
|
||||
fullUrl = cleanUrl(fullUrl); |
||||
|
||||
// Create AsciiDoc link macro
|
||||
return `link:${fullUrl}[${displayText}]`; |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Clean URL by removing tracking parameters |
||||
*/ |
||||
function cleanUrl(url: string): string { |
||||
try { |
||||
const parsedUrl = new URL(url); |
||||
|
||||
// List of tracking parameters to remove
|
||||
const trackingParams = [ |
||||
// Google Analytics & Ads
|
||||
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', |
||||
'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic', |
||||
'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid', |
||||
|
||||
// Facebook
|
||||
'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref', |
||||
|
||||
// Twitter/X
|
||||
'twclid', 'twsrc', |
||||
|
||||
// Microsoft/Bing
|
||||
'msclkid', 'mc_cid', 'mc_eid', |
||||
|
||||
// Adobe
|
||||
'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid', |
||||
|
||||
// Mailchimp
|
||||
'mc_cid', 'mc_eid', |
||||
|
||||
// HubSpot
|
||||
'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver', |
||||
|
||||
// Marketo
|
||||
'mkt_tok', |
||||
|
||||
// YouTube
|
||||
'si', 'feature', 'kw', 'pp', |
||||
|
||||
// Other common tracking
|
||||
'ref', 'referrer', 'source', 'campaign', 'medium', 'content', |
||||
'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd', |
||||
|
||||
// Mobile app tracking
|
||||
'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative', |
||||
|
||||
// Amazon
|
||||
'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag', |
||||
|
||||
// Affiliate tracking
|
||||
'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer', |
||||
|
||||
// Social media share tracking
|
||||
'share', 'shared', 'sharesource' |
||||
]; |
||||
|
||||
// Remove all tracking parameters
|
||||
trackingParams.forEach(param => { |
||||
parsedUrl.searchParams.delete(param); |
||||
}); |
||||
|
||||
// Remove any parameter that starts with utm_ or _
|
||||
Array.from(parsedUrl.searchParams.keys()).forEach(key => { |
||||
if (key.startsWith('utm_') || key.startsWith('_')) { |
||||
parsedUrl.searchParams.delete(key); |
||||
} |
||||
}); |
||||
|
||||
return parsedUrl.toString(); |
||||
} catch { |
||||
// If URL parsing fails, return original URL
|
||||
return url; |
||||
} |
||||
} |
||||
|
||||
/**
 * Convert Markdown-specific syntax to AsciiDoc.
 *
 * Currently a no-op: the common Markdown syntax is handled by AsciiDoctor's
 * Markdown compatibility, so nothing needs rewriting here yet. Kept as an
 * extension point for additional Markdown-only conversions.
 */
function convertMarkdownToAsciidoc(content: string): string {
  return content;
}

/**
 * Convert Wikipedia-specific syntax to AsciiDoc.
 *
 * Currently a no-op; Wikipedia-specific conversions can be added here.
 */
function convertWikipediaToAsciidoc(content: string): string {
  return content;
}
||||
@ -1,70 +0,0 @@
@@ -1,70 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.detectFormat = detectFormat; |
||||
const types_1 = require("./types"); |
||||
/**
 * Heuristically detects the content format (Wikipedia, AsciiDoc, Markdown
 * or Plain) by counting how many format-specific syntax indicators appear
 * in the content.
 */
function detectFormat(content) {
    // Literal substrings that suggest AsciiDoc.
    const asciidocIndicators = [
        '= ',        // title
        '== ',       // section
        '=== ',      // subsection
        'include::', // include directive
        'image::',   // image block
        '[source',   // source block
        '----',      // listing block
        '....',      // literal block
        '|===',      // table
        'link:',     // AsciiDoc link format
        'wikilink:', // wikilink macro
        'hashtag:',  // hashtag macro
    ];
    const asciidocScore = asciidocIndicators.filter((marker) => content.includes(marker)).length;

    // Patterns that suggest MediaWiki markup (== Heading == style).
    const wikipediaIndicators = [
        /^==+\s+.+?\s+==+$/m, // Wikipedia headings: == Heading ==
        /\[\[[^\]]+\]\]/,     // Wikipedia links: [[Page]]
        /''[^']+''/,          // Wikipedia bold: ''text''
        /'[^']+'/,            // Wikipedia italic: 'text' — NOTE(review): also matches ordinary apostrophe pairs
    ];
    const wikipediaScore = wikipediaIndicators.filter((pattern) => pattern.test(content)).length;

    // Patterns that suggest Markdown (kept specific to avoid false positives).
    const markdownIndicators = [
        /^#{1,6}\s+/m,          // heading at start of line
        /```[\s\S]*?```/,       // fenced code block
        /\*\*[^*]+\*\*/,        // bold text
        /^[-*+]\s+/m,           // list item at start of line
        /!\[[^\]]*\]\([^)]+\)/, // image syntax
        /\[[^\]]+\]\([^)]+\)/,  // link syntax
    ];
    const markdownScore = markdownIndicators.filter((pattern) => pattern.test(content)).length;

    // Wikipedia wins when at least two of its markers appear (it is the most
    // specific); AsciiDoc must both beat Markdown and clear the same
    // threshold; any Markdown marker at all falls back to Markdown.
    if (wikipediaScore >= 2) {
        return types_1.ContentFormat.Wikipedia;
    }
    if (asciidocScore > markdownScore && asciidocScore >= 2) {
        return types_1.ContentFormat.AsciiDoc;
    }
    if (markdownScore > 0) {
        return types_1.ContentFormat.Markdown;
    }
    return types_1.ContentFormat.Plain;
}
||||
@ -1,160 +0,0 @@
@@ -1,160 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.extractFrontmatter = extractFrontmatter; |
||||
/**
 * Extracts front matter from content.
 *
 * Handles both YAML front matter (--- ... ---) and AsciiDoc document header
 * attributes (:key: value). Returns the front matter object (or undefined if
 * none was found) plus the content with the front matter / header removed —
 * for AsciiDoc this prevents the header from appearing in rendered output.
 */
function extractFrontmatter(content) {
    // First, try YAML front matter: ---\n...\n--- at the very start.
    const yamlFrontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/;
    const yamlMatch = content.match(yamlFrontmatterRegex);
    if (yamlMatch) {
        const yamlContent = yamlMatch[1];
        const contentWithoutFrontmatter = yamlMatch[2];
        // Minimal hand-rolled YAML parser: flat key-value pairs plus simple
        // "- item" arrays. For complex YAML a real library would be needed.
        const frontmatter = {};
        const lines = yamlContent.split('\n');
        let currentKey = null;   // last key seen; a following "- item" starts an array under it
        let inArray = false;     // currently collecting array items
        let arrayKey = null;     // key the current array belongs to
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            const trimmed = line.trim();
            // Skip empty lines and comments.
            if (!trimmed || trimmed.startsWith('#')) {
                if (inArray && trimmed === '') {
                    // A blank line terminates the current array.
                    inArray = false;
                    arrayKey = null;
                }
                continue;
            }
            // Array item (line starting with "- ").
            if (trimmed.startsWith('- ')) {
                const item = trimmed.substring(2).trim();
                // Strip one leading/trailing quote character, if present.
                const cleanItem = item.replace(/^["']|["']$/g, '');
                if (arrayKey && frontmatter[arrayKey]) {
                    frontmatter[arrayKey].push(cleanItem);
                }
                else if (currentKey) {
                    // First item: turn the preceding key into an array.
                    arrayKey = currentKey;
                    inArray = true;
                    frontmatter[currentKey] = [cleanItem];
                }
                continue;
            }
            // Plain "key: value" pair.
            const keyValueMatch = trimmed.match(/^(\w+):\s*(.+)$/);
            if (keyValueMatch) {
                const key = keyValueMatch[1];
                let value = keyValueMatch[2].trim();
                // Remove surrounding quotes if present.
                if ((value.startsWith('"') && value.endsWith('"')) ||
                    (value.startsWith("'") && value.endsWith("'"))) {
                    value = value.slice(1, -1);
                }
                frontmatter[key] = value;
                currentKey = key;
                inArray = false;
                arrayKey = null;
                continue;
            }
        }
        return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutFrontmatter };
    }
    // No YAML front matter — try an AsciiDoc document header instead.
    // Expected shape: "= Title" line, optional author/revision lines,
    // ":attribute: value" lines, then a blank line separating the body.
    const lines = content.split('\n');
    let headerEndIndex = 0;
    // Find where the header ends (only when the first line is a title).
    if (lines[0] && lines[0].match(/^=+\s+/)) {
        let i = 1;
        // Skip author and revision lines (non-empty lines not starting with ':').
        while (i < lines.length && lines[i].trim() && !lines[i].trim().startsWith(':')) {
            i++;
        }
        // Skip attribute lines (lines starting with ':').
        while (i < lines.length && lines[i].trim().startsWith(':')) {
            i++;
        }
        // Consume the blank line that separates header from body.
        if (i < lines.length && lines[i].trim() === '') {
            i++;
        }
        headerEndIndex = i;
    }
    // If a header was found, extract its metadata and strip it from content.
    if (headerEndIndex > 0) {
        const headerLines = lines.slice(0, headerEndIndex);
        const headerContent = headerLines.join('\n');
        const contentWithoutHeader = lines.slice(headerEndIndex).join('\n');
        const frontmatter = {};
        const headerLinesArray = headerContent.split('\n');
        // Title: first line, with the leading '=' markers removed.
        const titleMatch = headerLinesArray[0].match(/^=+\s+(.+)$/);
        if (titleMatch) {
            frontmatter.title = titleMatch[1].trim();
        }
        // Author: the line after the title, unless it is an attribute line or
        // looks like a bare revision line (digits/punctuation only).
        if (headerLinesArray.length > 1 && !headerLinesArray[1].trim().startsWith(':')) {
            const authorLine = headerLinesArray[1].trim();
            if (authorLine && !authorLine.match(/^[\d.,\s:]+$/)) {
                frontmatter.author = authorLine;
            }
        }
        // Revision line, e.g. "2.9, October 31, 2021: Fall incarnation".
        // NOTE(review): the guard /^[\d.,\s:]+$/ only matches lines made
        // entirely of digits and punctuation, so a revision line containing a
        // month name or remark (like the example above) never matches — it is
        // captured as the author instead and this branch is effectively dead
        // for such input. Confirm against the intended AsciiDoc header format.
        for (let i = 1; i < headerLinesArray.length; i++) {
            const line = headerLinesArray[i].trim();
            if (line.match(/^[\d.,\s:]+$/)) {
                // Split into version, date and optional remark.
                const revisionMatch = line.match(/^([^,]+),\s*([^:]+)(?::\s*(.+))?$/);
                if (revisionMatch) {
                    frontmatter.version = revisionMatch[1].trim();
                    frontmatter.date = revisionMatch[2].trim();
                    if (revisionMatch[3]) {
                        frontmatter.revision = revisionMatch[3].trim();
                    }
                }
                break;
            }
        }
        // AsciiDoc attributes (":key: value") anywhere in the header.
        for (const line of headerLinesArray) {
            const trimmed = line.trim();
            if (trimmed.startsWith(':') && trimmed.includes(':')) {
                const attrMatch = trimmed.match(/^:([^:]+):\s*(.+)$/);
                if (attrMatch) {
                    const key = attrMatch[1].trim();
                    let value = attrMatch[2].trim();
                    // Remove surrounding quotes if present.
                    if ((value.startsWith('"') && value.endsWith('"')) ||
                        (value.startsWith("'") && value.endsWith("'"))) {
                        value = value.slice(1, -1);
                    }
                    // Space-free comma-separated values (e.g. keywords) become arrays.
                    if (value.includes(',') && !value.includes(' ')) {
                        frontmatter[key] = value.split(',').map((v) => v.trim());
                    }
                    else {
                        frontmatter[key] = value;
                    }
                }
            }
        }
        // Header removed from content: AsciiDoctor renders fine without it and
        // the metadata has already been extracted above.
        return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutHeader };
    }
    // No front matter of either kind found — return content unchanged.
    return { content };
}
||||
@ -1,177 +0,0 @@
@@ -1,177 +0,0 @@
|
||||
/** |
||||
* Extracts front matter from content |
||||
* Handles both YAML front matter (--- ... ---) and AsciiDoc document header attributes (:key: value) |
||||
* Returns the front matter object and the content |
||||
* For YAML: removes front matter from content |
||||
* For AsciiDoc: removes header from content and extracts as metadata (prevents header from appearing in rendered output) |
||||
*/ |
||||
export function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } { |
||||
// First, try to match YAML front matter: ---\n...\n---
|
||||
const yamlFrontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/; |
||||
const yamlMatch = content.match(yamlFrontmatterRegex); |
||||
|
||||
if (yamlMatch) { |
||||
const yamlContent = yamlMatch[1]; |
||||
const contentWithoutFrontmatter = yamlMatch[2]; |
||||
|
||||
// Simple YAML parser for basic key-value pairs and arrays
|
||||
// This is a basic implementation - for complex YAML, consider using a library
|
||||
const frontmatter: Record<string, any> = {}; |
||||
const lines = yamlContent.split('\n'); |
||||
let currentKey: string | null = null; |
||||
let inArray = false; |
||||
let arrayKey: string | null = null; |
||||
|
||||
for (let i = 0; i < lines.length; i++) { |
||||
const line = lines[i]; |
||||
const trimmed = line.trim(); |
||||
|
||||
// Skip empty lines and comments
|
||||
if (!trimmed || trimmed.startsWith('#')) { |
||||
if (inArray && trimmed === '') { |
||||
// Empty line might end the array
|
||||
inArray = false; |
||||
arrayKey = null; |
||||
} |
||||
continue; |
||||
} |
||||
|
||||
// Array item (line starting with -)
|
||||
if (trimmed.startsWith('- ')) { |
||||
const item = trimmed.substring(2).trim(); |
||||
const cleanItem = item.replace(/^["']|["']$/g, ''); |
||||
|
||||
if (arrayKey && frontmatter[arrayKey]) { |
||||
frontmatter[arrayKey].push(cleanItem); |
||||
} else if (currentKey) { |
||||
// Start new array
|
||||
arrayKey = currentKey; |
||||
inArray = true; |
||||
frontmatter[currentKey] = [cleanItem]; |
||||
} |
||||
continue; |
||||
} |
||||
|
||||
// Key-value pair
|
||||
const keyValueMatch = trimmed.match(/^(\w+):\s*(.+)$/); |
||||
if (keyValueMatch) { |
||||
const key = keyValueMatch[1]; |
||||
let value = keyValueMatch[2].trim(); |
||||
|
||||
// Remove quotes if present
|
||||
if ((value.startsWith('"') && value.endsWith('"')) ||
|
||||
(value.startsWith("'") && value.endsWith("'"))) { |
||||
value = value.slice(1, -1); |
||||
} |
||||
|
||||
frontmatter[key] = value; |
||||
currentKey = key; |
||||
inArray = false; |
||||
arrayKey = null; |
||||
continue; |
||||
} |
||||
} |
||||
|
||||
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutFrontmatter }; |
||||
} |
||||
|
||||
// If no YAML front matter, try to extract AsciiDoc document header attributes
|
||||
// AsciiDoc format: = Title\nAuthor\nRevision\n:attribute: value\n...
|
||||
// Match header lines until we hit a blank line (which separates header from body)
|
||||
// The header consists of: title line, optional author/revision lines, and attribute lines
|
||||
const lines = content.split('\n'); |
||||
let headerEndIndex = 0; |
||||
|
||||
// Find where the header ends (first blank line after title/attributes)
|
||||
if (lines[0] && lines[0].match(/^=+\s+/)) { |
||||
// We have a title line, now find where header ends
|
||||
let i = 1; |
||||
// Skip author and revision lines (non-empty lines that don't start with :)
|
||||
while (i < lines.length && lines[i].trim() && !lines[i].trim().startsWith(':')) { |
||||
i++; |
||||
} |
||||
// Now skip attribute lines (lines starting with :)
|
||||
while (i < lines.length && lines[i].trim().startsWith(':')) { |
||||
i++; |
||||
} |
||||
// Skip the blank line that separates header from body
|
||||
if (i < lines.length && lines[i].trim() === '') { |
||||
i++; |
||||
} |
||||
headerEndIndex = i; |
||||
} |
||||
|
||||
// If we found a header, extract it
|
||||
if (headerEndIndex > 0) { |
||||
const headerLines = lines.slice(0, headerEndIndex); |
||||
const headerContent = headerLines.join('\n'); |
||||
const contentWithoutHeader = lines.slice(headerEndIndex).join('\n'); |
||||
|
||||
const frontmatter: Record<string, any> = {}; |
||||
const headerLinesArray = headerContent.split('\n'); |
||||
|
||||
// Extract title (first line starting with =)
|
||||
const titleMatch = headerLinesArray[0].match(/^=+\s+(.+)$/); |
||||
if (titleMatch) { |
||||
frontmatter.title = titleMatch[1].trim(); |
||||
} |
||||
|
||||
// Extract author (line after title, if it doesn't start with :)
|
||||
if (headerLinesArray.length > 1 && !headerLinesArray[1].trim().startsWith(':')) { |
||||
const authorLine = headerLinesArray[1].trim(); |
||||
if (authorLine && !authorLine.match(/^[\d.,\s:]+$/)) { |
||||
// Not a revision line (which has numbers, commas, colons)
|
||||
frontmatter.author = authorLine; |
||||
} |
||||
} |
||||
|
||||
// Extract revision (line with version, date, remark format: "2.9, October 31, 2021: Fall incarnation")
|
||||
for (let i = 1; i < headerLinesArray.length; i++) { |
||||
const line = headerLinesArray[i].trim(); |
||||
if (line.match(/^[\d.,\s:]+$/)) { |
||||
// This looks like a revision line
|
||||
const revisionMatch = line.match(/^([^,]+),\s*([^:]+)(?::\s*(.+))?$/); |
||||
if (revisionMatch) { |
||||
frontmatter.version = revisionMatch[1].trim(); |
||||
frontmatter.date = revisionMatch[2].trim(); |
||||
if (revisionMatch[3]) { |
||||
frontmatter.revision = revisionMatch[3].trim(); |
||||
} |
||||
} |
||||
break; |
||||
} |
||||
} |
||||
|
||||
// Extract AsciiDoc attributes (:key: value)
|
||||
for (const line of headerLinesArray) { |
||||
const trimmed = line.trim(); |
||||
if (trimmed.startsWith(':') && trimmed.includes(':')) { |
||||
const attrMatch = trimmed.match(/^:([^:]+):\s*(.+)$/); |
||||
if (attrMatch) { |
||||
const key = attrMatch[1].trim(); |
||||
let value = attrMatch[2].trim(); |
||||
|
||||
// Remove quotes if present
|
||||
if ((value.startsWith('"') && value.endsWith('"')) ||
|
||||
(value.startsWith("'") && value.endsWith("'"))) { |
||||
value = value.slice(1, -1); |
||||
} |
||||
|
||||
// Handle comma-separated values (like keywords)
|
||||
if (value.includes(',') && !value.includes(' ')) { |
||||
frontmatter[key] = value.split(',').map((v: string) => v.trim()); |
||||
} else { |
||||
frontmatter[key] = value; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
// For AsciiDoc, remove the header from content to prevent it from appearing in rendered output
|
||||
// AsciiDoctor can work without the header, and we've already extracted the metadata
|
||||
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutHeader }; |
||||
} |
||||
|
||||
// No front matter found
|
||||
return { content }; |
||||
} |
||||
@ -1,243 +0,0 @@
@@ -1,243 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.extractMetadata = extractMetadata; |
||||
/**
 * Extracts all metadata (nostr links, wikilinks, hashtags, links, media)
 * from content before processing.
 */
function extractMetadata(content, linkBaseURL) {
    const nostrLinks = extractNostrLinks(content);
    const wikilinks = extractWikilinks(content);
    const hashtags = extractHashtags(content);
    const links = extractLinks(content, linkBaseURL);
    const media = extractMedia(content);
    return { nostrLinks, wikilinks, hashtags, links, media };
}
||||
/**
 * Extract nostr:-prefixed bech32 links (npub/nprofile/nevent/naddr/note),
 * de-duplicated by identifier.
 */
function extractNostrLinks(content) {
    const found = [];
    const recorded = new Set();
    // Only well-formed bech32 payloads after the "nostr:" scheme qualify.
    const matches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || [];
    for (const token of matches) {
        const id = token.substring(6); // drop the "nostr:" scheme
        const type = getNostrType(id);
        if (!type || recorded.has(id))
            continue;
        recorded.add(id);
        found.push({ type, id, text: token, bech32: id });
    }
    return found;
}
||||
/**
 * Extract [[target]] / [[target|display]] wikilinks, de-duplicated by the
 * normalized dtag + display pair.
 */
function extractWikilinks(content) {
    const results = [];
    const dedupe = new Set();
    const pattern = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g;
    for (let m = pattern.exec(content); m !== null; m = pattern.exec(content)) {
        const target = m[1].trim();
        const display = m[2] ? m[2].trim() : target;
        const dtag = normalizeDtag(target);
        const key = `${dtag}|${display}`;
        if (dedupe.has(key))
            continue;
        dedupe.add(key);
        results.push({ dtag, display, original: m[0] });
    }
    return results;
}
||||
/**
 * Extract #hashtags (lowercased, de-duplicated).
 * Hashtags inside fenced code blocks, inline code, and URLs are ignored.
 */
function extractHashtags(content) {
    const tags = [];
    const seenTags = new Set();
    // Strip regions that must never contribute hashtags before scanning.
    const stripped = content
        .replace(/```[\s\S]*?```/g, '') // fenced code blocks
        .replace(/`[^`]+`/g, '') // inline code
        .replace(/https?:\/\/[^\s<>"']+/g, ''); // URLs (e.g. fragment anchors)
    // \B avoids matching a '#' glued to a preceding word character.
    const pattern = /\B#([a-zA-Z0-9_]+)/g;
    for (let m = pattern.exec(stripped); m !== null; m = pattern.exec(stripped)) {
        const tag = m[1].toLowerCase();
        if (!seenTags.has(tag)) {
            tags.push(tag);
            seenTags.add(tag);
        }
    }
    return tags;
}
||||
/**
 * Extract regular hyperlinks from markdown links, AsciiDoc links, and bare
 * URLs; nostr URLs are skipped and each URL is reported at most once.
 */
function extractLinks(content, linkBaseURL) {
    const links = [];
    const seen = new Set();
    // Shared recorder: dedupe, drop nostr URLs, classify external-ness.
    const record = (url, text) => {
        if (!seen.has(url) && !isNostrUrl(url)) {
            seen.add(url);
            links.push({ url, text, isExternal: isExternalUrl(url, linkBaseURL) });
        }
    };
    // Markdown links: [text](url)
    const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
    for (let m = markdownLinkPattern.exec(content); m !== null; m = markdownLinkPattern.exec(content)) {
        record(m[2], m[1]);
    }
    // AsciiDoc links: link:url[text]
    const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g;
    for (let m = asciidocLinkPattern.exec(content); m !== null; m = asciidocLinkPattern.exec(content)) {
        record(m[1], m[2]);
    }
    // Bare URLs: the URL itself doubles as the link text.
    for (const url of content.match(/https?:\/\/[^\s<>"']+/g) || []) {
        record(url, url);
    }
    return links;
}
||||
/**
 * Extract image/video URLs from markdown images, AsciiDoc images, and bare
 * URLs; each URL is reported once and must pass the media-extension checks.
 */
function extractMedia(content) {
    const media = [];
    const known = new Set();
    // Shared recorder: dedupe and keep only recognized image/video URLs.
    const record = (url) => {
        if (url && !known.has(url) && (isImageUrl(url) || isVideoUrl(url))) {
            media.push(url);
            known.add(url);
        }
    };
    // Markdown images: ![alt](url)
    const mdPattern = /!\[[^\]]*\]\(([^)]+)\)/g;
    for (let m = mdPattern.exec(content); m !== null; m = mdPattern.exec(content)) {
        record(m[1]);
    }
    // AsciiDoc block images: image::url[alt]
    const adocPattern = /image::([^\[]+)\[/g;
    for (let m = adocPattern.exec(content); m !== null; m = adocPattern.exec(content)) {
        record(m[1]);
    }
    // Bare media URLs.
    for (const url of content.match(/https?:\/\/[^\s<>"']+/g) || []) {
        record(url);
    }
    return media;
}
||||
/**
 * Map a bech32 identifier to its Nostr type by prefix, or null when the
 * prefix is not one of the recognized kinds.
 */
function getNostrType(id) {
    for (const prefix of ['npub', 'nprofile', 'nevent', 'naddr', 'note']) {
        if (id.startsWith(prefix))
            return prefix;
    }
    return null;
}
||||
/**
 * Normalize text into d-tag form: lowercase, runs of non-alphanumerics
 * collapsed to single hyphens, leading/trailing hyphens removed.
 */
function normalizeDtag(text) {
    const hyphenated = text.toLowerCase().replace(/[^a-z0-9]+/g, '-');
    return hyphenated.replace(/^-+|-+$/g, '');
}
||||
/**
 * True when url's hostname differs from linkBaseURL's.
 * Defaults to "external" when no base URL is configured or when either
 * hostname cannot be extracted.
 */
function isExternalUrl(url, linkBaseURL) {
    if (!linkBaseURL)
        return true;
    try {
        // String-based hostname extraction (no URL constructor) for Node.js
        // compatibility.
        const hostOf = (value) => {
            const m = value.match(/^https?:\/\/([^\/]+)/);
            return m ? m[1] : undefined;
        };
        const urlHost = hostOf(url);
        const baseHost = hostOf(linkBaseURL);
        if (urlHost !== undefined && baseHost !== undefined) {
            return urlHost !== baseHost;
        }
        return true;
    }
    catch {
        return true;
    }
}
||||
/**
 * True for nostr:-scheme URLs or bare bech32 nostr identifiers.
 */
function isNostrUrl(url) {
    if (url.startsWith('nostr:'))
        return true;
    return getNostrType(url) !== null;
}
||||
/**
 * Check if URL is an image.
 * The query string and fragment are ignored, so URLs such as
 * "https://host/pic.png?w=100" are still recognized as images.
 */
function isImageUrl(url) {
    // Keep only the part before the first '?' or '#'.
    const path = url.split(/[?#]/, 1)[0];
    return /\.(jpeg|jpg|png|gif|webp|svg)$/i.test(path);
}
||||
/**
 * Check if URL is a video.
 * The query string and fragment are ignored, so URLs such as
 * "https://host/clip.mp4?t=10" are still recognized as videos.
 */
function isVideoUrl(url) {
    // Keep only the part before the first '?' or '#'.
    const path = url.split(/[?#]/, 1)[0];
    return /\.(mp4|webm|ogg)$/i.test(path);
}
||||
@ -1,396 +0,0 @@
@@ -1,396 +0,0 @@
|
||||
import { NostrLink, Wikilink } from '../types'; |
||||
|
||||
export interface ExtractedMetadata { |
||||
nostrLinks: NostrLink[]; |
||||
wikilinks: Wikilink[]; |
||||
hashtags: string[]; |
||||
links: Array<{ url: string; text: string; isExternal: boolean }>; |
||||
media: string[]; |
||||
} |
||||
|
||||
/** |
||||
* Extracts metadata from content before processing |
||||
*/ |
||||
export function extractMetadata(content: string, linkBaseURL: string): ExtractedMetadata { |
||||
return { |
||||
nostrLinks: extractNostrLinks(content), |
||||
wikilinks: extractWikilinks(content), |
||||
hashtags: extractHashtags(content), |
||||
links: extractLinks(content, linkBaseURL), |
||||
media: extractMedia(content), |
||||
}; |
||||
} |
||||
|
||||
/** |
||||
* Extract Nostr links from content |
||||
*/ |
||||
function extractNostrLinks(content: string): NostrLink[] { |
||||
const nostrLinks: NostrLink[] = []; |
||||
const seen = new Set<string>(); |
||||
|
||||
// Extract nostr: prefixed links (valid bech32 format)
|
||||
const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || []; |
||||
nostrMatches.forEach(match => { |
||||
const id = match.substring(6); // Remove 'nostr:'
|
||||
const type = getNostrType(id); |
||||
if (type && !seen.has(id)) { |
||||
seen.add(id); |
||||
nostrLinks.push({ |
||||
type, |
||||
id, |
||||
text: match, |
||||
bech32: id, |
||||
}); |
||||
} |
||||
}); |
||||
|
||||
return nostrLinks; |
||||
} |
||||
|
||||
/** |
||||
* Extract wikilinks from content |
||||
*/ |
||||
function extractWikilinks(content: string): Wikilink[] { |
||||
const wikilinks: Wikilink[] = []; |
||||
const seen = new Set<string>(); |
||||
|
||||
// Match [[target]] or [[target|display]]
|
||||
const wikilinkPattern = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g; |
||||
let match; |
||||
|
||||
while ((match = wikilinkPattern.exec(content)) !== null) { |
||||
const target = match[1].trim(); |
||||
const display = match[2] ? match[2].trim() : target; |
||||
const dtag = normalizeDtag(target); |
||||
const key = `${dtag}|${display}`; |
||||
|
||||
if (!seen.has(key)) { |
||||
seen.add(key); |
||||
wikilinks.push({ |
||||
dtag, |
||||
display, |
||||
original: match[0], |
||||
}); |
||||
} |
||||
} |
||||
|
||||
return wikilinks; |
||||
} |
||||
|
||||
/** |
||||
* Extract hashtags from content |
||||
* Excludes hashtags in URLs, code blocks, and inline code |
||||
*/ |
||||
function extractHashtags(content: string): string[] { |
||||
const hashtags: string[] = []; |
||||
const seen = new Set<string>(); |
||||
|
||||
// Remove code blocks first to avoid matching inside them
|
||||
const codeBlockPattern = /```[\s\S]*?```/g; |
||||
const inlineCodePattern = /`[^`]+`/g; |
||||
const urlPattern = /https?:\/\/[^\s<>"']+/g; |
||||
|
||||
let processedContent = content |
||||
.replace(codeBlockPattern, '') // Remove code blocks
|
||||
.replace(inlineCodePattern, '') // Remove inline code
|
||||
.replace(urlPattern, ''); // Remove URLs
|
||||
|
||||
// Extract hashtags: #hashtag (word boundary to avoid matching in URLs)
|
||||
const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g; |
||||
let match; |
||||
|
||||
while ((match = hashtagPattern.exec(processedContent)) !== null) { |
||||
const tag = match[1].toLowerCase(); |
||||
if (!seen.has(tag)) { |
||||
hashtags.push(tag); |
||||
seen.add(tag); |
||||
} |
||||
} |
||||
|
||||
return hashtags; |
||||
} |
||||
|
||||
/** |
||||
* Extract regular links from content |
||||
*/ |
||||
function extractLinks(content: string, linkBaseURL: string): Array<{ url: string; text: string; isExternal: boolean }> { |
||||
const links: Array<{ url: string; text: string; isExternal: boolean }> = []; |
||||
const seen = new Set<string>(); |
||||
|
||||
// Remove code blocks and inline code to avoid matching URLs inside them
|
||||
const codeBlockPattern = /```[\s\S]*?```/g; |
||||
const inlineCodePattern = /`[^`]+`/g; |
||||
let processedContent = content |
||||
.replace(codeBlockPattern, '') // Remove code blocks
|
||||
.replace(inlineCodePattern, ''); // Remove inline code
|
||||
|
||||
// Extract markdown links: [text](url) - but NOT images 
|
||||
// First, extract nested image links: [](link-url)
|
||||
// These should extract the outer link with the alt text
|
||||
// We also need to mark the inner image URL as seen so it doesn't get extracted as a raw URL
|
||||
const nestedImageLinkPattern = /\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)/g; |
||||
let nestedMatch; |
||||
const nestedImageUrls = new Set<string>(); // Track inner image URLs to exclude them
|
||||
while ((nestedMatch = nestedImageLinkPattern.exec(processedContent)) !== null) { |
||||
const [, altText, imageUrl, linkUrl] = nestedMatch; |
||||
const cleanLinkUrl = linkUrl.trim().replace(/[)\].,;:!?`]+$/, ''); |
||||
const cleanImageUrl = imageUrl.trim().replace(/[)\].,;:!?`]+$/, ''); |
||||
|
||||
// Mark the inner image URL as seen so it doesn't get extracted as a raw URL
|
||||
nestedImageUrls.add(cleanImageUrl); |
||||
// Also mark it in the seen set to prevent it from being extracted as a regular link
|
||||
seen.add(cleanImageUrl); |
||||
|
||||
if (cleanLinkUrl && cleanLinkUrl.match(/^https?:\/\//i) && !isNostrUrl(cleanLinkUrl) && !seen.has(cleanLinkUrl)) { |
||||
seen.add(cleanLinkUrl); |
||||
links.push({ |
||||
url: cleanLinkUrl, |
||||
text: altText.trim() || 'Image link', // Use the alt text from the image (e.g., "Youtube link with pic")
|
||||
isExternal: isExternalUrl(cleanLinkUrl, linkBaseURL), |
||||
}); |
||||
} |
||||
} |
||||
|
||||
// Now extract regular markdown links: [text](url) - but NOT images 
|
||||
// Use a pattern that explicitly excludes images by checking before the match
|
||||
const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g; |
||||
let markdownMatch; |
||||
while ((markdownMatch = markdownLinkPattern.exec(processedContent)) !== null) { |
||||
// Check if this is an image (preceded by !)
|
||||
// We need to check the character immediately before the opening bracket
|
||||
const matchIndex = markdownMatch.index; |
||||
if (matchIndex > 0) { |
||||
const charBefore = processedContent[matchIndex - 1]; |
||||
if (charBefore === '!') { |
||||
continue; // Skip images - this is , not [text](url)
|
||||
} |
||||
} |
||||
|
||||
let [, text, url] = markdownMatch; |
||||
|
||||
// Skip if this is a nested image link (we already extracted those above)
|
||||
if (text.trim().startsWith(') { |
||||
continue; // Already handled by nestedImageLinkPattern
|
||||
} |
||||
|
||||
// Handle AsciiDoc image syntax in markdown links: [image::url[alt,width=100%]](link-url)
|
||||
// This happens when AsciiDoc content is converted to markdown-style links
|
||||
if (text.trim().startsWith('image::') || text.trim().startsWith('image:')) { |
||||
// Match image::url[alt,attributes] or image:url[alt,attributes]
|
||||
const imageMatch = text.match(/^image:?:[^\[]+\[([^\],]+)/); |
||||
if (imageMatch) { |
||||
text = imageMatch[1].trim(); // Use just the alt text (e.g., "Youtube link with pic")
|
||||
} else { |
||||
// If we can't extract alt text, use a default
|
||||
text = 'Image link'; |
||||
} |
||||
} |
||||
|
||||
// Clean up URL - remove trailing punctuation that might have been captured
|
||||
// But preserve parentheses that are part of the URL (like in query strings)
|
||||
// Only remove trailing punctuation that's clearly not part of the URL
|
||||
url = url.trim(); |
||||
|
||||
// Remove trailing punctuation that's likely not part of the URL
|
||||
// But be careful - URLs can end with ) if they're in markdown like [text](url))
|
||||
// We'll be conservative and only remove if it's clearly punctuation
|
||||
url = url.replace(/[)\].,;:!?`]+$/, ''); |
||||
|
||||
// Clean up text - remove stray punctuation and whitespace
|
||||
text = text.trim(); |
||||
|
||||
// Skip if URL is empty or invalid
|
||||
if (!url || !url.match(/^https?:\/\//i)) { |
||||
continue; |
||||
} |
||||
|
||||
if (!seen.has(url) && !isNostrUrl(url)) { |
||||
seen.add(url); |
||||
links.push({ |
||||
url, |
||||
text, |
||||
isExternal: isExternalUrl(url, linkBaseURL), |
||||
}); |
||||
} |
||||
} |
||||
|
||||
// Extract asciidoc links: link:url[text] - optimized to avoid double matching
|
||||
// Handle nested image links: link:url[image::image-url[alt,width=100%]]
|
||||
const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g; |
||||
let asciidocMatch; |
||||
while ((asciidocMatch = asciidocLinkPattern.exec(processedContent)) !== null) { |
||||
let [, url, text] = asciidocMatch; |
||||
|
||||
// Clean up URL
|
||||
url = url.trim(); |
||||
|
||||
// Handle nested image syntax in AsciiDoc: image::url[alt,width=100%]
|
||||
// Extract just the alt text from the image syntax
|
||||
if (text.trim().startsWith('image::') || text.trim().startsWith('image:')) { |
||||
// Match image::url[alt,attributes] or image:url[alt,attributes]
|
||||
const imageMatch = text.match(/^image:?:[^\[]+\[([^\],]+)/); |
||||
if (imageMatch) { |
||||
text = imageMatch[1].trim(); // Use just the alt text
|
||||
} else { |
||||
// If we can't extract alt text, skip this link (it's an image, not a text link)
|
||||
continue; |
||||
} |
||||
} |
||||
|
||||
// Clean up text
|
||||
text = text.trim(); |
||||
|
||||
// Skip if URL is empty or invalid
|
||||
if (!url || !url.match(/^https?:\/\//i)) { |
||||
continue; |
||||
} |
||||
|
||||
if (!seen.has(url) && !isNostrUrl(url)) { |
||||
seen.add(url); |
||||
links.push({ |
||||
url, |
||||
text, |
||||
isExternal: isExternalUrl(url, linkBaseURL), |
||||
}); |
||||
} |
||||
} |
||||
|
||||
// Extract raw URLs (basic pattern) - but exclude those already in markdown/asciidoc links
|
||||
// More restrictive pattern to avoid capturing trailing punctuation
|
||||
const urlPattern = /https?:\/\/[^\s<>"'`()\[\]]+/g; |
||||
const rawUrls = processedContent.match(urlPattern) || []; |
||||
rawUrls.forEach(url => { |
||||
// Remove trailing punctuation that might have been captured
|
||||
url = url.replace(/[)\].,;:!?`]+$/, ''); |
||||
|
||||
// Skip if URL is too short or invalid
|
||||
if (!url || url.length < 10 || !url.match(/^https?:\/\/[^\s]+$/i)) { |
||||
return; |
||||
} |
||||
|
||||
// Skip if this is an inner image URL from a nested image link
|
||||
if (nestedImageUrls.has(url)) { |
||||
return; |
||||
} |
||||
|
||||
if (!seen.has(url) && !isNostrUrl(url)) { |
||||
seen.add(url); |
||||
links.push({ |
||||
url, |
||||
text: url, |
||||
isExternal: isExternalUrl(url, linkBaseURL), |
||||
}); |
||||
} |
||||
}); |
||||
|
||||
return links; |
||||
} |
||||
|
||||
/** |
||||
* Extract media URLs from content |
||||
*/ |
||||
function extractMedia(content: string): string[] { |
||||
const media: string[] = []; |
||||
const seen = new Set<string>(); |
||||
|
||||
// Extract markdown images:  - optimized to avoid double matching
|
||||
const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g; |
||||
let markdownImageMatch; |
||||
while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) { |
||||
const url = markdownImageMatch[1]; |
||||
if (url && !seen.has(url)) { |
||||
if (isImageUrl(url) || isVideoUrl(url)) { |
||||
media.push(url); |
||||
seen.add(url); |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Extract asciidoc images: image::url[alt] - optimized to avoid double matching
|
||||
const asciidocImagePattern = /image::([^\[]+)\[/g; |
||||
let asciidocImageMatch; |
||||
while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) { |
||||
const url = asciidocImageMatch[1]; |
||||
if (url && !seen.has(url)) { |
||||
if (isImageUrl(url) || isVideoUrl(url)) { |
||||
media.push(url); |
||||
seen.add(url); |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Extract raw image/video URLs
|
||||
const urlPattern = /https?:\/\/[^\s<>"']+/g; |
||||
const rawUrls = content.match(urlPattern) || []; |
||||
rawUrls.forEach(url => { |
||||
if (!seen.has(url) && (isImageUrl(url) || isVideoUrl(url))) { |
||||
media.push(url); |
||||
seen.add(url); |
||||
} |
||||
}); |
||||
|
||||
return media; |
||||
} |
||||
|
||||
/** |
||||
* Get Nostr identifier type |
||||
*/ |
||||
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null { |
||||
if (id.startsWith('npub')) return 'npub'; |
||||
if (id.startsWith('nprofile')) return 'nprofile'; |
||||
if (id.startsWith('nevent')) return 'nevent'; |
||||
if (id.startsWith('naddr')) return 'naddr'; |
||||
if (id.startsWith('note')) return 'note'; |
||||
return null; |
||||
} |
||||
|
||||
/** |
||||
* Normalize text to d-tag format |
||||
*/ |
||||
function normalizeDtag(text: string): string { |
||||
return text |
||||
.toLowerCase() |
||||
.replace(/[^a-z0-9]+/g, '-') |
||||
.replace(/^-+|-+$/g, ''); |
||||
} |
||||
|
||||
/** |
||||
* Check if URL is external |
||||
*/ |
||||
function isExternalUrl(url: string, linkBaseURL: string): boolean { |
||||
if (!linkBaseURL) return true; |
||||
try { |
||||
// Use a simple string-based check for Node.js compatibility
|
||||
// Extract hostname from URL string
|
||||
const urlMatch = url.match(/^https?:\/\/([^\/]+)/); |
||||
const baseMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/); |
||||
|
||||
if (urlMatch && baseMatch) { |
||||
return urlMatch[1] !== baseMatch[1]; |
||||
} |
||||
return true; |
||||
} catch { |
||||
return true; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Check if URL is a Nostr URL |
||||
*/ |
||||
function isNostrUrl(url: string): boolean { |
||||
return url.startsWith('nostr:') || getNostrType(url) !== null; |
||||
} |
||||
|
||||
/** |
||||
* Check if URL is an image |
||||
*/ |
||||
function isImageUrl(url: string): boolean { |
||||
return /\.(jpeg|jpg|png|gif|webp|svg)$/i.test(url); |
||||
} |
||||
|
||||
/** |
||||
* Check if URL is a video |
||||
*/ |
||||
function isVideoUrl(url: string): boolean { |
||||
return /\.(mp4|webm|ogg)$/i.test(url); |
||||
} |
||||
@ -1,92 +0,0 @@
@@ -1,92 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.Parser = void 0; |
||||
exports.defaultOptions = defaultOptions; |
||||
exports.process = process; |
||||
const detector_1 = require("./detector"); |
||||
const to_asciidoc_1 = require("./converters/to-asciidoc"); |
||||
const asciidoc_1 = require("./processors/asciidoc"); |
||||
const metadata_1 = require("./extractors/metadata"); |
||||
const frontmatter_1 = require("./extractors/frontmatter"); |
||||
/**
 * Default parser options: every feature enabled, no base URL configured.
 */
function defaultOptions() {
    const enabledByDefault = {
        enableAsciiDoc: true,
        enableMarkdown: true,
        enableCodeHighlighting: true,
        enableLaTeX: true,
        enableMusicalNotation: true,
        enableNostrAddresses: true,
    };
    return { linkBaseURL: '', ...enabledByDefault };
}
||||
/**
 * Main parser for Nostr event content.
 * Handles multiple content formats: AsciiDoc, Markdown, code syntax,
 * LaTeX, musical notation, and nostr: prefixed addresses.
 *
 * Everything is converted to AsciiDoc first, then processed through
 * AsciiDoctor.
 */
class Parser {
    constructor(options = {}) {
        const base = defaultOptions();
        // Per-option merge: a null/undefined user value falls back to the
        // default, then to a hard-coded fallback.
        this.options = {
            linkBaseURL: options.linkBaseURL ?? base.linkBaseURL ?? '',
            enableAsciiDoc: options.enableAsciiDoc ?? base.enableAsciiDoc ?? true,
            enableMarkdown: options.enableMarkdown ?? base.enableMarkdown ?? true,
            enableCodeHighlighting: options.enableCodeHighlighting ?? base.enableCodeHighlighting ?? true,
            enableLaTeX: options.enableLaTeX ?? base.enableLaTeX ?? true,
            enableMusicalNotation: options.enableMusicalNotation ?? base.enableMusicalNotation ?? true,
            enableNostrAddresses: options.enableNostrAddresses ?? base.enableNostrAddresses ?? true,
            wikilinkUrl: options.wikilinkUrl ?? base.wikilinkUrl,
            hashtagUrl: options.hashtagUrl ?? base.hashtagUrl,
        };
    }
    /**
     * Process Nostr event content and return HTML plus extracted metadata.
     * The content format is detected automatically; everything is converted
     * to AsciiDoc first, then rendered through AsciiDoctor.
     */
    async process(content) {
        // 1. Strip front matter before any other processing.
        const { frontmatter, content: body } = (0, frontmatter_1.extractFrontmatter)(content);
        // 2. Collect links/hashtags/media from the remaining body.
        const metadata = (0, metadata_1.extractMetadata)(body, this.options.linkBaseURL);
        // 3. Detect the source format and normalize everything to AsciiDoc.
        const format = (0, detector_1.detectFormat)(body);
        const asciidocContent = (0, to_asciidoc_1.convertToAsciidoc)(body, format, this.options.linkBaseURL, {
            enableNostrAddresses: this.options.enableNostrAddresses,
        });
        // 4. Render through AsciiDoctor.
        const rendered = await (0, asciidoc_1.processAsciidoc)(asciidocContent, {
            enableCodeHighlighting: this.options.enableCodeHighlighting,
            enableLaTeX: this.options.enableLaTeX,
            enableMusicalNotation: this.options.enableMusicalNotation,
            originalContent: body, // original text needed for LaTeX detection
            linkBaseURL: this.options.linkBaseURL,
            wikilinkUrl: this.options.wikilinkUrl,
            hashtagUrl: this.options.hashtagUrl,
        });
        // 5. Merge the rendered result with front matter and metadata.
        return {
            ...rendered,
            frontmatter,
            nostrLinks: metadata.nostrLinks,
            wikilinks: metadata.wikilinks,
            hashtags: metadata.hashtags,
            links: metadata.links,
            media: metadata.media,
        };
    }
}
||||
exports.Parser = Parser; |
||||
/**
 * Convenience wrapper: process content through a one-shot Parser instance
 * built from the given options.
 */
async function process(content, options) {
    return new Parser(options).process(content);
}
||||
@ -0,0 +1,481 @@
@@ -0,0 +1,481 @@
|
||||
import { ParserOptions, NostrLink, Wikilink } from './types'; |
||||
|
||||
/** |
||||
* Extract and process wikilinks, hashtags, and nostr: addresses from HTML |
||||
*/ |
||||
export interface PostProcessResult { |
||||
html: string; |
||||
nostrLinks: NostrLink[]; |
||||
wikilinks: Wikilink[]; |
||||
hashtags: string[]; |
||||
} |
||||
|
||||
/** |
||||
* Post-process HTML to convert wikilinks, hashtags, and nostr: addresses |
||||
* @param skipWikilinksAndHashtags - If true, skip processing wikilinks and hashtags (already processed) |
||||
*/ |
||||
export function postProcess(html: string, options: ParserOptions, skipWikilinksAndHashtags: boolean = false): PostProcessResult { |
||||
let processed = html; |
||||
const nostrLinks: NostrLink[] = []; |
||||
const wikilinks: Wikilink[] = []; |
||||
const hashtags: string[] = []; |
||||
|
||||
// First, mark code blocks to avoid processing inside them
|
||||
const codeBlockMarkers: Array<{ start: number; end: number }> = []; |
||||
const codeBlockRegex = /<(pre|code)[^>]*>[\s\S]*?<\/\1>/gi; |
||||
let match; |
||||
while ((match = codeBlockRegex.exec(html)) !== null) { |
||||
codeBlockMarkers.push({ start: match.index, end: match.index + match[0].length }); |
||||
} |
||||
|
||||
function isInCodeBlock(index: number): boolean { |
||||
return codeBlockMarkers.some(marker => index >= marker.start && index < marker.end); |
||||
} |
||||
|
||||
// Process nostr: addresses (but not in code blocks)
|
||||
if (options.enableNostrAddresses !== false) { |
||||
const nostrRegex = /nostr:([np][a-z0-9]+1[a-z0-9]+)/gi; |
||||
const replacements: Array<{ match: string; replacement: string; index: number }> = []; |
||||
|
||||
while ((match = nostrRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
|
||||
const bech32 = match[1]; |
||||
const type = getNostrType(bech32); |
||||
if (!type) continue; |
||||
|
||||
const link: NostrLink = { |
||||
type, |
||||
id: bech32, |
||||
text: match[0], |
||||
bech32: bech32 |
||||
}; |
||||
nostrLinks.push(link); |
||||
|
||||
const url = options.linkBaseURL
|
||||
? `${options.linkBaseURL}/nostr/${bech32}` |
||||
: `#nostr-${bech32}`; |
||||
|
||||
replacements.push({ |
||||
match: match[0], |
||||
replacement: `<a href="${escapeHtml(url)}" class="nostr-link" data-nostr-type="${type}" data-nostr-id="${escapeHtml(bech32)}">${escapeHtml(match[0])}</a>`, |
||||
index: match.index |
||||
}); |
||||
} |
||||
|
||||
// Apply replacements in reverse order to preserve indices
|
||||
replacements.reverse().forEach(({ match, replacement, index }) => { |
||||
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length); |
||||
}); |
||||
} |
||||
|
||||
// Process wikilinks: [[dtag]] or [[dtag|display]] (but not in code blocks)
|
||||
// Skip if already processed (for AsciiDoc)
|
||||
if (!skipWikilinksAndHashtags) { |
||||
const wikilinkRegex = /\[\[([^\]]+)\]\]/g; |
||||
const wikilinkReplacements: Array<{ match: string; replacement: string; index: number }> = []; |
||||
|
||||
while ((match = wikilinkRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
|
||||
// Skip if already inside a link tag
|
||||
const beforeMatch = processed.substring(0, match.index); |
||||
const lastOpenTag = beforeMatch.lastIndexOf('<a'); |
||||
const lastCloseTag = beforeMatch.lastIndexOf('</a>'); |
||||
if (lastOpenTag > lastCloseTag) continue; // Inside a link
|
||||
|
||||
const content = match[1]; |
||||
const parts = content.split('|'); |
||||
const dtag = parts[0].trim(); |
||||
const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag; |
||||
|
||||
const wikilink: Wikilink = { |
||||
dtag, |
||||
display, |
||||
original: match[0] |
||||
}; |
||||
wikilinks.push(wikilink); |
||||
|
||||
let url: string; |
||||
if (typeof options.wikilinkUrl === 'function') { |
||||
url = options.wikilinkUrl(dtag); |
||||
} else if (typeof options.wikilinkUrl === 'string') { |
||||
url = options.wikilinkUrl.replace('{dtag}', encodeURIComponent(dtag)); |
||||
} else { |
||||
url = options.linkBaseURL
|
||||
? `${options.linkBaseURL}/events?d=${encodeURIComponent(dtag)}` |
||||
: `#${encodeURIComponent(dtag)}`; |
||||
} |
||||
|
||||
wikilinkReplacements.push({ |
||||
match: match[0], |
||||
replacement: `<a href="${escapeHtml(url)}" class="wikilink" data-dtag="${escapeHtml(dtag)}">${escapeHtml(display)}</a>`, |
||||
index: match.index |
||||
}); |
||||
} |
||||
|
||||
// Apply wikilink replacements in reverse order
|
||||
wikilinkReplacements.reverse().forEach(({ match, replacement, index }) => { |
||||
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length); |
||||
}); |
||||
|
||||
// Process hashtags: #hashtag (but not in code blocks or inside HTML tags)
|
||||
const hashtagRegex = /(^|\s|>)(#[\w-]+)/g; |
||||
const hashtagReplacements: Array<{ match: string; replacement: string; index: number }> = []; |
||||
|
||||
while ((match = hashtagRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
|
||||
// Check if we're inside an HTML tag
|
||||
const beforeMatch = processed.substring(0, match.index); |
||||
const lastOpenTag = beforeMatch.lastIndexOf('<'); |
||||
const lastCloseTag = beforeMatch.lastIndexOf('>'); |
||||
if (lastOpenTag > lastCloseTag) continue; // Inside a tag
|
||||
|
||||
// Skip if already inside a link or span
|
||||
const lastLinkOpen = beforeMatch.lastIndexOf('<a'); |
||||
const lastLinkClose = beforeMatch.lastIndexOf('</a>'); |
||||
const lastSpanOpen = beforeMatch.lastIndexOf('<span'); |
||||
const lastSpanClose = beforeMatch.lastIndexOf('</span>'); |
||||
if (lastLinkOpen > lastLinkClose || lastSpanOpen > lastSpanClose) continue; |
||||
|
||||
const hashtag = match[2]; |
||||
const prefix = match[1]; |
||||
const topic = hashtag.substring(1); |
||||
|
||||
if (!hashtags.includes(topic)) { |
||||
hashtags.push(topic); |
||||
} |
||||
|
||||
let url: string | undefined; |
||||
if (typeof options.hashtagUrl === 'function') { |
||||
url = options.hashtagUrl(topic); |
||||
} else if (typeof options.hashtagUrl === 'string') { |
||||
url = options.hashtagUrl.replace('{topic}', encodeURIComponent(topic)); |
||||
} |
||||
|
||||
const replacement = url |
||||
? `${prefix}<a href="${escapeHtml(url)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</a>` |
||||
: `${prefix}<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</span>`; |
||||
|
||||
hashtagReplacements.push({ |
||||
match: match[0], |
||||
replacement, |
||||
index: match.index |
||||
}); |
||||
} |
||||
|
||||
// Apply hashtag replacements in reverse order
|
||||
hashtagReplacements.reverse().forEach(({ match, replacement, index }) => { |
||||
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length); |
||||
}); |
||||
} |
||||
|
||||
// Extract wikilinks and hashtags from already-processed HTML (for AsciiDoc)
|
||||
if (skipWikilinksAndHashtags) { |
||||
// Extract wikilinks from existing links
|
||||
const wikilinkLinkRegex = /<a[^>]+class="wikilink"[^>]+data-dtag="([^"]+)"[^>]*>([^<]+)<\/a>/g; |
||||
while ((match = wikilinkLinkRegex.exec(processed)) !== null) { |
||||
wikilinks.push({ |
||||
dtag: match[1], |
||||
display: match[2], |
||||
original: match[0] |
||||
}); |
||||
} |
||||
|
||||
// Extract hashtags from existing spans/links
|
||||
const hashtagRegex = /<(?:a|span)[^>]+class="hashtag"[^>]+data-topic="([^"]+)"[^>]*>#\1<\/\w+>/g; |
||||
while ((match = hashtagRegex.exec(processed)) !== null) { |
||||
const topic = match[1]; |
||||
if (!hashtags.includes(topic)) { |
||||
hashtags.push(topic); |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Remove links inside code blocks (both <code> and <pre> tags)
|
||||
// This ensures URLs in code blocks remain as plain text
|
||||
const codeBlockLinkRegex = /(<(?:code|pre)[^>]*>)([\s\S]*?)(<\/(?:code|pre)>)/gi; |
||||
processed = processed.replace(codeBlockLinkRegex, (match, openTag, content, closeTag) => { |
||||
// Remove all <a> tags inside code blocks, keeping only the text content
|
||||
const cleanedContent = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1'); |
||||
return openTag + cleanedContent + closeTag; |
||||
}); |
||||
|
||||
// Process YouTube URLs - ORDER IS CRITICAL to avoid double-parsing
|
||||
// 1. FIRST: Fix video tags that contain YouTube URLs (before they get processed as bare URLs)
|
||||
// AsciiDoc's video:: macro creates <video> tags, but YouTube URLs should be iframes
|
||||
const youtubeVideoTagRegex = /<video[^>]+src="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>[\s\S]*?<\/video>/gi; |
||||
processed = processed.replace(youtubeVideoTagRegex, (match, url, videoId) => { |
||||
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`; |
||||
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`; |
||||
}); |
||||
|
||||
// 2. SECOND: Process YouTube links in <a> tags
|
||||
// IMPORTANT: Be very specific with YouTube regex to avoid matching Spotify URLs
|
||||
const youtubeLinkRegex = /<a[^>]+href="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>.*?<\/a>/gi; |
||||
processed = processed.replace(youtubeLinkRegex, (match, url, videoId) => { |
||||
if (isInCodeBlock(processed.indexOf(match))) return match; |
||||
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`; |
||||
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`; |
||||
}); |
||||
|
||||
// 3. THIRD: Fix malformed YouTube iframes from AsciiDoc video:: macro
|
||||
// AsciiDoc sometimes creates iframes with malformed YouTube URLs (watch?v= or shorts/ instead of embed/)
|
||||
// Match the entire iframe element including closing tag to avoid duplicates
|
||||
const malformedYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube[^"]*(?:watch\?v=|shorts\/)([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi; |
||||
processed = processed.replace(malformedYoutubeIframeRegex, (match, videoId) => { |
||||
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`; |
||||
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`; |
||||
}); |
||||
|
||||
// 3.5: Fix YouTube iframes with embed URLs but wrong parameters or missing required attributes
|
||||
// AsciiDoc's video:: macro creates iframes with ?rel=0 or missing allow/referrerpolicy attributes
|
||||
// Match iframes with embed URLs that don't have enablejsapi=1 or are missing required attributes
|
||||
const incompleteYoutubeIframeRegex = /<iframe[^>]+src="https?:\/\/(?:www\.)?youtube\.com\/embed\/([a-zA-Z0-9_-]+)(\?[^"]*)?"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi; |
||||
processed = processed.replace(incompleteYoutubeIframeRegex, (match, videoId, params) => { |
||||
// Check if this iframe already has the correct format (has enablejsapi=1 and required attributes)
|
||||
if (match.includes('enablejsapi=1') &&
|
||||
match.includes('allow=') &&
|
||||
match.includes('referrerpolicy=') && |
||||
match.includes('class="youtube-embed"')) { |
||||
return match; // Already correct, don't modify
|
||||
} |
||||
// Fix the iframe with proper attributes
|
||||
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`; |
||||
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`; |
||||
}); |
||||
|
||||
// 4. FOURTH: Fix any existing YouTube iframes that have malformed embed URLs (AsciiDoc sometimes creates broken embed URLs)
|
||||
// Match the entire iframe element including closing tag to avoid duplicates
|
||||
const brokenYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube\.com\/embed\/[^"]*watch\?v=([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi; |
||||
processed = processed.replace(brokenYoutubeIframeRegex, (match, videoId) => { |
||||
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`; |
||||
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`; |
||||
}); |
||||
|
||||
// 5. LAST: Handle bare YouTube URLs (not in links, video tags, or iframes)
|
||||
// IMPORTANT: Match must be specific to youtube.com or youtu.be to avoid matching Spotify
|
||||
// This must come AFTER processing video tags and links to avoid double-parsing
|
||||
const bareYoutubeRegex = /(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)(?:\?[^"\s<>]*)?)/gi; |
||||
const youtubeReplacements: Array<{ match: string; replacement: string; index: number }> = []; |
||||
while ((match = bareYoutubeRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
|
||||
// Check if it's already in a tag (link, iframe, video, etc.)
|
||||
// Simple approach: check if we're inside quotes (attribute value) or between <tag and >
|
||||
const before = processed.substring(Math.max(0, match.index - 500), match.index); |
||||
const after = processed.substring(match.index, match.index + match[0].length + 100); |
||||
|
||||
// Check if URL is inside quotes (attribute value like src="..." or href="...")
|
||||
const beforeContext = before.substring(Math.max(0, before.length - 100)); |
||||
if (beforeContext.match(/<(iframe|video|a|img|audio|source)[^>]*\s+(src|href)="[^"]*$/i)) { |
||||
continue; // Inside an attribute value, skip
|
||||
} |
||||
|
||||
// Check if we're between an opening tag and its closing bracket
|
||||
const lastOpenTag = before.lastIndexOf('<'); |
||||
const lastCloseBracket = before.lastIndexOf('>'); |
||||
if (lastOpenTag > lastCloseBracket) { |
||||
// We're inside a tag, check what kind
|
||||
const tagContent = before.substring(lastOpenTag); |
||||
if (/<(iframe|video|a|img|audio|source)[^>]*$/i.test(tagContent)) { |
||||
continue; // Skip URLs inside these tags
|
||||
} |
||||
} |
||||
|
||||
const videoId = match[2]; |
||||
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`; |
||||
youtubeReplacements.push({ |
||||
match: match[0], |
||||
replacement: `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`, |
||||
index: match.index |
||||
}); |
||||
} |
||||
youtubeReplacements.reverse().forEach(({ match, replacement, index }) => { |
||||
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length); |
||||
}); |
||||
|
||||
// Fix double-closed iframes (safety net)
|
||||
processed = processed.replace(/<\/iframe><\/iframe>/gi, '</iframe>'); |
||||
|
||||
// Spotify: https://open.spotify.com/episode/ID or https://open.spotify.com/track/ID or https://open.spotify.com/album/ID
|
||||
const spotifyLinkRegex = /<a[^>]+href="(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+))[^"]*"[^>]*>.*?<\/a>/gi; |
||||
processed = processed.replace(spotifyLinkRegex, (match, url, type, id) => { |
||||
if (isInCodeBlock(processed.indexOf(match))) return match; |
||||
const embedUrl = `https://open.spotify.com/embed/${type}/${id}`; |
||||
return `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`; |
||||
}); |
||||
|
||||
// Also handle bare Spotify URLs (not in links)
|
||||
const bareSpotifyRegex = /(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)(?:\?[^"\s<>]*)?)/gi; |
||||
const spotifyReplacements: Array<{ match: string; replacement: string; index: number }> = []; |
||||
while ((match = bareSpotifyRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
// Check if it's already in a tag
|
||||
const before = processed.substring(0, match.index); |
||||
const lastOpenTag = before.lastIndexOf('<'); |
||||
const lastCloseTag = before.lastIndexOf('>'); |
||||
if (lastOpenTag > lastCloseTag) continue; // Inside a tag
|
||||
|
||||
const type = match[2]; |
||||
const id = match[3]; |
||||
const embedUrl = `https://open.spotify.com/embed/${type}/${id}`; |
||||
spotifyReplacements.push({ |
||||
match: match[0], |
||||
replacement: `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`, |
||||
index: match.index |
||||
}); |
||||
} |
||||
spotifyReplacements.reverse().forEach(({ match, replacement, index }) => { |
||||
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length); |
||||
}); |
||||
|
||||
// Process bare image/media URLs that aren't already in tags
|
||||
// First, convert bare links (class="bare") that contain image/video/audio URLs to actual media elements
|
||||
// This handles cases where AsciiDoc has already converted URLs to links
|
||||
// IMPORTANT: Check YouTube FIRST, then Spotify, BEFORE checking file extensions to avoid conflicts
|
||||
const bareLinkRegex = /<a[^>]+href="(https?:\/\/[^"]+)"[^>]*class="[^"]*bare[^"]*"[^>]*>([^<]*)<\/a>/gi; |
||||
processed = processed.replace(bareLinkRegex, (match, url, linkText) => { |
||||
if (isInCodeBlock(processed.indexOf(match))) return match; |
||||
|
||||
// Check YouTube URLs FIRST (be very specific - must be youtube.com or youtu.be)
|
||||
// This prevents accidentally matching Spotify URLs
|
||||
const youtubeMatch = url.match(/https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)/); |
||||
if (youtubeMatch) { |
||||
const videoId = youtubeMatch[1]; |
||||
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`; |
||||
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`; |
||||
} |
||||
|
||||
// Check Spotify URLs (be very specific - must be open.spotify.com)
|
||||
const spotifyMatch = url.match(/https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)/); |
||||
if (spotifyMatch) { |
||||
const type = spotifyMatch[1]; |
||||
const id = spotifyMatch[2]; |
||||
const embedUrl = `https://open.spotify.com/embed/${type}/${id}`; |
||||
return `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`; |
||||
} |
||||
|
||||
// Check if it's an image URL
|
||||
if (/\.(jpg|jpeg|png|gif|webp|svg|bmp)(\?|$)/i.test(url)) { |
||||
return `<img src="${escapeHtml(url)}" alt="${escapeHtml(linkText)}" class="bare-image" />`; |
||||
} |
||||
// Check if it's a video URL (but not YouTube)
|
||||
if (/\.(mp4|webm|ogg|mov|avi)(\?|$)/i.test(url)) { |
||||
return `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`; |
||||
} |
||||
// Check if it's an audio URL (but not Spotify)
|
||||
if (/\.(mp3|wav|ogg|flac|aac|m4a)(\?|$)/i.test(url)) { |
||||
return `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`; |
||||
} |
||||
|
||||
// Not a media URL, return as-is
|
||||
return match; |
||||
}); |
||||
|
||||
// Now process bare URLs that aren't in any tags at all
|
||||
// IMPORTANT: Skip YouTube and Spotify URLs - they're already processed above
|
||||
const imageUrlRegex = /(https?:\/\/[^\s<>"']+\.(jpg|jpeg|png|gif|webp|svg|bmp))(?![^<]*>)/gi; |
||||
const videoUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp4|webm|ogg|mov|avi))(?![^<]*>)/gi; |
||||
const audioUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp3|wav|ogg|flac|aac|m4a))(?![^<]*>)/gi; |
||||
|
||||
// Check if URL is already in a tag
|
||||
function isUrlInTag(url: string, index: number): boolean { |
||||
const before = processed.substring(0, index); |
||||
const after = processed.substring(index); |
||||
|
||||
// Check if it's inside an existing tag
|
||||
const lastOpenTag = before.lastIndexOf('<'); |
||||
const lastCloseTag = before.lastIndexOf('>'); |
||||
if (lastOpenTag > lastCloseTag) { |
||||
const tagContent = processed.substring(lastOpenTag, index + url.length); |
||||
if (/<(img|video|audio|a|source|iframe)[^>]*>/i.test(tagContent)) { |
||||
return true; |
||||
} |
||||
} |
||||
|
||||
return false; |
||||
} |
||||
|
||||
const mediaReplacements: Array<{ match: string; replacement: string; index: number }> = []; |
||||
|
||||
// Process images
|
||||
while ((match = imageUrlRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
if (isUrlInTag(match[0], match.index)) continue; |
||||
|
||||
const url = match[0]; |
||||
mediaReplacements.push({ |
||||
match: url, |
||||
replacement: `<img src="${escapeHtml(url)}" alt="" class="bare-image" />`, |
||||
index: match.index |
||||
}); |
||||
} |
||||
|
||||
// Process videos (but skip YouTube URLs - they're handled above)
|
||||
while ((match = videoUrlRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
if (isUrlInTag(match[0], match.index)) continue; |
||||
// Skip YouTube URLs - they should be embeds, not video tags
|
||||
if (/youtube\.com|youtu\.be/i.test(match[0])) continue; |
||||
|
||||
const url = match[0]; |
||||
mediaReplacements.push({ |
||||
match: url, |
||||
replacement: `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`, |
||||
index: match.index |
||||
}); |
||||
} |
||||
|
||||
// Process audio
|
||||
while ((match = audioUrlRegex.exec(processed)) !== null) { |
||||
if (isInCodeBlock(match.index)) continue; |
||||
if (isUrlInTag(match[0], match.index)) continue; |
||||
|
||||
const url = match[0]; |
||||
mediaReplacements.push({ |
||||
match: url, |
||||
replacement: `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`, |
||||
index: match.index |
||||
}); |
||||
} |
||||
|
||||
// Apply media replacements in reverse order
|
||||
mediaReplacements.reverse().forEach(({ match, replacement, index }) => { |
||||
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length); |
||||
}); |
||||
|
||||
return { |
||||
html: processed, |
||||
nostrLinks, |
||||
wikilinks, |
||||
hashtags |
||||
}; |
||||
} |
||||
|
||||
/** |
||||
* Get Nostr identifier type from bech32 string |
||||
*/ |
||||
function getNostrType(bech32: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null { |
||||
if (bech32.startsWith('npub')) return 'npub'; |
||||
if (bech32.startsWith('nprofile')) return 'nprofile'; |
||||
if (bech32.startsWith('nevent')) return 'nevent'; |
||||
if (bech32.startsWith('naddr')) return 'naddr'; |
||||
if (bech32.startsWith('note')) return 'note'; |
||||
return null; |
||||
} |
||||
|
||||
/** |
||||
* Escape HTML special characters |
||||
*/ |
||||
function escapeHtml(text: string): string { |
||||
const map: Record<string, string> = { |
||||
'&': '&', |
||||
'<': '<', |
||||
'>': '>', |
||||
'"': '"', |
||||
"'": ''' |
||||
}; |
||||
return text.replace(/[&<>"']/g, (m) => map[m]); |
||||
} |
||||
@ -0,0 +1,175 @@
@@ -0,0 +1,175 @@
|
||||
import { ParserOptions, Wikilink } from './types'; |
||||
import * as emoji from 'node-emoji'; |
||||
|
||||
/** |
||||
* Pre-process raw content to handle wikilinks and hashtags before AsciiDoc conversion |
||||
* This prevents AsciiDoc from converting them to anchors or other formats |
||||
*/ |
||||
export interface PreProcessResult {
  // Input content with wikilinks/hashtags swapped for numbered placeholders
  // and :emoji: shortcodes expanded.
  content: string;
  // Wikilinks found, in placeholder-index order (index n corresponds to
  // <WIKILINK_PLACEHOLDER_n>).
  wikilinks: Wikilink[];
  // Unique hashtag topics, without the leading '#'.
  hashtags: string[];
}
||||
|
||||
/**
 * Pre-process raw AsciiDoc content before conversion: expand emoji
 * shortcodes and replace wikilinks/hashtags with numbered placeholders so
 * the AsciiDoc converter does not reinterpret them (e.g. as anchors).
 * The placeholders are later swapped back by restorePlaceholders().
 */
export function preProcessAsciiDoc(content: string, options: ParserOptions): PreProcessResult {
  let processed = content;
  const wikilinks: Wikilink[] = [];
  const hashtags: string[] = [];

  // Process emojis first (expands :shortcode: sequences).
  processed = emoji.emojify(processed);

  // Process wikilinks: [[dtag]] or [[dtag|display]]
  // Replace with a placeholder that AsciiDoc won't touch
  const wikilinkRegex = /\[\[([^\]]+)\]\]/g;
  // NOTE(review): this map is written but never read back — the placeholder
  // index alone is used downstream; confirm the map is still needed.
  const wikilinkPlaceholders: Map<string, Wikilink> = new Map();
  let placeholderCounter = 0;

  processed = processed.replace(wikilinkRegex, (match, content) => {
    const parts = content.split('|');
    const dtag = parts[0].trim();
    // Everything after the first '|' is the display text; further '|'
    // characters are kept verbatim inside it.
    const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag;

    const wikilink: Wikilink = {
      dtag,
      display,
      original: match
    };
    wikilinks.push(wikilink);

    // Use a unique placeholder that won't be processed by AsciiDoc
    // Use angle brackets to avoid AsciiDoc formatting interpretation
    const placeholder = `<WIKILINK_PLACEHOLDER_${placeholderCounter}>`;
    wikilinkPlaceholders.set(placeholder, wikilink);
    placeholderCounter++;

    return placeholder;
  });

  // Process hashtags: #hashtag (but not in code blocks)
  // Mark code blocks first: AsciiDoc [source,...] / [abc] / [plantuml]
  // attribute lines, fenced ``` blocks, and inline `code` spans.
  const codeBlockMarkers: Array<{ start: number; end: number }> = [];
  const codeBlockRegex = /\[source,[^\]]+\]|\[abc\]|\[plantuml\]|```|`[^`]+`/g;
  let match;
  while ((match = codeBlockRegex.exec(processed)) !== null) {
    // Find the end of the code block
    const start = match.index;
    let end = start + match[0].length;

    // For source blocks, find the closing ----
    if (match[0].startsWith('[source')) {
      const afterStart = processed.substring(end);
      // Non-greedy scan up to the next ---- delimiter.
      const closeMatch = afterStart.match(/^[\s\S]*?----/);
      if (closeMatch) {
        end = start + match[0].length + closeMatch[0].length;
      }
    }

    codeBlockMarkers.push({ start, end });
  }

  // True when the given string offset lies inside any recorded code span.
  function isInCodeBlock(index: number): boolean {
    return codeBlockMarkers.some(marker => index >= marker.start && index < marker.end);
  }

  // Process hashtags — same placeholder trick as wikilinks.
  // NOTE(review): like wikilinkPlaceholders above, this map is never read.
  const hashtagPlaceholders: Map<string, string> = new Map();
  let hashtagCounter = 0;

  // Match hashtags at start of line, after whitespace, or after > (for blockquotes)
  const hashtagRegex = /(^|\s|>)(#[\w-]+)/gm;

  processed = processed.replace(hashtagRegex, (match, prefix, hashtag, offset) => {
    if (isInCodeBlock(offset)) return match;

    const topic = hashtag.substring(1);
    // Record each topic only once.
    if (!hashtags.includes(topic)) {
      hashtags.push(topic);
    }

    // Use angle brackets to avoid AsciiDoc formatting interpretation
    const placeholder = `<HASHTAG_PLACEHOLDER_${hashtagCounter}>`;
    hashtagPlaceholders.set(placeholder, topic);
    hashtagCounter++;

    return `${prefix}${placeholder}`;
  });

  return {
    content: processed,
    wikilinks,
    hashtags
  };
}
||||
|
||||
/** |
||||
* Restore wikilinks and hashtags from placeholders in HTML |
||||
*/ |
||||
export function restorePlaceholders( |
||||
html: string, |
||||
wikilinks: Wikilink[], |
||||
hashtags: string[], |
||||
options: ParserOptions |
||||
): string { |
||||
let processed = html; |
||||
|
||||
// Restore wikilinks (handle both escaped and unescaped placeholders)
|
||||
const wikilinkPlaceholderRegex = /<WIKILINK_PLACEHOLDER_(\d+)>|<WIKILINK_PLACEHOLDER_(\d+)>/g; |
||||
processed = processed.replace(wikilinkPlaceholderRegex, (match, escapedIndex, unescapedIndex) => { |
||||
const index = escapedIndex !== undefined ? parseInt(escapedIndex) : parseInt(unescapedIndex); |
||||
const wikilink = wikilinks[index]; |
||||
if (!wikilink) return match; |
||||
|
||||
let url: string; |
||||
if (typeof options.wikilinkUrl === 'function') { |
||||
url = options.wikilinkUrl(wikilink.dtag); |
||||
} else if (typeof options.wikilinkUrl === 'string') { |
||||
url = options.wikilinkUrl.replace('{dtag}', encodeURIComponent(wikilink.dtag)); |
||||
} else { |
||||
url = options.linkBaseURL
|
||||
? `${options.linkBaseURL}/events?d=${encodeURIComponent(wikilink.dtag)}` |
||||
: `#${encodeURIComponent(wikilink.dtag)}`; |
||||
} |
||||
|
||||
return `<a href="${escapeHtml(url)}" class="wikilink" data-dtag="${escapeHtml(wikilink.dtag)}">${escapeHtml(wikilink.display)}</a>`; |
||||
}); |
||||
|
||||
// Restore hashtags (handle both escaped and unescaped placeholders)
|
||||
const hashtagPlaceholderRegex = /<HASHTAG_PLACEHOLDER_(\d+)>|<HASHTAG_PLACEHOLDER_(\d+)>/g; |
||||
processed = processed.replace(hashtagPlaceholderRegex, (match, escapedIndex, unescapedIndex) => { |
||||
const index = escapedIndex !== undefined ? parseInt(escapedIndex) : parseInt(unescapedIndex); |
||||
const topic = hashtags[index]; |
||||
if (!topic) return match; |
||||
|
||||
let url: string | undefined; |
||||
if (typeof options.hashtagUrl === 'function') { |
||||
url = options.hashtagUrl(topic); |
||||
} else if (typeof options.hashtagUrl === 'string') { |
||||
url = options.hashtagUrl.replace('{topic}', encodeURIComponent(topic)); |
||||
} |
||||
|
||||
const hashtag = `#${topic}`; |
||||
if (url) { |
||||
return `<a href="${escapeHtml(url)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</a>`; |
||||
} else { |
||||
return `<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</span>`; |
||||
} |
||||
}); |
||||
|
||||
return processed; |
||||
} |
||||
|
||||
function escapeHtml(text: string): string { |
||||
const map: Record<string, string> = { |
||||
'&': '&', |
||||
'<': '<', |
||||
'>': '>', |
||||
'"': '"', |
||||
"'": ''' |
||||
}; |
||||
return text.replace(/[&<>"']/g, (m) => map[m]); |
||||
} |
||||
@ -1,148 +0,0 @@
@@ -1,148 +0,0 @@
|
||||
"use strict";
// TypeScript-emitted interop helper: wraps a CommonJS module that lacks the
// __esModule marker so `import x from ...` resolves to { default: mod }.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.processAsciidoc = processAsciidoc;
const core_1 = __importDefault(require("@asciidoctor/core"));
const html_utils_1 = require("./html-utils");
const html_postprocess_1 = require("./html-postprocess");
// One shared Asciidoctor processor instance for the whole module.
const asciidoctorInstance = (0, core_1.default)();
||||
/**
 * Processes AsciiDoc content to HTML using AsciiDoctor.
 * Uses AsciiDoctor's built-in highlight.js and LaTeX support.
 *
 * @param {string} content  raw AsciiDoc source
 * @param {object} options  enableCodeHighlighting / enableLaTeX /
 *                          enableMusicalNotation flags plus link/URL options
 * @returns {Promise<object>} processed HTML, TOC, and feature flags; on
 *                            conversion failure falls back to sanitized
 *                            plain text (never throws)
 */
async function processAsciidoc(content, options = {}) {
    const { enableCodeHighlighting = true, enableLaTeX = true, enableMusicalNotation = true, } = options;
    // Check if content starts with level 3+ headers
    // Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
    // If content starts with level 3+, use book doctype
    const firstHeaderMatch = content.match(/^(={1,6})\s+/m);
    let doctype = 'article';
    if (firstHeaderMatch) {
        const firstHeaderLevel = firstHeaderMatch[1].length;
        if (firstHeaderLevel >= 3) {
            doctype = 'book';
        }
    }
    try {
        const result = asciidoctorInstance.convert(content, {
            safe: 'safe',
            backend: 'html5',
            doctype: doctype,
            // NOTE(review): several keys below (e.g. 'tabwidth', 'indent',
            // 'paragraph-rewrite', 'sectidprefix') are not standard
            // Asciidoctor attributes — confirm they are intentional.
            attributes: {
                'showtitle': true,
                'sectanchors': true,
                'sectlinks': true,
                'toc': 'left',
                'toclevels': 6,
                'toc-title': 'Table of Contents',
                'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none',
                'stem': enableLaTeX ? 'latexmath' : 'none',
                'plantuml': 'plantuml', // Enable PlantUML diagram support
                'data-uri': true,
                'imagesdir': '',
                'linkcss': false,
                'stylesheet': '',
                'stylesdir': '',
                'prewrap': true,
                'sectnums': false,
                'sectnumlevels': 6,
                'experimental': true,
                'compat-mode': false,
                'attribute-missing': 'warn',
                'attribute-undefined': 'warn',
                'skip-front-matter': true,
                'source-indent': 0,
                'indent': 0,
                'tabsize': 2,
                'tabwidth': 2,
                'hardbreaks': false,
                'paragraph-rewrite': 'normal',
                'sectids': true,
                'idprefix': '',
                'idseparator': '-',
                'sectidprefix': '',
                'sectidseparator': '-'
            }
        });
        // convert() may return a Document object rather than a string.
        const htmlString = typeof result === 'string' ? result : result.toString();
        // Extract table of contents from HTML
        const { toc, contentWithoutTOC } = (0, html_utils_1.extractTOC)(htmlString);
        // Sanitize HTML to prevent XSS
        const sanitized = (0, html_utils_1.sanitizeHTML)(contentWithoutTOC);
        // Post-process HTML: convert macros to HTML, add styling, etc.
        const processed = (0, html_postprocess_1.postProcessHtml)(sanitized, {
            enableMusicalNotation,
            linkBaseURL: options.linkBaseURL,
            wikilinkUrl: options.wikilinkUrl,
            hashtagUrl: options.hashtagUrl,
        });
        // Process links: add target="_blank" to external links
        const processedWithLinks = options.linkBaseURL
            ? (0, html_utils_1.processLinks)(processed, options.linkBaseURL)
            : processed;
        // Also process TOC through the same sanitize/post-process pipeline.
        const tocSanitized = (0, html_utils_1.sanitizeHTML)(toc);
        const tocProcessed = (0, html_postprocess_1.postProcessHtml)(tocSanitized, {
            enableMusicalNotation: false, // Don't process music in TOC
            linkBaseURL: options.linkBaseURL,
            wikilinkUrl: options.wikilinkUrl,
            hashtagUrl: options.hashtagUrl,
        });
        // Process links in TOC as well
        const tocProcessedWithLinks = options.linkBaseURL
            ? (0, html_utils_1.processLinks)(tocProcessed, options.linkBaseURL)
            : tocProcessed;
        // Check for LaTeX in original content (more reliable than checking HTML)
        const contentToCheck = options.originalContent || content;
        const hasLaTeX = enableLaTeX && hasMathContent(contentToCheck);
        // Check for musical notation in processed HTML
        const hasMusicalNotation = enableMusicalNotation && (/class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/.test(processed));
        return {
            content: processedWithLinks,
            tableOfContents: tocProcessedWithLinks,
            hasLaTeX,
            hasMusicalNotation,
            nostrLinks: [], // Will be populated by metadata extraction
            wikilinks: [],
            hashtags: [],
            links: [],
            media: [],
        };
    }
    catch (error) {
        // Fallback to plain text with error logging
        const errorMessage = error instanceof Error ? error.message : String(error);
        // Use process.stderr.write for Node.js compatibility instead of console.error
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const nodeProcess = globalThis.process;
        if (nodeProcess?.stderr) {
            nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
        }
        // Escape HTML in content for safe display
        const escapedContent = (0, html_utils_1.sanitizeHTML)(content);
        return {
            content: `<p>${escapedContent}</p>`,
            tableOfContents: '',
            hasLaTeX: false,
            hasMusicalNotation: false,
            nostrLinks: [],
            wikilinks: [],
            hashtags: [],
            links: [],
            media: [],
        };
    }
}
||||
/** |
||||
* Check if content has LaTeX math |
||||
* Based on jumble's detection pattern |
||||
*/ |
||||
function hasMathContent(content) { |
||||
// Check for inline math: $...$ or \(...\)
|
||||
const inlineMath = /\$[^$]+\$|\\\([^)]+\\\)/.test(content); |
||||
// Check for block math: $$...$$ or \[...\]
|
||||
const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content); |
||||
return inlineMath || blockMath; |
||||
} |
||||
@ -1,193 +1,56 @@
@@ -1,193 +1,56 @@
|
||||
import { ProcessResult } from '../types'; |
||||
import { extractTOC, sanitizeHTML, processLinks } from './html-utils'; |
||||
import { postProcessHtml } from './html-postprocess'; |
||||
|
||||
// Lazy-load AsciiDoctor instance to avoid issues with Jest module transformation
|
||||
// Use require() for CommonJS modules to avoid Jest transformation issues
|
||||
let asciidoctorInstance: any = null; |
||||
|
||||
function getAsciidoctorInstance() { |
||||
if (!asciidoctorInstance) { |
||||
// Use require() instead of import() to avoid Jest transformation issues with Opal runtime
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||
const asciidoctor = require('@asciidoctor/core'); |
||||
asciidoctorInstance = asciidoctor.default(); |
||||
} |
||||
return asciidoctorInstance; |
||||
} |
||||
|
||||
export interface ProcessOptions { |
||||
enableCodeHighlighting?: boolean; |
||||
enableLaTeX?: boolean; |
||||
enableMusicalNotation?: boolean; |
||||
originalContent?: string; // Original content for LaTeX detection
|
||||
linkBaseURL?: string; // Base URL for link processing
|
||||
wikilinkUrl?: string | ((dtag: string) => string); // Custom URL format for wikilinks
|
||||
hashtagUrl?: string | ((topic: string) => string); // Custom URL format for hashtags
|
||||
import asciidoctor from '@asciidoctor/core'; |
||||
import { ParserOptions } from '../types'; |
||||
import * as emoji from 'node-emoji'; |
||||
|
||||
export interface AsciiDocResult { |
||||
html: string; |
||||
tableOfContents: string; |
||||
hasLaTeX: boolean; |
||||
hasMusicalNotation: boolean; |
||||
} |
||||
|
||||
/** |
||||
* Processes AsciiDoc content to HTML using AsciiDoctor |
||||
* Uses AsciiDoctor's built-in highlight.js and LaTeX support |
||||
* Process AsciiDoc content to HTML |
||||
*/ |
||||
export async function processAsciidoc( |
||||
content: string, |
||||
options: ProcessOptions = {} |
||||
): Promise<ProcessResult> { |
||||
const { |
||||
enableCodeHighlighting = true, |
||||
enableLaTeX = true, |
||||
enableMusicalNotation = true, |
||||
} = options; |
||||
|
||||
// Check if content starts with level 3+ headers
|
||||
// Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
|
||||
// If content starts with level 3+, use book doctype
|
||||
const firstHeaderMatch = content.match(/^(={1,6})\s+/m); |
||||
let doctype: 'article' | 'book' = 'article'; |
||||
export function processAsciiDoc(content: string, options: ParserOptions): AsciiDocResult { |
||||
const hasLaTeX = /\[source,latex\]|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content); |
||||
const hasMusicalNotation = /\[abc\]|\[source,abc\]/i.test(content); |
||||
|
||||
if (firstHeaderMatch) { |
||||
const firstHeaderLevel = firstHeaderMatch[1].length; |
||||
if (firstHeaderLevel >= 3) { |
||||
doctype = 'book'; |
||||
} |
||||
} |
||||
// Process emojis before AsciiDoc conversion
|
||||
const processedContent = emoji.emojify(content); |
||||
|
||||
try { |
||||
const instance = getAsciidoctorInstance(); |
||||
const result = instance.convert(content, { |
||||
safe: 'safe', |
||||
backend: 'html5', |
||||
doctype: doctype, |
||||
const asciidoctorOptions: any = { |
||||
safe: 'unsafe', |
||||
attributes: { |
||||
'showtitle': true, |
||||
'sectanchors': true, |
||||
'sectlinks': true, |
||||
'icons': 'font', |
||||
'source-highlighter': options.enableCodeHighlighting !== false ? 'highlight.js' : undefined, |
||||
'highlightjs-theme': 'github', |
||||
'toc': 'left', |
||||
'toclevels': 6, |
||||
'toc-title': 'Table of Contents', |
||||
'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none', |
||||
'stem': enableLaTeX ? 'latexmath' : 'none', |
||||
'plantuml': 'plantuml', // Enable PlantUML diagram support
|
||||
'data-uri': true, |
||||
'imagesdir': '', |
||||
'linkcss': false, |
||||
'stylesheet': '', |
||||
'stylesdir': '', |
||||
'prewrap': true, |
||||
'sectnums': false, |
||||
'sectnumlevels': 6, |
||||
'experimental': true, |
||||
'compat-mode': false, |
||||
'attribute-missing': 'warn', |
||||
'attribute-undefined': 'warn', |
||||
'skip-front-matter': true, |
||||
'source-indent': 0, |
||||
'indent': 0, |
||||
'tabsize': 2, |
||||
'tabwidth': 2, |
||||
'hardbreaks': false, |
||||
'paragraph-rewrite': 'normal', |
||||
'sectids': true, |
||||
'idprefix': '', |
||||
'idseparator': '-', |
||||
'sectidprefix': '', |
||||
'sectidseparator': '-' |
||||
'sectanchors': true, |
||||
'sectlinks': true, |
||||
'idprefix': '_', |
||||
'idseparator': '_' |
||||
} |
||||
}); |
||||
|
||||
const htmlString = typeof result === 'string' ? result : result.toString(); |
||||
|
||||
// Extract table of contents from HTML
|
||||
const { toc, contentWithoutTOC } = extractTOC(htmlString); |
||||
|
||||
// Sanitize HTML to prevent XSS
|
||||
const sanitized = sanitizeHTML(contentWithoutTOC); |
||||
|
||||
// Post-process HTML: convert macros to HTML, add styling, etc.
|
||||
const processed = postProcessHtml(sanitized, { |
||||
enableMusicalNotation, |
||||
linkBaseURL: options.linkBaseURL, |
||||
wikilinkUrl: options.wikilinkUrl, |
||||
hashtagUrl: options.hashtagUrl, |
||||
}); |
||||
|
||||
// Process links: add target="_blank" to external links
|
||||
const processedWithLinks = options.linkBaseURL
|
||||
? processLinks(processed, options.linkBaseURL) |
||||
: processed; |
||||
|
||||
// Also process TOC
|
||||
const tocSanitized = sanitizeHTML(toc); |
||||
const tocProcessed = postProcessHtml(tocSanitized, { |
||||
enableMusicalNotation: false, // Don't process music in TOC
|
||||
linkBaseURL: options.linkBaseURL, |
||||
wikilinkUrl: options.wikilinkUrl, |
||||
hashtagUrl: options.hashtagUrl, |
||||
}); |
||||
}; |
||||
|
||||
// Process links in TOC as well
|
||||
const tocProcessedWithLinks = options.linkBaseURL |
||||
? processLinks(tocProcessed, options.linkBaseURL) |
||||
: tocProcessed; |
||||
// Convert to HTML
|
||||
const Asciidoctor = asciidoctor(); |
||||
const htmlResult = Asciidoctor.convert(processedContent, asciidoctorOptions); |
||||
const html = typeof htmlResult === 'string' ? htmlResult : htmlResult.toString(); |
||||
|
||||
// Check for LaTeX in original content (more reliable than checking HTML)
|
||||
const contentToCheck = options.originalContent || content; |
||||
const hasLaTeX = enableLaTeX && hasMathContent(contentToCheck); |
||||
// Extract table of contents if present
|
||||
const tocMatch = html.match(/<div id="toc"[^>]*>([\s\S]*?)<\/div>/); |
||||
const tableOfContents = tocMatch ? tocMatch[1] : ''; |
||||
|
||||
// Check for musical notation in processed HTML
|
||||
const hasMusicalNotation = enableMusicalNotation && ( |
||||
/class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/.test(processed) |
||||
); |
||||
// Remove TOC from main content if present
|
||||
const contentWithoutToc = html.replace(/<div id="toc"[^>]*>[\s\S]*?<\/div>/, ''); |
||||
|
||||
return { |
||||
content: processedWithLinks, |
||||
tableOfContents: tocProcessedWithLinks, |
||||
html: contentWithoutToc, |
||||
tableOfContents, |
||||
hasLaTeX, |
||||
hasMusicalNotation, |
||||
nostrLinks: [], // Will be populated by metadata extraction
|
||||
wikilinks: [], |
||||
hashtags: [], |
||||
links: [], |
||||
media: [], |
||||
}; |
||||
} catch (error) { |
||||
// Fallback to plain text with error logging
|
||||
const errorMessage = error instanceof Error ? error.message : String(error); |
||||
// Use process.stderr.write for Node.js compatibility instead of console.error
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const nodeProcess = (globalThis as any).process; |
||||
if (nodeProcess?.stderr) { |
||||
nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`); |
||||
} |
||||
|
||||
// Escape HTML in content for safe display
|
||||
const escapedContent = sanitizeHTML(content); |
||||
|
||||
return { |
||||
content: `<p>${escapedContent}</p>`, |
||||
tableOfContents: '', |
||||
hasLaTeX: false, |
||||
hasMusicalNotation: false, |
||||
nostrLinks: [], |
||||
wikilinks: [], |
||||
hashtags: [], |
||||
links: [], |
||||
media: [], |
||||
hasMusicalNotation |
||||
}; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Check if content has LaTeX math |
||||
* Based on jumble's detection pattern |
||||
*/ |
||||
function hasMathContent(content: string): boolean { |
||||
// Check for inline math: $...$ or \(...\)
|
||||
const inlineMath = /\$[^$]+\$|\\\([^)]+\\\)/.test(content); |
||||
|
||||
// Check for block math: $$...$$ or \[...\]
|
||||
const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content); |
||||
|
||||
return inlineMath || blockMath; |
||||
} |
||||
|
||||
@ -1,693 +0,0 @@
@@ -1,693 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.postProcessHtml = postProcessHtml; |
||||
const music_1 = require("./music"); |
||||
/** |
||||
* Post-processes HTML output from AsciiDoctor |
||||
* Converts AsciiDoc macros to HTML with data attributes and CSS classes |
||||
*/ |
||||
function postProcessHtml(html, options = {}) { |
||||
let processed = html; |
||||
// Convert bookstr markers to HTML placeholders
|
||||
processed = processed.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => { |
||||
const escaped = bookContent.replace(/"/g, '"').replace(/'/g, '''); |
||||
return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`; |
||||
}); |
||||
// Convert hashtag links to HTML
|
||||
processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => { |
||||
// HTML escape the display text
|
||||
const escapedDisplay = displayText |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
// If hashtagUrl is configured, make it a clickable link
|
||||
if (options.hashtagUrl) { |
||||
let url; |
||||
if (typeof options.hashtagUrl === 'function') { |
||||
url = options.hashtagUrl(normalizedHashtag); |
||||
} |
||||
else { |
||||
// String template with {topic} placeholder
|
||||
url = options.hashtagUrl.replace(/{topic}/g, normalizedHashtag); |
||||
} |
||||
// Escape URL for HTML attribute
|
||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
||||
return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${normalizedHashtag.replace(/"/g, '"')}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
||||
} |
||||
else { |
||||
// Default: Use span instead of <a> tag - same color as links but no underline and not clickable
|
||||
return `<span class="hashtag-link">${escapedDisplay}</span>`; |
||||
} |
||||
}); |
||||
// Convert WIKILINK:dtag|display placeholder format to HTML
|
||||
// Match WIKILINK:dtag|display, ensuring we don't match across HTML tags
|
||||
processed = processed.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => { |
||||
const escapedDtag = dTag.trim().replace(/"/g, '"'); |
||||
const escapedDisplay = displayText.trim() |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
// Generate URL using custom format or default
|
||||
let url; |
||||
if (options.wikilinkUrl) { |
||||
if (typeof options.wikilinkUrl === 'function') { |
||||
url = options.wikilinkUrl(dTag.trim()); |
||||
} |
||||
else { |
||||
// String template with {dtag} placeholder
|
||||
url = options.wikilinkUrl.replace(/{dtag}/g, dTag.trim()); |
||||
} |
||||
} |
||||
else { |
||||
// Default format
|
||||
url = `/events?d=${escapedDtag}`; |
||||
} |
||||
// Escape URL for HTML attribute
|
||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
||||
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
||||
}); |
||||
// Convert any leftover link: macros that AsciiDoctor didn't convert
|
||||
// This MUST run before processOpenGraphLinks which removes "link:" prefixes
|
||||
// This handles cases where AsciiDoctor couldn't parse the link (e.g., link text with special chars)
|
||||
// Pattern: link:url[text] where url is http/https and text can contain any characters
|
||||
// Match link: macros that are still in the HTML as plain text (not converted by AsciiDoctor)
|
||||
// Also handle HTML-escaped versions that might appear
|
||||
processed = processed.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => { |
||||
// Unescape if already HTML-escaped (but be careful not to unescape actual content)
|
||||
let unescapedUrl = url; |
||||
// Only unescape if it looks like it was escaped (contains & or ")
|
||||
if (url.includes('&') || url.includes('"') || url.includes(''')) { |
||||
unescapedUrl = url |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
} |
||||
let unescapedText = text; |
||||
// Only unescape if it looks like it was escaped
|
||||
if (text.includes('&') || text.includes('<') || text.includes('>') || text.includes('"') || text.includes(''')) { |
||||
unescapedText = text |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
} |
||||
// Escape URL for HTML attribute (fresh escape, no double-escaping)
|
||||
const escapedUrl = unescapedUrl |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
// Escape text content for HTML (fresh escape, no double-escaping)
|
||||
const escapedText = unescapedText |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
// Check if link text contains wss:// or ws:// - these are relay URLs, don't add OpenGraph
|
||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
||||
if (isRelayUrl) { |
||||
// Simple link without OpenGraph wrapper
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
} |
||||
else { |
||||
// Regular link - will be processed by OpenGraph handler if external
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
} |
||||
}); |
||||
// Convert nostr: links to HTML
|
||||
processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => { |
||||
const nostrType = getNostrType(bech32Id); |
||||
if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') { |
||||
// Render as embedded event placeholder
|
||||
const escaped = bech32Id.replace(/"/g, '"'); |
||||
return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`; |
||||
} |
||||
else if (nostrType === 'npub' || nostrType === 'nprofile') { |
||||
// Render as user handle
|
||||
const escaped = bech32Id.replace(/"/g, '"'); |
||||
return `<span class="user-handle" data-pubkey="${escaped}">@${displayText}</span>`; |
||||
} |
||||
else { |
||||
// Fallback to regular link
|
||||
const escaped = bech32Id.replace(/"/g, '"'); |
||||
return `<a href="nostr:${bech32Id}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${displayText}</a>`; |
||||
} |
||||
}); |
||||
// Process media URLs (YouTube, Spotify, video, audio)
|
||||
processed = processMedia(processed); |
||||
// Fix double-escaped quotes in href attributes FIRST (before any other processing)
|
||||
// This fixes href=""url"" -> href="url"
|
||||
processed = processed.replace(/href\s*=\s*["']"(https?:\/\/[^"']+)"["']/gi, (_match, url) => { |
||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
||||
return `href="${escapedUrl}"`; |
||||
}); |
||||
// Process OpenGraph links (external links that should have rich previews)
|
||||
processed = processOpenGraphLinks(processed, options.linkBaseURL); |
||||
// Process images: add max-width styling and data attributes
|
||||
processed = processImages(processed); |
||||
// Process musical notation if enabled
|
||||
if (options.enableMusicalNotation) { |
||||
processed = (0, music_1.processMusicalNotation)(processed); |
||||
} |
||||
// Clean up any escaped HTML that appears as text (e.g., <a href=...>)
|
||||
// This can happen when AsciiDoctor escapes link macros that it couldn't parse
|
||||
// Pattern: <a href="url">text</a> should be converted to actual HTML
|
||||
// Use a more flexible pattern that handles text with special characters like ://
|
||||
// Fix regular escaped HTML links
|
||||
processed = processed.replace(/<a\s+href=["'](https?:\/\/[^"']+)["']\s*>([^<]+)<\/a>/gi, (_match, url, text) => { |
||||
// Unescape the URL and text
|
||||
const unescapedUrl = url |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
const unescapedText = text |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>'); |
||||
// Re-escape properly for HTML
|
||||
const escapedUrl = unescapedUrl |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
const escapedText = unescapedText |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>'); |
||||
// Check if link text contains wss:// or ws:// - these are relay URLs
|
||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
||||
if (isRelayUrl) { |
||||
// Simple link without OpenGraph wrapper
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
} |
||||
else { |
||||
// Regular link
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
} |
||||
}); |
||||
// Clean up any leftover markdown syntax
|
||||
processed = cleanupMarkdown(processed); |
||||
// Add styling classes
|
||||
processed = addStylingClasses(processed); |
||||
// Hide raw ToC text
|
||||
processed = hideRawTocText(processed); |
||||
return processed; |
||||
} |
||||
/** |
||||
* Get Nostr identifier type |
||||
*/ |
||||
function getNostrType(id) { |
||||
if (id.startsWith('npub')) |
||||
return 'npub'; |
||||
if (id.startsWith('nprofile')) |
||||
return 'nprofile'; |
||||
if (id.startsWith('nevent')) |
||||
return 'nevent'; |
||||
if (id.startsWith('naddr')) |
||||
return 'naddr'; |
||||
if (id.startsWith('note')) |
||||
return 'note'; |
||||
return null; |
||||
} |
||||
/** |
||||
* Process media URLs (YouTube, Spotify, video, audio) |
||||
* Converts MEDIA: placeholders to HTML embeds/players |
||||
*/ |
||||
function processMedia(html) { |
||||
let processed = html; |
||||
// Process YouTube embeds
|
||||
processed = processed.replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, (_match, videoId) => { |
||||
const escapedId = videoId.replace(/"/g, '"'); |
||||
return `<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">
|
||||
<iframe
|
||||
style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
|
||||
src="https://www.youtube.com/embed/${escapedId}"
|
||||
frameborder="0"
|
||||
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
|
||||
allowfullscreen |
||||
loading="lazy"> |
||||
</iframe> |
||||
</div>`; |
||||
}); |
||||
// Process Spotify embeds
|
||||
processed = processed.replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, (_match, type, id) => { |
||||
const escapedType = type.replace(/"/g, '"'); |
||||
const escapedId = id.replace(/"/g, '"'); |
||||
return `<div class="media-embed spotify-embed" style="margin: 1rem 0;">
|
||||
<iframe
|
||||
style="border-radius: 12px; width: 100%; max-width: 100%;"
|
||||
src="https://open.spotify.com/embed/${escapedType}/${escapedId}?utm_source=generator"
|
||||
width="100%"
|
||||
height="352"
|
||||
frameborder="0"
|
||||
allowfullscreen=""
|
||||
allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"
|
||||
loading="lazy"> |
||||
</iframe> |
||||
</div>`; |
||||
}); |
||||
// Process video files
|
||||
processed = processed.replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => { |
||||
const escapedUrl = url |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
return `<div class="media-embed video-embed" style="margin: 1rem 0;">
|
||||
<video
|
||||
controls
|
||||
preload="metadata"
|
||||
style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;" |
||||
class="media-player"> |
||||
<source src="${escapedUrl}" type="video/mp4"> |
||||
Your browser does not support the video tag. |
||||
</video> |
||||
</div>`; |
||||
}); |
||||
// Process audio files
|
||||
processed = processed.replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => { |
||||
const escapedUrl = url |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
return `<div class="media-embed audio-embed" style="margin: 1rem 0;">
|
||||
<audio
|
||||
controls
|
||||
preload="metadata"
|
||||
style="width: 100%; max-width: 100%;" |
||||
class="media-player"> |
||||
<source src="${escapedUrl}"> |
||||
Your browser does not support the audio tag. |
||||
</audio> |
||||
</div>`; |
||||
}); |
||||
return processed; |
||||
} |
||||
/** |
||||
* Process OpenGraph links - mark external links for OpenGraph preview fetching |
||||
*/ |
||||
function processOpenGraphLinks(html, linkBaseURL) { |
||||
// First, clean up any corrupted HTML fragments that might interfere
|
||||
// Remove "link:" prefixes that appear before links (AsciiDoc syntax that shouldn't be in HTML)
|
||||
// This happens when AsciiDoctor doesn't fully convert link:url[text] syntax or when
|
||||
// there's literal text like "should render like link:" before an anchor tag
|
||||
let processed = html; |
||||
// Remove "link:" that appears immediately before anchor tags (most common case)
|
||||
// Match "link:" followed by optional whitespace and then <a
|
||||
processed = processed.replace(/link:\s*<a/gi, '<a'); |
||||
// Remove "link:" that appears as plain text in HTML (shouldn't be there)
|
||||
// Be careful not to match "link:" inside HTML attributes or tags
|
||||
// Match "link:" that's not inside quotes or tags
|
||||
processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2'); |
||||
// Also handle cases where "link:" appears with whitespace before anchor tags
|
||||
processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' '); |
||||
// Clean up any corrupted href attributes that contain HTML fragments or double-escaped quotes
|
||||
// Fix href attributes with escaped quotes: href=""url"" -> href="url"
|
||||
processed = processed.replace(/href\s*=\s*["']"(https?:\/\/[^"']+)"["']/gi, (match, url) => { |
||||
// Extract the clean URL and properly escape it
|
||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
||||
return `href="${escapedUrl}"`; |
||||
}); |
||||
// Clean up href attributes that contain HTML fragments
|
||||
processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => { |
||||
// If href contains HTML tags, extract just the URL part
|
||||
const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i); |
||||
if (urlMatch) { |
||||
const escapedUrl = urlMatch[1].replace(/"/g, '"').replace(/'/g, '''); |
||||
return `href="${escapedUrl}"`; |
||||
} |
||||
return match; // If we can't fix it, leave it (will be skipped by validation)
|
||||
}); |
||||
// Clean up any malformed anchor tag fragments that might cause issues
|
||||
processed = processed.replace(/<a\s+href=["']([^"'>]*<[^"'>]*)["']/gi, (match, corruptedHref) => { |
||||
// Skip corrupted anchor tags - they'll be handled by the main regex with validation
|
||||
return match; |
||||
}); |
||||
// Clean up links inside code blocks - AsciiDoctor creates them but they should be plain text
|
||||
// Remove <a> tags inside <code> blocks, keeping only the link text
|
||||
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match, content) => { |
||||
// Remove any <a> tags inside code blocks, keeping only the text content
|
||||
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1'); |
||||
return `<code>${cleaned}</code>`; |
||||
}); |
||||
// Also clean up links inside pre blocks
|
||||
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match, content) => { |
||||
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1'); |
||||
return `<pre>${cleaned}</pre>`; |
||||
}); |
||||
// Now protect code blocks and pre blocks by replacing them with placeholders
|
||||
const codeBlockPlaceholders = []; |
||||
const preBlockPlaceholders = []; |
||||
// Replace pre blocks first (they can contain code blocks)
|
||||
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => { |
||||
const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`; |
||||
preBlockPlaceholders.push(match); |
||||
return placeholder; |
||||
}); |
||||
// Replace code blocks
|
||||
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => { |
||||
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`; |
||||
codeBlockPlaceholders.push(match); |
||||
return placeholder; |
||||
}); |
||||
// Extract base domain from linkBaseURL if provided
|
||||
let baseDomain = null; |
||||
if (linkBaseURL) { |
||||
try { |
||||
const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/); |
||||
if (urlMatch) { |
||||
baseDomain = urlMatch[1]; |
||||
} |
||||
} |
||||
catch { |
||||
// Ignore parsing errors
|
||||
} |
||||
} |
||||
// Before processing, remove any corrupted opengraph containers that might have been created
|
||||
// These have malformed data-og-url attributes containing HTML fragments
|
||||
// Match all spans with data-og-url and check if they're corrupted
|
||||
// Use a pattern that matches spans with data-og-url, then check the attribute value
|
||||
processed = processed.replace(/<span[^>]*data-og-url=["']([^"']+)["'][^>]*>[\s\S]*?<\/span>/gi, (match) => { |
||||
// This span has a corrupted data-og-url (contains <)
|
||||
// Extract the clean URL from the beginning of the attribute value
|
||||
const dataOgUrlMatch = match.match(/data-og-url=["']([^"']+)["']/i); |
||||
if (dataOgUrlMatch && dataOgUrlMatch[1]) { |
||||
// Extract just the URL part (everything before the first <)
|
||||
const urlMatch = dataOgUrlMatch[1].match(/(https?:\/\/[^\s<>"']+)/i); |
||||
if (urlMatch) { |
||||
const cleanUrl = urlMatch[1]; |
||||
// Extract the link text from inside the span
|
||||
const linkMatch = match.match(/<a[^>]*>(.*?)<\/a>/i); |
||||
const linkText = linkMatch ? linkMatch[1] : cleanUrl; |
||||
// Return a clean opengraph container with the fixed URL
|
||||
const escapedUrl = cleanUrl.replace(/"/g, '"').replace(/'/g, '''); |
||||
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
|
||||
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a> |
||||
<div class="opengraph-preview" data-og-loading="true" style="display: none;"> |
||||
<div class="opengraph-card"> |
||||
<div class="opengraph-image-container"> |
||||
<img class="opengraph-image" src="" alt="" style="display: none;" /> |
||||
</div> |
||||
<div class="opengraph-content"> |
||||
<div class="opengraph-site"></div> |
||||
<div class="opengraph-title"></div> |
||||
<div class="opengraph-description"></div> |
||||
</div> |
||||
</div> |
||||
</div> |
||||
</span>`; |
||||
} |
||||
// If we can't extract a clean URL, just remove the corrupted span and keep any text
|
||||
const textMatch = match.match(/>([^<]+)</); |
||||
return textMatch ? textMatch[1] : ''; |
||||
} |
||||
return match; // Keep valid spans
|
||||
}); |
||||
// Match external links (http/https) that aren't media, nostr, or wikilinks
|
||||
// Skip links that are already in media embeds or special containers
|
||||
// Use a stricter regex that only matches valid, complete anchor tags
|
||||
// The regex must match a complete <a> tag with proper structure
|
||||
processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => { |
||||
// CRITICAL: Validate href FIRST - if it contains ANY HTML tags or fragments, skip immediately
|
||||
// This prevents corrupted HTML from being created
|
||||
if (!href) { |
||||
return match; // Skip if no href
|
||||
} |
||||
// Skip if href contains HTML tags or looks corrupted - be very strict
|
||||
// Check for common HTML fragments that indicate corruption
|
||||
if (href.includes('<') || href.includes('>') || href.includes('href=') || href.includes('</a>') || href.includes('<a') || href.includes('"') || href.includes("'")) { |
||||
return match; // Skip if href looks corrupted
|
||||
} |
||||
// Additional validation: href should only contain URL-safe characters
|
||||
// URLs shouldn't contain unescaped quotes or HTML tags
|
||||
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) { |
||||
return match; // Skip if href doesn't match clean URL pattern
|
||||
} |
||||
// Validate href is a proper URL (starts with http:// or https:// and doesn't contain invalid chars)
|
||||
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) { |
||||
return match; // Skip if href doesn't match URL pattern
|
||||
} |
||||
// Skip if the match contains unclosed tags or corrupted HTML
|
||||
const openATags = (match.match(/<a\s/g) || []).length; |
||||
const closeATags = (match.match(/<\/a>/g) || []).length; |
||||
if (openATags !== closeATags || openATags !== 1) { |
||||
return match; // Multiple or mismatched <a> tags = corrupted
|
||||
} |
||||
// Skip if match contains nested HTML that looks corrupted
|
||||
if (match.includes('href="') && match.split('href="').length > 2) { |
||||
return match; // Multiple href attributes = corrupted
|
||||
} |
||||
// Skip if it's already a media embed, nostr link, wikilink, or opengraph link
|
||||
if (match.includes('class="wikilink"') || |
||||
match.includes('class="nostr-link"') || |
||||
match.includes('class="opengraph-link"') || |
||||
match.includes('data-embedded-note') || |
||||
match.includes('youtube-embed') || |
||||
match.includes('spotify-embed') || |
||||
match.includes('media-embed') || |
||||
match.includes('opengraph-link-container')) { |
||||
return match; |
||||
} |
||||
// Skip if it's a media file URL
|
||||
if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) { |
||||
return match; |
||||
} |
||||
// Skip if it's YouTube or Spotify (already handled as media)
|
||||
if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) { |
||||
return match; |
||||
} |
||||
// Skip if link text contains wss:// or ws:// - these are relay URLs, not web pages
|
||||
// They don't need OpenGraph previews
|
||||
if (/wss?:\/\//i.test(linkText)) { |
||||
return match; |
||||
} |
||||
// Check if it's an external link (not same domain)
|
||||
let isExternal = true; |
||||
if (baseDomain) { |
||||
try { |
||||
const hrefMatch = href.match(/^https?:\/\/([^\/]+)/); |
||||
if (hrefMatch && hrefMatch[1] === baseDomain) { |
||||
isExternal = false; |
||||
} |
||||
} |
||||
catch { |
||||
// If parsing fails, assume external
|
||||
} |
||||
} |
||||
// Only process external links
|
||||
if (!isExternal) { |
||||
return match; |
||||
} |
||||
// Escape the URL for data attribute
|
||||
const escapedUrl = href |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
// Add data attribute for OpenGraph fetching and wrap in container
|
||||
// The actual OpenGraph fetching will be done client-side via JavaScript
|
||||
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
|
||||
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a> |
||||
<div class="opengraph-preview" data-og-loading="true" style="display: none;"> |
||||
<div class="opengraph-card"> |
||||
<div class="opengraph-image-container"> |
||||
<img class="opengraph-image" src="" alt="" style="display: none;" /> |
||||
</div> |
||||
<div class="opengraph-content"> |
||||
<div class="opengraph-site"></div> |
||||
<div class="opengraph-title"></div> |
||||
<div class="opengraph-description"></div> |
||||
</div> |
||||
</div> |
||||
</div> |
||||
</span>`; |
||||
}); |
||||
// Restore code blocks
|
||||
codeBlockPlaceholders.forEach((codeBlock, index) => { |
||||
processed = processed.replace(`__CODEBLOCK_${index}__`, codeBlock); |
||||
}); |
||||
// Restore pre blocks
|
||||
preBlockPlaceholders.forEach((preBlock, index) => { |
||||
processed = processed.replace(`__PREBLOCK_${index}__`, preBlock); |
||||
}); |
||||
return processed; |
||||
} |
||||
/**
 * Process images: add max-width styling, a zoom cursor, and data attributes
 * (stable index + source URL) consumed by the client-side image viewer.
 *
 * @param {string} html - Rendered HTML that may contain <img> tags.
 * @returns {string} HTML with every <img> tag's attributes augmented.
 */
function processImages(html) {
    // First pass: collect unique image URLs in document order so each image
    // gets a stable index for gallery navigation.
    const imageUrls = [];
    const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
    let match;
    while ((match = imageUrlRegex.exec(html)) !== null) {
        const url = match[1];
        if (url && !imageUrls.includes(url)) {
            imageUrls.push(url);
        }
    }
    // Second pass: rewrite each tag in place.
    return html.replace(/<img([^>]+)>/gi, (imgTag, attributes) => {
        const srcMatch = attributes.match(/src=["']([^"']+)["']/i);
        if (!srcMatch)
            return imgTag;
        const src = srcMatch[1];
        const currentIndex = imageUrls.indexOf(src);
        let updatedAttributes = attributes;
        if (updatedAttributes.match(/class=["']/i)) {
            // Merge with the existing class list, dropping any stale
            // max-w-* utility so ours wins.
            updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => {
                const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
                const newClasses = cleanedClasses
                    ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
                    : 'max-w-[400px] object-contain cursor-zoom-in';
                return `class="${newClasses}"`;
            });
        }
        else {
            updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`;
        }
        // BUG FIX: the quote-escaping call was a corrupted no-op ('"' -> '"');
        // escape double quotes as &quot; so the data attribute cannot be
        // broken out of by a quote in the URL.
        updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '&quot;')}"`;
        return `<img${updatedAttributes}>`;
    });
}
||||
/**
 * Clean URL by removing tracking parameters.
 * Based on jumble's cleanUrl function.
 *
 * @param {string} url - Absolute URL to strip.
 * @returns {string} The URL without known tracking parameters, or the
 *   original string unchanged when it cannot be parsed as a URL.
 */
function cleanUrl(url) {
    try {
        const parsedUrl = new URL(url);
        // Exact parameter names to remove, grouped by tracker.
        // (FIX: removed duplicated 'mc_cid'/'mc_eid' entries that appeared
        // under both the Microsoft/Bing and Mailchimp groups.)
        const trackingParams = [
            // Google Analytics & Ads
            'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
            'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
            'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
            // Facebook
            'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
            // Twitter/X
            'twclid', 'twsrc',
            // Microsoft/Bing
            'msclkid',
            // Adobe
            'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
            // Mailchimp
            'mc_cid', 'mc_eid',
            // HubSpot
            'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
            // Marketo
            'mkt_tok',
            // YouTube
            'si', 'feature', 'kw', 'pp',
            // Other common tracking
            'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
            'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
            // Mobile app tracking
            'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
            // Amazon
            'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
            // Affiliate tracking
            'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
            // Social media share tracking
            'share', 'shared', 'sharesource'
        ];
        // Remove all known tracking parameters.
        trackingParams.forEach(param => {
            parsedUrl.searchParams.delete(param);
        });
        // Catch-all: remove any parameter that starts with utm_ or _.
        Array.from(parsedUrl.searchParams.keys()).forEach(key => {
            if (key.startsWith('utm_') || key.startsWith('_')) {
                parsedUrl.searchParams.delete(key);
            }
        });
        return parsedUrl.toString();
    }
    catch {
        // Not a parseable absolute URL; leave it untouched.
        return url;
    }
}
||||
/**
 * Clean up leftover markdown syntax that survived conversion.
 * Converts raw markdown images and links into HTML, skipping anything that
 * already appears inside a rendered anchor.
 *
 * FIX: this block's HTML-entity escape/unescape calls were corrupted into
 * no-ops (e.g. '"' -> '"' and a literal ''' that did not even parse); the
 * proper entity strings are restored below.
 *
 * @param {string} html - Post-conversion HTML.
 * @returns {string} HTML with stray markdown image/link syntax converted.
 */
function cleanupMarkdown(html) {
    let cleaned = html;
    // Markdown images: ![alt](url) -> <img>.
    cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
        const altText = alt || '';
        // Strip tracking parameters before embedding.
        const cleanedUrl = cleanUrl(url);
        // Escape quotes for use inside an HTML attribute.
        const escapedUrl = cleanedUrl.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
        return `<img src="${escapedUrl}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`;
    });
    // Markdown links: [text](url) -> <a>.
    // Skip if the link is already inside an HTML tag or is part of escaped HTML.
    cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
        // Skip if this URL was already rendered into an href somewhere.
        if (cleaned.includes(`href="${url}"`) || cleaned.includes(`href='${url}'`)) {
            return _match;
        }
        // Skip text that looks like it has already been HTML-processed.
        if (text.includes('<') || text.includes('>') || text.includes('&')) {
            return _match;
        }
        // Also check for the attribute-escaped form of the URL.
        const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
        if (cleaned.includes(`href="${escapedUrl}"`) || cleaned.includes(`href='${escapedUrl}'`)) {
            return _match;
        }
        // Clean URL (remove tracking parameters).
        const cleanedUrl = cleanUrl(url);
        // Escape for the href attribute without double-escaping entities.
        const finalEscapedUrl = cleanedUrl
            .replace(/&amp;/g, '&') // Unescape if already escaped
            .replace(/&/g, '&amp;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
        // Escape the link text for HTML content (again, without double-escaping).
        const escapedText = text
            .replace(/&amp;/g, '&') // Unescape if already escaped
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
        return `<a href="${finalEscapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
    });
    return cleaned;
}
||||
/**
 * Add presentation CSS classes to converter output: strikethrough,
 * sub/superscript decorations, and highlight.js hooks on code blocks.
 *
 * @param {string} html - Converted HTML.
 * @returns {string} HTML with styling classes applied.
 */
function addStylingClasses(html) {
    // Text decorations: line-through, subscript, superscript spans.
    let styled = html
        .replace(/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>')
        .replace(/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>')
        .replace(/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>');
    // Normalize highlight.js class hooks on pre/code blocks.
    styled = styled
        .replace(/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">')
        .replace(/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">');
    return styled;
}
||||
/**
 * Hide raw AsciiDoc ToC text that leaked into the rendered output.
 *
 * @param {string} html - Rendered HTML.
 * @returns {string} HTML with leftover ToC headings/paragraphs removed.
 */
function hideRawTocText(html) {
    // Patterns for ToC leftovers: headings/paragraphs containing
    // "Table of Contents (N)" and "Assumptions ... [n=0]" fragments.
    const leftovers = [
        /<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi,
        /<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi,
        /<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi
    ];
    let cleaned = html;
    for (const pattern of leftovers) {
        cleaned = cleaned.replace(pattern, '');
    }
    return cleaned;
}
||||
@ -1,599 +0,0 @@
@@ -1,599 +0,0 @@
|
||||
import { processMusicalNotation } from './music'; |
||||
|
||||
/**
 * Options accepted by `postProcessHtml`.
 */
export interface PostProcessOptions {
  /** When true, run the musical-notation processing step. */
  enableMusicalNotation?: boolean;
  /** Base URL of the current site; anchors on this domain are treated as internal and skipped by OpenGraph processing. */
  linkBaseURL?: string;
  /** Custom URL format for wikilinks */
  wikilinkUrl?: string | ((dtag: string) => string);
  /** Custom URL format for hashtags */
  hashtagUrl?: string | ((topic: string) => string);
}
||||
|
||||
/** |
||||
* Post-processes HTML output from AsciiDoctor |
||||
*
|
||||
* Processing order (critical for correct rendering): |
||||
* 1. Convert placeholders to HTML (BOOKSTR, hashtags, wikilinks, nostr links, media, link macros) |
||||
* 2. Fix corrupted HTML (double-escaped quotes, escaped HTML as text, broken links) |
||||
* 3. Process OpenGraph links (external links with previews) |
||||
* 4. Process images (add styling) |
||||
* 5. Process musical notation |
||||
* 6. Clean up leftover markdown syntax |
||||
* 7. Add styling classes |
||||
* 8. Hide raw ToC text |
||||
*/ |
||||
export function postProcessHtml(html: string, options: PostProcessOptions = {}): string { |
||||
let processed = html; |
||||
|
||||
// ============================================
|
||||
// STEP 1: Convert placeholders to HTML
|
||||
// ============================================
|
||||
processed = convertBookstrMarkers(processed); |
||||
processed = convertHashtags(processed, options); |
||||
processed = convertWikilinks(processed, options); |
||||
processed = convertNostrLinks(processed); |
||||
processed = convertMediaPlaceholders(processed); |
||||
processed = convertLinkMacros(processed); |
||||
|
||||
// ============================================
|
||||
// STEP 2: Fix corrupted HTML
|
||||
// ============================================
|
||||
processed = fixDoubleEscapedQuotes(processed); |
||||
processed = fixEscapedHtmlLinks(processed); |
||||
processed = fixBrokenLinkPatterns(processed); |
||||
|
||||
// ============================================
|
||||
// STEP 3: Process OpenGraph links
|
||||
// ============================================
|
||||
processed = processOpenGraphLinks(processed, options.linkBaseURL); |
||||
|
||||
// ============================================
|
||||
// STEP 4: Process images
|
||||
// ============================================
|
||||
processed = processImages(processed); |
||||
|
||||
// ============================================
|
||||
// STEP 5: Process musical notation
|
||||
// ============================================
|
||||
if (options.enableMusicalNotation) { |
||||
processed = processMusicalNotation(processed); |
||||
} |
||||
|
||||
// ============================================
|
||||
// STEP 6: Clean up leftover markdown
|
||||
// ============================================
|
||||
processed = cleanupMarkdown(processed); |
||||
|
||||
// ============================================
|
||||
// STEP 7: Add styling classes
|
||||
// ============================================
|
||||
processed = addStylingClasses(processed); |
||||
|
||||
// ============================================
|
||||
// STEP 8: Hide raw ToC text
|
||||
// ============================================
|
||||
processed = hideRawTocText(processed); |
||||
|
||||
return processed; |
||||
} |
||||
|
||||
// ============================================
|
||||
// STEP 1: Convert placeholders to HTML
|
||||
// ============================================
|
||||
|
||||
/** |
||||
* Convert BOOKSTR markers to HTML placeholders |
||||
*/ |
||||
function convertBookstrMarkers(html: string): string { |
||||
return html.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => { |
||||
const escaped = escapeHtmlAttr(bookContent); |
||||
return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`; |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Convert hashtag placeholders to HTML |
||||
*/ |
||||
function convertHashtags(html: string, options: PostProcessOptions): string { |
||||
return html.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => { |
||||
const escapedDisplay = escapeHtml(displayText); |
||||
|
||||
if (options.hashtagUrl) { |
||||
let url: string; |
||||
if (typeof options.hashtagUrl === 'function') { |
||||
url = options.hashtagUrl(normalizedHashtag); |
||||
} else { |
||||
url = options.hashtagUrl.replace(/{topic}/g, normalizedHashtag); |
||||
} |
||||
|
||||
const escapedUrl = escapeHtmlAttr(url); |
||||
const escapedTopic = escapeHtmlAttr(normalizedHashtag); |
||||
|
||||
return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${escapedTopic}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
||||
} else { |
||||
return `<span class="hashtag-link">${escapedDisplay}</span>`; |
||||
} |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Convert wikilink placeholders to HTML |
||||
*/ |
||||
function convertWikilinks(html: string, options: PostProcessOptions): string { |
||||
return html.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => { |
||||
const escapedDtag = escapeHtmlAttr(dTag.trim()); |
||||
const escapedDisplay = escapeHtml(displayText.trim()); |
||||
|
||||
let url: string; |
||||
if (options.wikilinkUrl) { |
||||
if (typeof options.wikilinkUrl === 'function') { |
||||
url = options.wikilinkUrl(dTag.trim()); |
||||
} else { |
||||
url = options.wikilinkUrl.replace(/{dtag}/g, dTag.trim()); |
||||
} |
||||
} else { |
||||
url = `/events?d=${escapedDtag}`; |
||||
} |
||||
|
||||
const escapedUrl = escapeHtmlAttr(url); |
||||
|
||||
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Convert nostr: links to HTML |
||||
*/ |
||||
function convertNostrLinks(html: string): string { |
||||
return html.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => { |
||||
const nostrType = getNostrType(bech32Id); |
||||
const escaped = escapeHtmlAttr(bech32Id); |
||||
const escapedDisplay = escapeHtml(displayText); |
||||
|
||||
if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') { |
||||
return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`; |
||||
} else if (nostrType === 'npub' || nostrType === 'nprofile') { |
||||
return `<span class="user-handle" data-pubkey="${escaped}">@${escapedDisplay}</span>`; |
||||
} else { |
||||
return `<a href="nostr:${bech32Id}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${escapedDisplay}</a>`; |
||||
} |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Get Nostr identifier type |
||||
*/ |
||||
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null { |
||||
if (id.startsWith('npub')) return 'npub'; |
||||
if (id.startsWith('nprofile')) return 'nprofile'; |
||||
if (id.startsWith('nevent')) return 'nevent'; |
||||
if (id.startsWith('naddr')) return 'naddr'; |
||||
if (id.startsWith('note')) return 'note'; |
||||
return null; |
||||
} |
||||
|
||||
/**
 * Convert media placeholders emitted earlier in the pipeline into HTML embeds.
 *
 * Handles four placeholder families:
 *   MEDIA:youtube:<videoId>      -> responsive YouTube iframe
 *   MEDIA:spotify:<type>:<id>    -> Spotify embed iframe
 *   MEDIA:video:<https url>      -> <video> player
 *   MEDIA:audio:<https url>      -> <audio> player
 */
function convertMediaPlaceholders(html: string): string {
  let processed = html;

  // YouTube embeds — the 56.25% bottom padding keeps a 16:9 aspect ratio.
  processed = processed.replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, (_match, videoId) => {
    const escapedId = escapeHtmlAttr(videoId);
    return `<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">
      <iframe
        style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
        src="https://www.youtube.com/embed/${escapedId}"
        frameborder="0"
        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
        allowfullscreen
        loading="lazy">
      </iframe>
    </div>`;
  });

  // Spotify embeds (track/album/playlist/artist/episode/show).
  processed = processed.replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, (_match, type, id) => {
    const escapedType = escapeHtmlAttr(type);
    const escapedId = escapeHtmlAttr(id);
    return `<div class="media-embed spotify-embed" style="margin: 1rem 0;">
      <iframe
        style="border-radius: 12px; width: 100%; max-width: 100%;"
        src="https://open.spotify.com/embed/${escapedType}/${escapedId}?utm_source=generator"
        width="100%"
        height="352"
        frameborder="0"
        allowfullscreen=""
        allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"
        loading="lazy">
      </iframe>
    </div>`;
  });

  // Video file URLs — preload only metadata to avoid eager downloads.
  processed = processed.replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
    const escapedUrl = escapeHtmlAttr(url);
    return `<div class="media-embed video-embed" style="margin: 1rem 0;">
      <video
        controls
        preload="metadata"
        style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;"
        class="media-player">
        <source src="${escapedUrl}" type="video/mp4">
        Your browser does not support the video tag.
      </video>
    </div>`;
  });

  // Audio file URLs.
  processed = processed.replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
    const escapedUrl = escapeHtmlAttr(url);
    return `<div class="media-embed audio-embed" style="margin: 1rem 0;">
      <audio
        controls
        preload="metadata"
        style="width: 100%; max-width: 100%;"
        class="media-player">
        <source src="${escapedUrl}">
        Your browser does not support the audio tag.
      </audio>
    </div>`;
  });

  return processed;
}
||||
|
||||
/** |
||||
* Convert link: macros that AsciiDoctor didn't convert |
||||
* This handles cases where AsciiDoctor couldn't parse the link (e.g., link text with special chars) |
||||
*/ |
||||
function convertLinkMacros(html: string): string { |
||||
return html.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => { |
||||
// Unescape if already HTML-escaped
|
||||
const unescapedUrl = unescapeHtml(url); |
||||
const unescapedText = unescapeHtml(text); |
||||
|
||||
// Re-escape properly for HTML
|
||||
const escapedUrl = escapeHtmlAttr(unescapedUrl); |
||||
const escapedText = escapeHtml(unescapedText); |
||||
|
||||
// Check if link text contains wss:// or ws:// - these are relay URLs, don't add OpenGraph
|
||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
||||
|
||||
// Create link (OpenGraph processing will handle it later if needed)
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
}); |
||||
} |
||||
|
||||
// ============================================
|
||||
// STEP 2: Fix corrupted HTML
|
||||
// ============================================
|
||||
|
||||
/** |
||||
* Fix double-escaped quotes in href attributes: href=""url"" -> href="url" |
||||
*/ |
||||
function fixDoubleEscapedQuotes(html: string): string { |
||||
return html.replace(/href\s*=\s*["']"(https?:\/\/[^"']+)"["']/gi, (_match, url) => { |
||||
const escapedUrl = escapeHtmlAttr(url); |
||||
return `href="${escapedUrl}"`; |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Fix escaped HTML links: <a href="...">text</a> -> <a href="...">text</a> |
||||
*/ |
||||
function fixEscapedHtmlLinks(html: string): string { |
||||
return html.replace(/<a\s+href=["'](https?:\/\/[^"']+)["']\s*>([^<]+)<\/a>/gi, (_match, url, text) => { |
||||
const unescapedUrl = unescapeHtml(url); |
||||
const unescapedText = unescapeHtml(text); |
||||
|
||||
const escapedUrl = escapeHtmlAttr(unescapedUrl); |
||||
const escapedText = escapeHtml(unescapedText); |
||||
|
||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
||||
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
}); |
||||
} |
||||
|
||||
/** |
||||
* Fix broken link patterns where attributes appear as text before escaped HTML |
||||
* Pattern: " target=...><a href=...>text</a> |
||||
*/ |
||||
function fixBrokenLinkPatterns(html: string): string { |
||||
return html.replace(/"\s+target=["'][^"']*["']\s+rel=["'][^"']*["']\s+class=["'][^"']*["']\s*><a\s+href=["'](https?:\/\/[^"']+)["']\s*>([^<]+)<\/a>/gi, (_match, url, text) => { |
||||
const unescapedUrl = unescapeHtml(url); |
||||
const unescapedText = unescapeHtml(text); |
||||
|
||||
const escapedUrl = escapeHtmlAttr(unescapedUrl); |
||||
const escapedText = escapeHtml(unescapedText); |
||||
|
||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
||||
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
}); |
||||
} |
||||
|
||||
// ============================================
|
||||
// STEP 3: Process OpenGraph links
|
||||
// ============================================
|
||||
|
||||
/**
 * Process OpenGraph links — mark external links for OpenGraph preview fetching.
 *
 * Each qualifying external anchor is wrapped in an
 * `.opengraph-link-container` span carrying a `data-og-url` attribute plus a
 * hidden preview card; client-side code later fetches the OpenGraph metadata
 * and fills the card in.
 *
 * @param html Rendered HTML to scan.
 * @param linkBaseURL Optional site base URL; anchors whose hostname matches
 *   it are treated as internal and left untouched.
 */
function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
  let processed = html;

  // Remove stray "link:" macro prefixes that survived conversion.
  processed = processed.replace(/link:\s*<a/gi, '<a');
  processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2');
  processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' ');

  // Clean up corrupted href attributes: if a tag leaked into the href,
  // salvage the first plain URL found inside it; otherwise leave it alone.
  processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => {
    const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i);
    if (urlMatch) {
      const escapedUrl = escapeHtmlAttr(urlMatch[1]);
      return `href="${escapedUrl}"`;
    }
    return match;
  });

  // Protect <pre> and <code> contents from the anchor rewriting below by
  // swapping each block for an indexed placeholder (restored at the end).
  const codeBlockPlaceholders: string[] = [];
  const preBlockPlaceholders: string[] = [];

  processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => {
    const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`;
    preBlockPlaceholders.push(match);
    return placeholder;
  });

  processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => {
    const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
    codeBlockPlaceholders.push(match);
    return placeholder;
  });

  // Extract the site's hostname for the internal-link check below.
  let baseDomain: string | null = null;
  if (linkBaseURL) {
    const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
    if (urlMatch) {
      baseDomain = urlMatch[1];
    }
  }

  // Rewrite qualifying external anchors (href length capped at 2048 chars).
  processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => {
    // Validate href: reject missing, tag-containing, or otherwise unclean URLs.
    if (!href || href.includes('<') || href.includes('>') || !/^https?:\/\/[^\s<>"']+$/i.test(href)) {
      return match;
    }

    // Skip anchors already produced by earlier pipeline steps.
    if (match.includes('class="wikilink"') ||
        match.includes('class="nostr-link"') ||
        match.includes('class="opengraph-link"') ||
        match.includes('data-embedded-note') ||
        match.includes('media-embed') ||
        match.includes('opengraph-link-container')) {
      return match;
    }

    // Skip direct links to media files.
    if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) {
      return match;
    }

    // Skip YouTube/Spotify (already handled as media embeds).
    if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) {
      return match;
    }

    // Skip if link text contains wss:// or ws:// — these are relay URLs,
    // not web pages, so they get no OpenGraph preview.
    if (/wss?:\/\//i.test(linkText)) {
      return match;
    }

    // Same-domain anchors are internal; leave them alone.
    let isExternal = true;
    if (baseDomain) {
      const hrefMatch = href.match(/^https?:\/\/([^\/]+)/);
      if (hrefMatch && hrefMatch[1] === baseDomain) {
        isExternal = false;
      }
    }

    if (!isExternal) {
      return match;
    }

    // Wrap in the OpenGraph container: the anchor plus a hidden preview
    // card that client-side JavaScript populates.
    const escapedUrl = escapeHtmlAttr(href);
    return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
      <a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
      <div class="opengraph-preview" data-og-loading="true" style="display: none;">
        <div class="opengraph-card">
          <div class="opengraph-image-container">
            <img class="opengraph-image" src="" alt="" style="display: none;" />
          </div>
          <div class="opengraph-content">
            <div class="opengraph-site"></div>
            <div class="opengraph-title"></div>
            <div class="opengraph-description"></div>
          </div>
        </div>
      </div>
    </span>`;
  });

  // Restore code blocks.
  codeBlockPlaceholders.forEach((codeBlock, index) => {
    processed = processed.replace(`__CODEBLOCK_${index}__`, codeBlock);
  });

  // Restore pre blocks.
  preBlockPlaceholders.forEach((preBlock, index) => {
    processed = processed.replace(`__PREBLOCK_${index}__`, preBlock);
  });

  return processed;
}
||||
|
||||
// ============================================
|
||||
// STEP 4: Process images
|
||||
// ============================================
|
||||
|
||||
/** |
||||
* Process images: add max-width styling and data attributes |
||||
*/ |
||||
function processImages(html: string): string { |
||||
const imageUrls: string[] = []; |
||||
const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi; |
||||
let match; |
||||
|
||||
while ((match = imageUrlRegex.exec(html)) !== null) { |
||||
const url = match[1]; |
||||
if (url && !imageUrls.includes(url)) { |
||||
imageUrls.push(url); |
||||
} |
||||
} |
||||
|
||||
return html.replace(/<img([^>]+)>/gi, (imgTag, attributes) => { |
||||
const srcMatch = attributes.match(/src=["']([^"']+)["']/i); |
||||
if (!srcMatch) return imgTag; |
||||
|
||||
const src = srcMatch[1]; |
||||
const currentIndex = imageUrls.indexOf(src); |
||||
|
||||
let updatedAttributes = attributes; |
||||
|
||||
if (updatedAttributes.match(/class=["']/i)) { |
||||
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => { |
||||
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim(); |
||||
const newClasses = cleanedClasses
|
||||
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in` |
||||
: 'max-w-[400px] object-contain cursor-zoom-in'; |
||||
return `class="${newClasses}"`; |
||||
}); |
||||
} else { |
||||
updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`; |
||||
} |
||||
|
||||
updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${escapeHtmlAttr(src)}"`; |
||||
|
||||
return `<img${updatedAttributes}>`; |
||||
}); |
||||
} |
||||
|
||||
// ============================================
|
||||
// STEP 6: Clean up leftover markdown
|
||||
// ============================================
|
||||
|
||||
/** |
||||
* Clean up leftover markdown syntax |
||||
*/ |
||||
function cleanupMarkdown(html: string): string { |
||||
let cleaned = html; |
||||
|
||||
// Clean up markdown image syntax
|
||||
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => { |
||||
const altText = alt || ''; |
||||
const escapedUrl = escapeHtmlAttr(url); |
||||
return `<img src="${escapedUrl}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`; |
||||
}); |
||||
|
||||
// Clean up markdown link syntax (skip if already HTML)
|
||||
cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => { |
||||
// Skip if already processed
|
||||
if (cleaned.includes(`href="${url}"`) || cleaned.includes(`href='${url}'`)) { |
||||
return _match; |
||||
} |
||||
|
||||
if (text.includes('<') || text.includes('>') || text.includes('&')) { |
||||
return _match; |
||||
} |
||||
|
||||
const escapedUrl = escapeHtmlAttr(url); |
||||
const escapedText = escapeHtml(text); |
||||
|
||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
||||
}); |
||||
|
||||
return cleaned; |
||||
} |
||||
|
||||
// ============================================
|
||||
// STEP 7: Add styling classes
|
||||
// ============================================
|
||||
|
||||
/** |
||||
* Add proper CSS classes for styling |
||||
*/ |
||||
function addStylingClasses(html: string): string { |
||||
let styled = html; |
||||
|
||||
styled = styled.replace(/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>'); |
||||
styled = styled.replace(/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>'); |
||||
styled = styled.replace(/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>'); |
||||
styled = styled.replace(/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">'); |
||||
styled = styled.replace(/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">'); |
||||
|
||||
return styled; |
||||
} |
||||
|
||||
// ============================================
|
||||
// STEP 8: Hide raw ToC text
|
||||
// ============================================
|
||||
|
||||
/** |
||||
* Hide raw AsciiDoc ToC text |
||||
*/ |
||||
function hideRawTocText(html: string): string { |
||||
let cleaned = html; |
||||
|
||||
cleaned = cleaned.replace(/<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi, ''); |
||||
cleaned = cleaned.replace(/<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi, ''); |
||||
cleaned = cleaned.replace(/<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi, ''); |
||||
|
||||
return cleaned; |
||||
} |
||||
|
||||
// ============================================
|
||||
// Utility functions
|
||||
// ============================================
|
||||
|
||||
/** |
||||
* Escape HTML content |
||||
*/ |
||||
function escapeHtml(text: string): string { |
||||
return text |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
} |
||||
|
||||
/** |
||||
* Escape HTML attribute value |
||||
*/ |
||||
function escapeHtmlAttr(text: string): string { |
||||
return text |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, '''); |
||||
} |
||||
|
||||
/** |
||||
* Unescape HTML entities |
||||
*/ |
||||
function unescapeHtml(text: string): string { |
||||
return text |
||||
.replace(/&/g, '&') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
} |
||||
@ -1,239 +0,0 @@
@@ -1,239 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.extractTOC = extractTOC; |
||||
exports.sanitizeHTML = sanitizeHTML; |
||||
exports.processLinks = processLinks; |
||||
/** |
||||
* Extracts the table of contents from AsciiDoc HTML output |
||||
* Returns the TOC HTML and the content HTML without the TOC |
||||
*/ |
||||
function extractTOC(html) { |
||||
// AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
|
||||
let tocContent = ''; |
||||
let contentWithoutTOC = html; |
||||
// Find the start of the TOC div - try multiple patterns
|
||||
const tocStartPatterns = [ |
||||
/<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>/i, |
||||
/<div\s+id=["']toc["'][^>]*>/i, |
||||
/<div\s+class=["']toc["'][^>]*>/i, |
||||
/<nav\s+id=["']toc["'][^>]*>/i, |
||||
]; |
||||
let tocStartIdx = -1; |
||||
let tocStartTag = ''; |
||||
for (const pattern of tocStartPatterns) { |
||||
const match = html.match(pattern); |
||||
if (match && match.index !== undefined) { |
||||
tocStartIdx = match.index; |
||||
tocStartTag = match[0]; |
||||
break; |
||||
} |
||||
} |
||||
if (tocStartIdx === -1) { |
||||
// No TOC found
|
||||
return { toc: '', contentWithoutTOC: html }; |
||||
} |
||||
// Find the matching closing tag by counting div/nav tags
|
||||
const searchStart = tocStartIdx + tocStartTag.length; |
||||
let depth = 1; |
||||
let i = searchStart; |
||||
while (i < html.length && depth > 0) { |
||||
// Look for opening or closing div/nav tags
|
||||
if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<div') { |
||||
// Check if it's a closing tag
|
||||
if (i + 5 < html.length && html[i + 4] === '/') { |
||||
depth--; |
||||
const closeIdx = html.indexOf('>', i); |
||||
if (closeIdx === -1) |
||||
break; |
||||
i = closeIdx + 1; |
||||
} |
||||
else { |
||||
// Opening tag - find the end (handle attributes and self-closing)
|
||||
const closeIdx = html.indexOf('>', i); |
||||
if (closeIdx === -1) |
||||
break; |
||||
// Check if it's self-closing (look for /> before the >)
|
||||
const tagContent = html.substring(i, closeIdx); |
||||
if (!tagContent.endsWith('/')) { |
||||
depth++; |
||||
} |
||||
i = closeIdx + 1; |
||||
} |
||||
} |
||||
else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</div') { |
||||
depth--; |
||||
const closeIdx = html.indexOf('>', i); |
||||
if (closeIdx === -1) |
||||
break; |
||||
i = closeIdx + 1; |
||||
} |
||||
else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</nav') { |
||||
depth--; |
||||
const closeIdx = html.indexOf('>', i); |
||||
if (closeIdx === -1) |
||||
break; |
||||
i = closeIdx + 1; |
||||
} |
||||
else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<nav') { |
||||
// Handle opening nav tags
|
||||
const closeIdx = html.indexOf('>', i); |
||||
if (closeIdx === -1) |
||||
break; |
||||
const tagContent = html.substring(i, closeIdx); |
||||
if (!tagContent.endsWith('/')) { |
||||
depth++; |
||||
} |
||||
i = closeIdx + 1; |
||||
} |
||||
else { |
||||
i++; |
||||
} |
||||
} |
||||
if (depth === 0) { |
||||
// Found the matching closing tag
|
||||
const tocEndIdx = i; |
||||
// Extract the TOC content (inner HTML)
|
||||
const tocFullHTML = html.substring(tocStartIdx, tocEndIdx); |
||||
// Extract just the inner content (without the outer div tags)
|
||||
let innerStart = tocStartTag.length; |
||||
let innerEnd = tocFullHTML.length; |
||||
// Find the last </div> or </nav>
|
||||
if (tocFullHTML.endsWith('</div>')) { |
||||
innerEnd -= 6; |
||||
} |
||||
else if (tocFullHTML.endsWith('</nav>')) { |
||||
innerEnd -= 7; |
||||
} |
||||
tocContent = tocFullHTML.substring(innerStart, innerEnd).trim(); |
||||
// Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
|
||||
tocContent = tocContent.replace(/<div\s+id=["']toctitle["'][^>]*>.*?<\/div>\s*/gis, ''); |
||||
tocContent = tocContent.trim(); |
||||
// Remove the TOC from the content
|
||||
contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx); |
||||
} |
||||
// Extract just the body content if the HTML includes full document structure
|
||||
// AsciiDoctor might return full HTML with <html>, <head>, <body> tags
|
||||
// Check if this is a full HTML document
|
||||
const isFullDocument = /^\s*<!DOCTYPE|^\s*<html/i.test(contentWithoutTOC); |
||||
if (isFullDocument) { |
||||
// Extract body content using a more robust approach
|
||||
// Find the opening <body> tag
|
||||
const bodyStartMatch = contentWithoutTOC.match(/<body[^>]*>/i); |
||||
if (bodyStartMatch && bodyStartMatch.index !== undefined) { |
||||
const bodyStart = bodyStartMatch.index + bodyStartMatch[0].length; |
||||
// Find the closing </body> tag by searching backwards from the end
|
||||
// This is more reliable than regex for nested content
|
||||
const bodyEndMatch = contentWithoutTOC.lastIndexOf('</body>'); |
||||
if (bodyEndMatch !== -1 && bodyEndMatch > bodyStart) { |
||||
contentWithoutTOC = contentWithoutTOC.substring(bodyStart, bodyEndMatch).trim(); |
||||
} |
||||
} |
||||
} |
||||
// Remove any remaining document structure tags that might have slipped through
|
||||
contentWithoutTOC = contentWithoutTOC |
||||
.replace(/<html[^>]*>/gi, '') |
||||
.replace(/<\/html>/gi, '') |
||||
.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '') |
||||
.replace(/<body[^>]*>/gi, '') |
||||
.replace(/<\/body>/gi, ''); |
||||
// Clean up any extra whitespace
|
||||
contentWithoutTOC = contentWithoutTOC.trim(); |
||||
return { toc: tocContent, contentWithoutTOC }; |
||||
} |
||||
/** |
||||
* Performs basic HTML sanitization to prevent XSS |
||||
*/ |
||||
function sanitizeHTML(html) { |
||||
// Remove script tags and their content
|
||||
html = html.replace(/<script[^>]*>.*?<\/script>/gis, ''); |
||||
// Remove event handlers (onclick, onerror, etc.)
|
||||
html = html.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, ''); |
||||
// Remove javascript: protocol in links
|
||||
html = html.replace(/javascript:/gi, ''); |
||||
// Remove data: URLs that could be dangerous
|
||||
html = html.replace(/data:\s*text\/html/gi, ''); |
||||
return html; |
||||
} |
||||
/** |
||||
* Processes HTML links to add target="_blank" to external links |
||||
* This function is available for use but not currently called automatically. |
||||
* It can be used in post-processing if needed. |
||||
*/ |
||||
function processLinks(html, linkBaseURL) { |
||||
// Extract domain from linkBaseURL for comparison
|
||||
let linkBaseDomain = ''; |
||||
if (linkBaseURL) { |
||||
try { |
||||
// Use URL constructor if available (Node.js 10+)
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const URLConstructor = globalThis.URL; |
||||
if (URLConstructor) { |
||||
const url = new URLConstructor(linkBaseURL); |
||||
linkBaseDomain = url.hostname; |
||||
} |
||||
else { |
||||
throw new Error('URL not available'); |
||||
} |
||||
} |
||||
catch { |
||||
// Fallback to simple string parsing if URL constructor fails
|
||||
const url = linkBaseURL.replace(/^https?:\/\//, ''); |
||||
const parts = url.split('/'); |
||||
if (parts.length > 0) { |
||||
linkBaseDomain = parts[0]; |
||||
} |
||||
} |
||||
} |
||||
// Regex to match <a> tags with href attributes
|
||||
const linkRegex = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g; |
||||
return html.replace(linkRegex, (match, before, href, after) => { |
||||
// Check if it's an external link (starts with http:// or https://)
|
||||
const isExternal = href.startsWith('http://') || href.startsWith('https://'); |
||||
if (isExternal) { |
||||
// Check if it's pointing to our own domain
|
||||
if (linkBaseDomain) { |
||||
try { |
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const URLConstructor = globalThis.URL; |
||||
if (URLConstructor) { |
||||
const hrefUrl = new URLConstructor(href); |
||||
if (hrefUrl.hostname === linkBaseDomain) { |
||||
// Same domain - open in same tab (remove any existing target attribute)
|
||||
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); |
||||
} |
||||
} |
||||
else { |
||||
throw new Error('URL not available'); |
||||
} |
||||
} |
||||
catch { |
||||
// If URL parsing fails, use simple string check
|
||||
if (href.includes(linkBaseDomain)) { |
||||
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); |
||||
} |
||||
} |
||||
} |
||||
// External link - add target="_blank" and rel="noopener noreferrer" if not already present
|
||||
if (!match.includes('target=')) { |
||||
if (!match.includes('rel=')) { |
||||
return match.replace('>', ' target="_blank" rel="noopener noreferrer">'); |
||||
} |
||||
else { |
||||
// Update existing rel attribute to include noopener if not present
|
||||
const updatedMatch = match.replace(/rel\s*=\s*["']([^"']*)["']/gi, (relMatch, relValue) => { |
||||
if (!relValue.includes('noopener')) { |
||||
return `rel="${relValue} noopener noreferrer"`; |
||||
} |
||||
return relMatch; |
||||
}); |
||||
return updatedMatch.replace('>', ' target="_blank">'); |
||||
} |
||||
} |
||||
} |
||||
else { |
||||
// Local/relative link - ensure it opens in same tab (remove target if present)
|
||||
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); |
||||
} |
||||
return match; |
||||
}); |
||||
} |
||||
@ -1,164 +0,0 @@
@@ -1,164 +0,0 @@
|
||||
/** |
||||
* HTML utility functions for processing AsciiDoctor output |
||||
*
|
||||
* Functions: |
||||
* - extractTOC: Extract table of contents from HTML |
||||
* - sanitizeHTML: Sanitize HTML to prevent XSS attacks |
||||
* - processLinks: Add target="_blank" to external links |
||||
*/ |
||||
|
||||
export interface TOCResult { |
||||
toc: string; |
||||
contentWithoutTOC: string; |
||||
} |
||||
|
||||
/** |
||||
* Extract table of contents from AsciiDoctor HTML output |
||||
* AsciiDoctor generates a <div id="toc"> with class="toc" containing the TOC |
||||
*/ |
||||
export function extractTOC(html: string): TOCResult { |
||||
// Match the TOC div - AsciiDoctor generates it with id="toc" and class="toc"
|
||||
const tocMatch = html.match(/<div[^>]*id=["']toc["'][^>]*>([\s\S]*?)<\/div>/i); |
||||
|
||||
if (tocMatch) { |
||||
const toc = tocMatch[0]; // Full TOC div
|
||||
const contentWithoutTOC = html.replace(toc, '').trim(); |
||||
return { toc, contentWithoutTOC }; |
||||
} |
||||
|
||||
// Fallback: try to match by class="toc"
|
||||
const tocClassMatch = html.match(/<div[^>]*class=["'][^"']*toc[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); |
||||
|
||||
if (tocClassMatch) { |
||||
const toc = tocClassMatch[0]; |
||||
const contentWithoutTOC = html.replace(toc, '').trim(); |
||||
return { toc, contentWithoutTOC }; |
||||
} |
||||
|
||||
// No TOC found
|
||||
return { |
||||
toc: '', |
||||
contentWithoutTOC: html, |
||||
}; |
||||
} |
||||
|
||||
/** |
||||
* Sanitize HTML to prevent XSS attacks |
||||
* Removes dangerous scripts and event handlers while preserving safe HTML |
||||
*
|
||||
* This is a basic sanitizer. For production use, consider using a library like DOMPurify |
||||
*/ |
||||
export function sanitizeHTML(html: string): string { |
||||
let sanitized = html; |
||||
|
||||
// Remove script tags and their content
|
||||
sanitized = sanitized.replace(/<script[\s\S]*?<\/script>/gi, ''); |
||||
|
||||
// Remove event handlers from attributes (onclick, onerror, etc.)
|
||||
sanitized = sanitized.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, ''); |
||||
sanitized = sanitized.replace(/\s*on\w+\s*=\s*[^\s>]*/gi, ''); |
||||
|
||||
// Remove javascript: protocol in href and src attributes
|
||||
sanitized = sanitized.replace(/href\s*=\s*["']javascript:[^"']*["']/gi, 'href="#"'); |
||||
sanitized = sanitized.replace(/src\s*=\s*["']javascript:[^"']*["']/gi, 'src=""'); |
||||
|
||||
// Remove data: URLs that might contain scripts (allow images)
|
||||
// This is more permissive - you might want to be stricter
|
||||
sanitized = sanitized.replace(/src\s*=\s*["']data:text\/html[^"']*["']/gi, 'src=""'); |
||||
|
||||
// Remove iframe with dangerous sources
|
||||
sanitized = sanitized.replace(/<iframe[^>]*src\s*=\s*["']javascript:[^"']*["'][^>]*>[\s\S]*?<\/iframe>/gi, ''); |
||||
|
||||
// Remove object and embed tags (often used for XSS)
|
||||
sanitized = sanitized.replace(/<object[\s\S]*?<\/object>/gi, ''); |
||||
sanitized = sanitized.replace(/<embed[\s\S]*?>/gi, ''); |
||||
|
||||
// Remove style tags with potentially dangerous content
|
||||
// We keep style attributes but remove <style> tags
|
||||
sanitized = sanitized.replace(/<style[\s\S]*?<\/style>/gi, ''); |
||||
|
||||
// Remove link tags with javascript: or data: URLs
|
||||
sanitized = sanitized.replace(/<link[^>]*href\s*=\s*["'](javascript|data):[^"']*["'][^>]*>/gi, ''); |
||||
|
||||
// Remove meta tags with http-equiv="refresh" (can be used for redirects)
|
||||
sanitized = sanitized.replace(/<meta[^>]*http-equiv\s*=\s*["']refresh["'][^>]*>/gi, ''); |
||||
|
||||
return sanitized; |
||||
} |
||||
|
||||
/** |
||||
* Process links to add target="_blank" and rel="noreferrer noopener" to external links |
||||
*
|
||||
* External links are links that don't match the base domain. |
||||
* Internal links (same domain) are left unchanged. |
||||
*/ |
||||
export function processLinks(html: string, linkBaseURL: string): string { |
||||
if (!linkBaseURL) { |
||||
return html; |
||||
} |
||||
|
||||
// Extract base domain from linkBaseURL
|
||||
let baseDomain: string | null = null; |
||||
try { |
||||
const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/); |
||||
if (urlMatch) { |
||||
baseDomain = urlMatch[1]; |
||||
} |
||||
} catch { |
||||
// If parsing fails, don't process links
|
||||
return html; |
||||
} |
||||
|
||||
if (!baseDomain) { |
||||
return html; |
||||
} |
||||
|
||||
// Process anchor tags with href attributes
|
||||
return html.replace(/<a\s+([^>]*\s+)?href\s*=\s*["']([^"']+)["']([^>]*?)>/gi, (match, before, href, after) => { |
||||
// Skip if already has target attribute
|
||||
if (match.includes('target=')) { |
||||
return match; |
||||
} |
||||
|
||||
// Skip if it's not an http/https link
|
||||
if (!/^https?:\/\//i.test(href)) { |
||||
return match; |
||||
} |
||||
|
||||
// Skip if it's already a special link type (nostr, wikilink, etc.)
|
||||
if (match.includes('class="nostr-link"') || |
||||
match.includes('class="wikilink"') || |
||||
match.includes('class="hashtag-link"')) { |
||||
return match; |
||||
} |
||||
|
||||
// Check if it's an external link
|
||||
let isExternal = true; |
||||
try { |
||||
const hrefMatch = href.match(/^https?:\/\/([^\/]+)/); |
||||
if (hrefMatch && hrefMatch[1] === baseDomain) { |
||||
isExternal = false; |
||||
} |
||||
} catch { |
||||
// If parsing fails, assume external
|
||||
} |
||||
|
||||
// Only add target="_blank" to external links
|
||||
if (isExternal) { |
||||
// Check if there's already a rel attribute
|
||||
if (match.includes('rel=')) { |
||||
// Add to existing rel attribute if it doesn't already have noreferrer noopener
|
||||
if (!match.includes('noreferrer') && !match.includes('noopener')) { |
||||
return match.replace(/rel\s*=\s*["']([^"']+)["']/i, 'rel="$1 noreferrer noopener"'); |
||||
} |
||||
// Add target="_blank" before the closing >
|
||||
return match.replace(/>$/, ' target="_blank">'); |
||||
} else { |
||||
// Add both target and rel
|
||||
return match.replace(/>$/, ' target="_blank" rel="noreferrer noopener">'); |
||||
} |
||||
} |
||||
|
||||
return match; |
||||
}); |
||||
} |
||||
@ -0,0 +1,93 @@
@@ -0,0 +1,93 @@
|
||||
import { marked } from 'marked'; |
||||
// @ts-ignore - marked is ESM but we need it to work in Jest
|
||||
import { ParserOptions } from '../types'; |
||||
import * as emoji from 'node-emoji'; |
||||
|
||||
export interface MarkdownResult { |
||||
html: string; |
||||
frontmatter?: Record<string, any>; |
||||
hasLaTeX: boolean; |
||||
hasMusicalNotation: boolean; |
||||
} |
||||
|
||||
/** |
||||
* Extract YAML frontmatter from markdown content |
||||
*/ |
||||
function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } { |
||||
const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n/; |
||||
const match = content.match(frontmatterRegex); |
||||
|
||||
if (!match) { |
||||
return { content }; |
||||
} |
||||
|
||||
try { |
||||
// Simple YAML parser for basic key-value pairs
|
||||
const yamlContent = match[1]; |
||||
const frontmatter: Record<string, any> = {}; |
||||
const lines = yamlContent.split('\n'); |
||||
|
||||
for (const line of lines) { |
||||
const trimmed = line.trim(); |
||||
if (!trimmed || trimmed.startsWith('#')) continue; |
||||
|
||||
const colonIndex = trimmed.indexOf(':'); |
||||
if (colonIndex === -1) continue; |
||||
|
||||
const key = trimmed.substring(0, colonIndex).trim(); |
||||
let value = trimmed.substring(colonIndex + 1).trim(); |
||||
|
||||
// Remove quotes if present
|
||||
if ((value.startsWith('"') && value.endsWith('"')) ||
|
||||
(value.startsWith("'") && value.endsWith("'"))) { |
||||
value = value.slice(1, -1); |
||||
} |
||||
|
||||
// Handle arrays (simple case)
|
||||
if (value.startsWith('[') && value.endsWith(']')) { |
||||
const arrayContent = value.slice(1, -1); |
||||
frontmatter[key] = arrayContent.split(',').map(v => v.trim().replace(/^["']|["']$/g, '')); |
||||
} else { |
||||
frontmatter[key] = value; |
||||
} |
||||
} |
||||
|
||||
return { |
||||
frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, |
||||
content: content.substring(match[0].length) |
||||
}; |
||||
} catch (e) { |
||||
return { content }; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Process Markdown content to HTML (minimal markdown support) |
||||
*/ |
||||
export function processMarkdown(content: string, options: ParserOptions): MarkdownResult { |
||||
// Extract frontmatter
|
||||
const { frontmatter, content: contentWithoutFrontmatter } = extractFrontmatter(content); |
||||
|
||||
// Detect LaTeX and musical notation
|
||||
const hasLaTeX = /```latex|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content); |
||||
const hasMusicalNotation = /```abc|```music/i.test(content); |
||||
|
||||
// Configure marked for minimal markdown
|
||||
marked.setOptions({ |
||||
gfm: true, |
||||
breaks: false |
||||
}); |
||||
|
||||
// Process emoji shortcodes before markdown processing
|
||||
let processedContent = emoji.emojify(contentWithoutFrontmatter); |
||||
|
||||
// Convert markdown to HTML
|
||||
const html = marked.parse(processedContent) as string; |
||||
|
||||
return { |
||||
html, |
||||
frontmatter, |
||||
hasLaTeX, |
||||
hasMusicalNotation |
||||
}; |
||||
} |
||||
@ -1,143 +0,0 @@
@@ -1,143 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.processMusicalNotation = processMusicalNotation; |
||||
/** |
||||
* Processes musical notation in HTML content |
||||
* Wraps musical notation in appropriate HTML for rendering |
||||
*/ |
||||
function processMusicalNotation(html) { |
||||
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
|
||||
// These were created by a buggy regex that matched the entire HTML document
|
||||
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => { |
||||
// This is corrupted - extract just the ABC notation from the beginning
|
||||
let decoded = dataAbc |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
// Find the actual ABC notation (starts with X:)
|
||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|<|<\/|sect|div|pre|code)/); |
||||
if (abcMatch) { |
||||
const cleanAbc = abcMatch[1].trim(); |
||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`; |
||||
} |
||||
// If we can't extract clean ABC, remove the div entirely
|
||||
return content; |
||||
}); |
||||
// Clean up code blocks that contain corrupted abc-notation divs inside them
|
||||
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
|
||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
||||
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
|
||||
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i); |
||||
if (longDataAbcMatch) { |
||||
// Extract just the ABC notation from the beginning of the corrupted data-abc value
|
||||
let decoded = longDataAbcMatch[1] |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
// The ABC notation ends where the HTML document starts (</code> or </pre>)
|
||||
// Extract everything from X: up to (but not including) </code> or </pre>
|
||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=<\/code>|<\/pre>)/); |
||||
if (abcMatch) { |
||||
let cleanAbc = abcMatch[1].trim(); |
||||
// Remove any trailing HTML entities
|
||||
cleanAbc = cleanAbc.replace(/<.*$/, '').trim(); |
||||
// Validate it's reasonable ABC notation
|
||||
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) { |
||||
// Return clean code block - the processing step will wrap it in abc-notation div
|
||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`; |
||||
} |
||||
} |
||||
// If extraction fails, just remove the corrupted div and return empty code block
|
||||
// This prevents the corrupted data from being rendered
|
||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`; |
||||
} |
||||
return match; |
||||
}); |
||||
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
|
||||
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
|
||||
// We do NOT auto-detect ABC notation - it must be explicitly marked
|
||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
||||
// Skip if already processed or corrupted
|
||||
if (codeContent.includes('abc-notation') || |
||||
codeContent.includes('class="abc-notation"') || |
||||
codeContent.includes('<div') || |
||||
codeContent.includes('</div>') || |
||||
codeContent.length > 5000) { |
||||
return match; |
||||
} |
||||
// Extract ABC content from the code block
|
||||
let abcContent = codeContent |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'") |
||||
.replace(/'/g, "'") |
||||
.replace(///g, '/'); |
||||
// Remove any HTML tags
|
||||
abcContent = abcContent.replace(/<[^>]+>/g, '').trim(); |
||||
// Only process if it looks like valid ABC notation (starts with X:)
|
||||
// Since this is explicitly marked as ABC, we trust it's ABC notation
|
||||
if (abcContent.match(/^X:\s*\d+/m) && |
||||
abcContent.length < 3000 && |
||||
!abcContent.includes('</') && |
||||
!abcContent.includes('<div') && |
||||
!abcContent.includes('sect') && |
||||
!abcContent.includes('class=')) { |
||||
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
|
||||
const lines = abcContent.split('\n'); |
||||
const abcLines = []; |
||||
for (const line of lines) { |
||||
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) { |
||||
break; |
||||
} |
||||
if (line.length > 200) { |
||||
break; |
||||
} |
||||
abcLines.push(line); |
||||
if (abcLines.join('\n').length > 2000) { |
||||
break; |
||||
} |
||||
} |
||||
const cleanAbc = abcLines.join('\n').trim(); |
||||
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) { |
||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`; |
||||
} |
||||
} |
||||
return match; |
||||
}); |
||||
// Process LilyPond notation blocks
|
||||
const lilypondPattern = /(\\relative[^}]+})/gs; |
||||
html = html.replace(lilypondPattern, (match) => { |
||||
const lilypondContent = match.trim(); |
||||
return `<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`; |
||||
}); |
||||
// Process inline chord notation: [C], [Am], [F#m7], etc.
|
||||
const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g; |
||||
html = html.replace(chordPattern, (match, chord) => { |
||||
return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`; |
||||
}); |
||||
// Process MusicXML-like notation
|
||||
const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs; |
||||
html = html.replace(musicxmlPattern, (match) => { |
||||
const musicxmlContent = match.trim(); |
||||
return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`; |
||||
}); |
||||
return html; |
||||
} |
||||
/** |
||||
* Escapes a string for use in HTML attributes |
||||
*/ |
||||
function escapeForAttr(text) { |
||||
return text |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, ''') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/\n/g, ' ') |
||||
.replace(/\r/g, ''); |
||||
} |
||||
@ -1,152 +0,0 @@
@@ -1,152 +0,0 @@
|
||||
/** |
||||
* Processes musical notation in HTML content |
||||
* Wraps musical notation in appropriate HTML for rendering |
||||
*/ |
||||
export function processMusicalNotation(html: string): string { |
||||
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
|
||||
// These were created by a buggy regex that matched the entire HTML document
|
||||
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => { |
||||
// This is corrupted - extract just the ABC notation from the beginning
|
||||
let decoded = dataAbc |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
|
||||
// Find the actual ABC notation (starts with X:)
|
||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|<|<\/|sect|div|pre|code)/); |
||||
if (abcMatch) { |
||||
const cleanAbc = abcMatch[1].trim(); |
||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`; |
||||
} |
||||
// If we can't extract clean ABC, remove the div entirely
|
||||
return content; |
||||
}); |
||||
|
||||
// Clean up code blocks that contain corrupted abc-notation divs inside them
|
||||
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
|
||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
||||
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
|
||||
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i); |
||||
if (longDataAbcMatch) { |
||||
// Extract just the ABC notation from the beginning of the corrupted data-abc value
|
||||
let decoded = longDataAbcMatch[1] |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'"); |
||||
|
||||
// The ABC notation ends where the HTML document starts (</code> or </pre>)
|
||||
// Extract everything from X: up to (but not including) </code> or </pre>
|
||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=<\/code>|<\/pre>)/); |
||||
if (abcMatch) { |
||||
let cleanAbc = abcMatch[1].trim(); |
||||
// Remove any trailing HTML entities
|
||||
cleanAbc = cleanAbc.replace(/<.*$/, '').trim(); |
||||
// Validate it's reasonable ABC notation
|
||||
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) { |
||||
// Return clean code block - the processing step will wrap it in abc-notation div
|
||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`; |
||||
} |
||||
} |
||||
// If extraction fails, just remove the corrupted div and return empty code block
|
||||
// This prevents the corrupted data from being rendered
|
||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`; |
||||
} |
||||
return match; |
||||
}); |
||||
|
||||
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
|
||||
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
|
||||
// We do NOT auto-detect ABC notation - it must be explicitly marked
|
||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
||||
// Skip if already processed or corrupted
|
||||
if (codeContent.includes('abc-notation') ||
|
||||
codeContent.includes('class="abc-notation"') || |
||||
codeContent.includes('<div') || |
||||
codeContent.includes('</div>') || |
||||
codeContent.length > 5000) { |
||||
return match; |
||||
} |
||||
|
||||
// Extract ABC content from the code block
|
||||
let abcContent = codeContent |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/&/g, '&') |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, "'") |
||||
.replace(/'/g, "'") |
||||
.replace(///g, '/'); |
||||
|
||||
// Remove any HTML tags
|
||||
abcContent = abcContent.replace(/<[^>]+>/g, '').trim(); |
||||
|
||||
// Only process if it looks like valid ABC notation (starts with X:)
|
||||
// Since this is explicitly marked as ABC, we trust it's ABC notation
|
||||
if (abcContent.match(/^X:\s*\d+/m) &&
|
||||
abcContent.length < 3000 && |
||||
!abcContent.includes('</') && |
||||
!abcContent.includes('<div') && |
||||
!abcContent.includes('sect') && |
||||
!abcContent.includes('class=')) { |
||||
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
|
||||
const lines = abcContent.split('\n'); |
||||
const abcLines: string[] = []; |
||||
for (const line of lines) { |
||||
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) { |
||||
break; |
||||
} |
||||
if (line.length > 200) { |
||||
break; |
||||
} |
||||
abcLines.push(line); |
||||
if (abcLines.join('\n').length > 2000) { |
||||
break; |
||||
} |
||||
} |
||||
const cleanAbc = abcLines.join('\n').trim(); |
||||
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) { |
||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`; |
||||
} |
||||
} |
||||
return match; |
||||
}); |
||||
|
||||
// Process LilyPond notation blocks
|
||||
const lilypondPattern = /(\\relative[^}]+})/gs; |
||||
html = html.replace(lilypondPattern, (match) => { |
||||
const lilypondContent = match.trim(); |
||||
return `<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`; |
||||
}); |
||||
|
||||
// Process inline chord notation: [C], [Am], [F#m7], etc.
|
||||
const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g; |
||||
html = html.replace(chordPattern, (match, chord) => { |
||||
return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`; |
||||
}); |
||||
|
||||
// Process MusicXML-like notation
|
||||
const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs; |
||||
html = html.replace(musicxmlPattern, (match) => { |
||||
const musicxmlContent = match.trim(); |
||||
return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`; |
||||
}); |
||||
|
||||
return html; |
||||
} |
||||
|
||||
/** |
||||
* Escapes a string for use in HTML attributes |
||||
*/ |
||||
function escapeForAttr(text: string): string { |
||||
return text |
||||
.replace(/"/g, '"') |
||||
.replace(/'/g, ''') |
||||
.replace(/</g, '<') |
||||
.replace(/>/g, '>') |
||||
.replace(/\n/g, ' ') |
||||
.replace(/\r/g, ''); |
||||
} |
||||
@ -1,14 +0,0 @@
@@ -1,14 +0,0 @@
|
||||
"use strict"; |
||||
Object.defineProperty(exports, "__esModule", { value: true }); |
||||
exports.ContentFormat = void 0; |
||||
/** |
||||
* Detected content format |
||||
*/ |
||||
var ContentFormat; |
||||
(function (ContentFormat) { |
||||
ContentFormat["Unknown"] = "unknown"; |
||||
ContentFormat["AsciiDoc"] = "asciidoc"; |
||||
ContentFormat["Markdown"] = "markdown"; |
||||
ContentFormat["Wikipedia"] = "wikipedia"; |
||||
ContentFormat["Plain"] = "plain"; |
||||
})(ContentFormat || (exports.ContentFormat = ContentFormat = {})); |
||||
@ -1,20 +0,0 @@
@@ -1,20 +0,0 @@
|
||||
/**
 * Type declarations for @asciidoctor/core
 * These are minimal types - the actual types should come from the package
 */
declare module '@asciidoctor/core' {
  // Subset of the options object accepted by Asciidoctor.convert().
  // Field names mirror the upstream (Ruby-derived) API, hence the
  // snake_case 'extension_registry'.
  interface ConvertOptions {
    safe?: string;       // safety mode name (e.g. 'safe') — presumably; confirm against package docs
    backend?: string;    // output backend (e.g. 'html5') — NOTE(review): assumed, verify
    doctype?: string;
    attributes?: Record<string, any>;  // document attributes forwarded to the converter
    extension_registry?: any;          // extension registry object; left untyped here
  }

  // Minimal processor surface used by this project.
  interface Asciidoctor {
    // Returns a string for the default options; 'any' covers the case
    // where options request a Document object instead.
    convert(content: string, options?: ConvertOptions): string | any;
  }

  // Factory function exported by the package.
  function asciidoctor(): Asciidoctor;
  export default asciidoctor;
}
||||
@ -1,732 +0,0 @@
@@ -1,732 +0,0 @@
|
||||
import { Parser } from '../parser'; |
||||
import * as fs from 'fs'; |
||||
import * as path from 'path'; |
||||
import { ProcessResult } from '../types'; |
||||
|
||||
/** |
||||
* Shared utilities for generating test reports |
||||
*/ |
||||
|
||||
/**
 * One parsed test document: the raw source text plus the parser's
 * output for it.
 */
export interface TestData {
  // Raw document source as read from disk
  original: string;
  // Parser output (rendered HTML content plus extracted metadata)
  result: ProcessResult;
}
||||
|
||||
/**
 * Input to generateHTMLReport: one parsed Markdown document and one
 * parsed AsciiDoc document, reported side by side.
 */
export interface ReportData {
  markdown: TestData;
  asciidoc: TestData;
}
||||
|
||||
/**
 * Generate HTML test report from parsed documents.
 *
 * Builds a single self-contained HTML page (inline CSS and JS) with one
 * section per input document (Markdown, AsciiDoc). Each section offers
 * four tabs: overview stats, original source, rendered output, and
 * extracted metadata. User-derived strings are passed through
 * escapeHtml(); rendered HTML passes through cleanHtmlContent() to strip
 * document wrappers and truncate duplicated content. The table of
 * contents is interpolated raw — it is parser-generated HTML.
 *
 * @param data - the two parsed test documents to report on
 * @returns a complete HTML document as a string
 */
export function generateHTMLReport(data: ReportData): string {
  const { markdown, asciidoc } = data;

  // NOTE: whitespace inside this template literal is part of the emitted
  // page; do not re-indent it.
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GC Parser Test Report</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}

body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
line-height: 1.6;
color: #333;
background: #f5f5f5;
padding: 20px;
}

.container {
max-width: 1400px;
margin: 0 auto;
}

h1 {
color: #2c3e50;
margin-bottom: 10px;
font-size: 2.5em;
}

.subtitle {
color: #7f8c8d;
margin-bottom: 30px;
font-size: 1.1em;
}

.section {
background: white;
border-radius: 8px;
padding: 30px;
margin-bottom: 30px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

.section h2 {
color: #34495e;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #3498db;
font-size: 1.8em;
}

.section h3 {
color: #2c3e50;
margin-top: 25px;
margin-bottom: 15px;
font-size: 1.3em;
}

.tabs {
display: flex;
gap: 10px;
margin-bottom: 20px;
border-bottom: 2px solid #e0e0e0;
}

.tab {
padding: 12px 24px;
background: #f8f9fa;
border: none;
border-top-left-radius: 6px;
border-top-right-radius: 6px;
cursor: pointer;
font-size: 1em;
font-weight: 500;
color: #555;
transition: all 0.2s;
}

.tab:hover {
background: #e9ecef;
}

.tab.active {
background: #3498db;
color: white;
}

.tab-content {
display: none;
}

.tab-content.active {
display: block;
}

.metadata-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 15px;
margin-top: 15px;
}

.metadata-item {
background: #f8f9fa;
padding: 12px;
border-radius: 4px;
border-left: 3px solid #3498db;
}

.metadata-item strong {
color: #2c3e50;
display: block;
margin-bottom: 5px;
}

.metadata-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.9em;
}

.code-block {
background: #2d2d2d;
color: #f8f8f2;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.9em;
line-height: 1.5;
margin: 15px 0;
max-height: 400px;
overflow-y: auto;
}

.code-block pre {
margin: 0;
white-space: pre-wrap;
word-wrap: break-word;
}

.rendered-output {
background: white;
border: 1px solid #ddd;
padding: 20px;
border-radius: 6px;
margin: 15px 0;
min-height: 200px;
}

.rendered-output * {
max-width: 100%;
}

.stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 15px;
margin-top: 20px;
}

.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}

.stat-card .number {
font-size: 2.5em;
font-weight: bold;
margin-bottom: 5px;
}

.stat-card .label {
font-size: 0.9em;
opacity: 0.9;
}

.list-item {
background: #f8f9fa;
padding: 8px 12px;
margin: 5px 0;
border-radius: 4px;
border-left: 3px solid #95a5a6;
}

.list-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.85em;
}

.success-badge {
display: inline-block;
background: #27ae60;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}

.warning-badge {
display: inline-block;
background: #f39c12;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}

.comparison {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 20px;
margin-top: 20px;
}

@media (max-width: 768px) {
.comparison {
grid-template-columns: 1fr;
}
}

.json-view {
background: #f8f9fa;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.85em;
max-height: 300px;
overflow-y: auto;
}
</style>
</head>
<body>
<div class="container">
<h1>GC Parser Test Report</h1>
<p class="subtitle">Generated: ${new Date().toLocaleString()}</p>

<!-- Markdown Section -->
<div class="section">
<h2>Markdown Document Test <span class="success-badge">✓ Parsed</span></h2>

<div class="tabs">
<button class="tab active" onclick="showTab('md-overview')">Overview</button>
<button class="tab" onclick="showTab('md-original')">Original Content</button>
<button class="tab" onclick="showTab('md-rendered')">Rendered Output</button>
<button class="tab" onclick="showTab('md-metadata')">Metadata</button>
</div>

<div id="md-overview" class="tab-content active">
<div class="stats">
<div class="stat-card">
<div class="number">${markdown.result.nostrLinks.length}</div>
<div class="label">Nostr Links</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.wikilinks.length}</div>
<div class="label">Wikilinks</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hashtags.length}</div>
<div class="label">Hashtags</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.links.length}</div>
<div class="label">Links</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.media.length}</div>
<div class="label">Media URLs</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hasLaTeX ? 'Yes' : 'No'}</div>
<div class="label">Has LaTeX</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hasMusicalNotation ? 'Yes' : 'No'}</div>
<div class="label">Has Music</div>
</div>
</div>

<h3>Frontmatter</h3>
${markdown.result.frontmatter ? `
<div class="metadata-grid">
${Object.entries(markdown.result.frontmatter).map(([key, value]) => `
<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>
`).join('')}
</div>
` : '<p><em>No frontmatter found</em></p>'}
</div>

<div id="md-original" class="tab-content">
<h3>Original Markdown Content</h3>
<div class="code-block">
<pre>${escapeHtml(markdown.original)}</pre>
</div>
</div>

<div id="md-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${cleanHtmlContent(markdown.result.content)}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML (Final Processed State)</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(markdown.result.content)}</pre>
</div>
</details>
</div>

<div id="md-metadata" class="tab-content">
<h3>Extracted Metadata</h3>

${markdown.result.nostrLinks.length > 0 ? `
<h4>Nostr Links (${markdown.result.nostrLinks.length})</h4>
${markdown.result.nostrLinks.map((link: any) => `
<div class="list-item">
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>
${link.text ? ` - ${escapeHtml(link.text)}` : ''}
</div>
`).join('')}
` : ''}

${markdown.result.wikilinks.length > 0 ? `
<h4>Wikilinks (${markdown.result.wikilinks.length})</h4>
${markdown.result.wikilinks.map((wl: any) => `
<div class="list-item">
<code>${escapeHtml(wl.original)}</code> → dtag: <code>${escapeHtml(wl.dtag)}</code>
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''}
</div>
`).join('')}
` : ''}

${markdown.result.hashtags.length > 0 ? `
<h4>Hashtags (${markdown.result.hashtags.length})</h4>
${markdown.result.hashtags.map((tag: string) => `
<div class="list-item">
<code>#${escapeHtml(tag)}</code>
</div>
`).join('')}
` : ''}

${markdown.result.links.length > 0 ? `
<h4>Links (${markdown.result.links.length})</h4>
${markdown.result.links.map((link: any) => `
<div class="list-item">
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>
${link.isExternal ? '<span class="warning-badge">External</span>' : ''}
</div>
`).join('')}
` : ''}

${markdown.result.media.length > 0 ? `
<h4>Media URLs (${markdown.result.media.length})</h4>
${markdown.result.media.map((url: string) => `
<div class="list-item">
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>
</div>
`).join('')}
` : ''}

${markdown.result.tableOfContents ? `
<h4>Table of Contents</h4>
<div class="rendered-output">
${markdown.result.tableOfContents}
</div>
` : ''}
</div>
</div>

<!-- AsciiDoc Section -->
<div class="section">
<h2>AsciiDoc Document Test <span class="success-badge">✓ Parsed</span></h2>

<div class="tabs">
<button class="tab active" onclick="showTab('ad-overview')">Overview</button>
<button class="tab" onclick="showTab('ad-original')">Original Content</button>
<button class="tab" onclick="showTab('ad-rendered')">Rendered Output</button>
<button class="tab" onclick="showTab('ad-metadata')">Metadata</button>
</div>

<div id="ad-overview" class="tab-content active">
<div class="stats">
<div class="stat-card">
<div class="number">${asciidoc.result.nostrLinks.length}</div>
<div class="label">Nostr Links</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.wikilinks.length}</div>
<div class="label">Wikilinks</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hashtags.length}</div>
<div class="label">Hashtags</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.links.length}</div>
<div class="label">Links</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.media.length}</div>
<div class="label">Media URLs</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hasLaTeX ? 'Yes' : 'No'}</div>
<div class="label">Has LaTeX</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hasMusicalNotation ? 'Yes' : 'No'}</div>
<div class="label">Has Music</div>
</div>
</div>

<h3>Frontmatter</h3>
${asciidoc.result.frontmatter ? `
<div class="metadata-grid">
${Object.entries(asciidoc.result.frontmatter).map(([key, value]) => `
<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>
`).join('')}
</div>
` : '<p><em>No frontmatter found</em></p>'}
</div>

<div id="ad-original" class="tab-content">
<h3>Original AsciiDoc Content</h3>
<div class="code-block">
<pre>${escapeHtml(asciidoc.original)}</pre>
</div>
</div>

<div id="ad-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${cleanHtmlContent(asciidoc.result.content)}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML (Final Processed State)</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(asciidoc.result.content)}</pre>
</div>
</details>
</div>

<div id="ad-metadata" class="tab-content">
<h3>Extracted Metadata</h3>

${asciidoc.result.nostrLinks.length > 0 ? `
<h4>Nostr Links (${asciidoc.result.nostrLinks.length})</h4>
${asciidoc.result.nostrLinks.map((link: any) => `
<div class="list-item">
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>
${link.text ? ` - ${escapeHtml(link.text)}` : ''}
</div>
`).join('')}
` : ''}

${asciidoc.result.wikilinks.length > 0 ? `
<h4>Wikilinks (${asciidoc.result.wikilinks.length})</h4>
${asciidoc.result.wikilinks.map((wl: any) => `
<div class="list-item">
<code>${escapeHtml(wl.original)}</code> → dtag: <code>${escapeHtml(wl.dtag)}</code>
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''}
</div>
`).join('')}
` : ''}

${asciidoc.result.hashtags.length > 0 ? `
<h4>Hashtags (${asciidoc.result.hashtags.length})</h4>
${asciidoc.result.hashtags.map((tag: string) => `
<div class="list-item">
<code>#${escapeHtml(tag)}</code>
</div>
`).join('')}
` : ''}

${asciidoc.result.links.length > 0 ? `
<h4>Links (${asciidoc.result.links.length})</h4>
${asciidoc.result.links.map((link: any) => `
<div class="list-item">
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>
${link.isExternal ? '<span class="warning-badge">External</span>' : ''}
</div>
`).join('')}
` : ''}

${asciidoc.result.media.length > 0 ? `
<h4>Media URLs (${asciidoc.result.media.length})</h4>
${asciidoc.result.media.map((url: string) => `
<div class="list-item">
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>
</div>
`).join('')}
` : ''}

${asciidoc.result.tableOfContents ? `
<h4>Table of Contents</h4>
<div class="rendered-output">
${asciidoc.result.tableOfContents}
</div>
` : ''}
</div>
</div>
</div>

<script>
function showTab(tabId) {
// Hide all tab contents
const allContents = document.querySelectorAll('.tab-content');
allContents.forEach(content => content.classList.remove('active'));

// Remove active class from all tabs
const allTabs = document.querySelectorAll('.tab');
allTabs.forEach(tab => tab.classList.remove('active'));

// Show selected tab content
const selectedContent = document.getElementById(tabId);
if (selectedContent) {
selectedContent.classList.add('active');
}

// Add active class to clicked tab
event.target.classList.add('active');
}
</script>
</body>
</html>`;
}
||||
|
||||
/**
 * Clean HTML content to extract only the body content
 * Removes full HTML document structure if present
 * Prevents infinite loops by ensuring we only extract once and handle nested structures
 * Also detects and prevents content duplication (doom loops)
 *
 * @param html - possibly a full HTML document (or nested/concatenated documents)
 * @returns body-only HTML, truncated at the first detected repetition
 */
function cleanHtmlContent(html: string): string {
  // Defensive guard: non-strings and empty input yield an empty report cell.
  if (!html || typeof html !== 'string') {
    return '';
  }

  let cleaned = html.trim();

  // Count occurrences to detect nested structures
  // NOTE(review): htmlTagCount is computed but never read below.
  const htmlTagCount = (cleaned.match(/<html[^>]*>/gi) || []).length;
  const bodyTagCount = (cleaned.match(/<body[^>]*>/gi) || []).length;
  const bodyCloseCount = (cleaned.match(/<\/body>/gi) || []).length;

  // If we have multiple body tags, there might be nested structures
  // Extract only the outermost body content
  if (bodyTagCount > 0 && bodyCloseCount > 0) {
    // Find the first <body> tag
    const firstBodyIndex = cleaned.indexOf('<body');
    if (firstBodyIndex !== -1) {
      // Find the opening > of the first body tag
      const bodyTagEnd = cleaned.indexOf('>', firstBodyIndex);
      if (bodyTagEnd !== -1) {
        const bodyStart = bodyTagEnd + 1;
        // Find the last </body> tag (to handle nested structures)
        const bodyEnd = cleaned.lastIndexOf('</body>');

        if (bodyEnd > bodyStart) {
          // Slice from just after the first <body ...> to the last </body>.
          cleaned = cleaned.substring(bodyStart, bodyEnd).trim();

          // Recursively clean if there are still nested structures
          // But limit recursion to prevent infinite loops
          const remainingBodyTags = (cleaned.match(/<body[^>]*>/gi) || []).length;
          if (remainingBodyTags > 0 && remainingBodyTags < bodyTagCount) {
            // There are still nested body tags, clean again but only once more
            cleaned = cleaned.replace(/<body[^>]*>/gi, '');
            cleaned = cleaned.replace(/<\/body>/gi, '');
          }
        }
      }
    }
  }

  // Remove any remaining DOCTYPE, html, head, or body tags that might be left
  // Do this in a way that doesn't create nested matches.
  // Fixed-point loop: repeat until a pass removes nothing, capped at 10 passes.
  let previousLength = 0;
  let iterations = 0;
  while (iterations < 10 && cleaned.length !== previousLength) {
    previousLength = cleaned.length;
    cleaned = cleaned.replace(/<!DOCTYPE[^>]*>/gi, '');
    cleaned = cleaned.replace(/<html[^>]*>/gi, '');
    cleaned = cleaned.replace(/<\/html>/gi, '');
    cleaned = cleaned.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '');
    cleaned = cleaned.replace(/<body[^>]*>/gi, '');
    cleaned = cleaned.replace(/<\/body>/gi, '');
    cleaned = cleaned.trim();
    iterations++;
  }

  // Detect and prevent content duplication (doom loops)
  // Strategy: Use a fingerprint of the first part of the content to detect repetition

  // Create a fingerprint from the first meaningful chunk (skip leading whitespace/tags)
  const contentStart = cleaned.search(/[^\s<]/);
  if (contentStart !== -1) {
    // Use first 2000 characters as fingerprint, or 1/4 of content, whichever is smaller
    const fingerprintLength = Math.min(2000, Math.max(500, Math.floor(cleaned.length / 4)));
    const fingerprint = cleaned.substring(contentStart, contentStart + fingerprintLength);

    // Find where this fingerprint repeats
    const secondOccurrence = cleaned.indexOf(fingerprint, contentStart + fingerprintLength);

    // The 0.85 cutoff avoids truncating when the "repeat" is just a short
    // tail coincidence near the end of the content.
    if (secondOccurrence !== -1 && secondOccurrence < cleaned.length * 0.85) {
      // Content is clearly duplicated - return only the first occurrence
      cleaned = cleaned.substring(0, secondOccurrence).trim();
      return cleaned;
    }
  }

  // Additional check: detect repeated patterns using common document markers
  // (these markers are specific to the known test documents)
  const documentMarkers = [
    /#\s+Markdown\s+Test\s+Document/gi,
    /==\s+Bullet\s+list/gi,
    /##\s+Bullet\s+list/gi,
  ];

  for (const marker of documentMarkers) {
    const matches = cleaned.match(marker);
    if (matches && matches.length > 1) {
      const firstMatch = cleaned.search(marker);
      if (firstMatch !== -1) {
        // Get a chunk starting from this marker
        const chunkStart = firstMatch;
        const chunkLength = Math.min(1500, Math.floor(cleaned.length / 3));
        const chunk = cleaned.substring(chunkStart, chunkStart + chunkLength);

        // Find where this chunk repeats
        const secondChunk = cleaned.indexOf(chunk, chunkStart + chunkLength);

        if (secondChunk !== -1 && secondChunk < cleaned.length * 0.9) {
          // Content repeats here - truncate
          cleaned = cleaned.substring(0, secondChunk).trim();
          return cleaned;
        }
      }
    }
  }

  // Final check: detect repeated section headers (Markdown '#'/'##' or
  // AsciiDoc '==' style, captured to end of line).
  const sectionHeaderPattern = /(?:^|\n)(?:##?|==)\s+[^\n<]+/gm;
  const sectionHeaders: string[] = [];
  let match;

  // exec() with a /g/ regex advances lastIndex, so this walks all headers.
  while ((match = sectionHeaderPattern.exec(cleaned)) !== null) {
    sectionHeaders.push(match[0].trim());
  }

  // If we have many headers, check for repetition
  if (sectionHeaders.length > 8) {
    const uniqueHeaders = new Set(sectionHeaders);
    // If we have way more headers than unique ones, content is repeating
    if (sectionHeaders.length > uniqueHeaders.size * 2.5) {
      // Find the first occurrence of each unique header
      const uniqueHeaderArray = Array.from(uniqueHeaders);
      const firstUniqueHeader = uniqueHeaderArray[0];
      const firstHeaderIndex = cleaned.indexOf(firstUniqueHeader);

      if (firstHeaderIndex !== -1) {
        // Find the second occurrence of the first header
        // (+200 skips past the header itself and nearby text)
        const secondHeaderIndex = cleaned.indexOf(firstUniqueHeader, firstHeaderIndex + 200);

        if (secondHeaderIndex !== -1 && secondHeaderIndex < cleaned.length * 0.85) {
          // Content repeats here - truncate
          cleaned = cleaned.substring(0, secondHeaderIndex).trim();
        }
      }
    }
  }

  return cleaned;
}
||||
|
||||
/** |
||||
* Escape HTML special characters |
||||
*/ |
||||
export function escapeHtml(text: string): string { |
||||
const map: Record<string, string> = { |
||||
'&': '&', |
||||
'<': '<', |
||||
'>': '>', |
||||
'"': '"', |
||||
"'": ''', |
||||
}; |
||||
return text.replace(/[&<>"']/g, (m) => map[m]); |
||||
} |
||||
Loading…
Reference in new issue