45 changed files with 1687 additions and 16369 deletions
@ -1,72 +0,0 @@ |
|||||||
|
|
||||||
|
|
||||||
This is a test unordered list with mixed bullets: |
|
||||||
|
|
||||||
* First item with a number 2. in it |
|
||||||
* Second item |
|
||||||
* Third item |
|
||||||
* Indented item |
|
||||||
* Indented item |
|
||||||
* Fourth item |
|
||||||
|
|
||||||
|
|
||||||
Another unordered list: |
|
||||||
|
|
||||||
* 1st item |
|
||||||
* 2nd item |
|
||||||
* third item containing _italic_ text |
|
||||||
* indented item |
|
||||||
* second indented item |
|
||||||
* fourth item |
|
||||||
|
|
||||||
|
|
||||||
This is a test ordered list with indented items: |
|
||||||
|
|
||||||
. First item |
|
||||||
. Second item |
|
||||||
. Third item |
|
||||||
. Indented item |
|
||||||
. Indented item |
|
||||||
. Fourth item |
|
||||||
|
|
||||||
|
|
||||||
Ordered list where everything has the same number: |
|
||||||
|
|
||||||
. First item |
|
||||||
. Second item |
|
||||||
. Third item |
|
||||||
. Fourth item |
|
||||||
|
|
||||||
|
|
||||||
Ordered list that is wrongly numbered: |
|
||||||
|
|
||||||
. First item |
|
||||||
. Second item |
|
||||||
. Third item |
|
||||||
. Fourth item |
|
||||||
|
|
||||||
|
|
||||||
This is a mixed list with indented items: |
|
||||||
|
|
||||||
. First item |
|
||||||
. Second item |
|
||||||
. Third item |
|
||||||
|
|
||||||
* Indented item |
|
||||||
* Indented item |
|
||||||
|
|
||||||
. Fourth item |
|
||||||
|
|
||||||
|
|
||||||
This is another mixed list with indented items: |
|
||||||
|
|
||||||
* First item |
|
||||||
* Second item |
|
||||||
* Third item |
|
||||||
|
|
||||||
. Indented item |
|
||||||
. Indented item |
|
||||||
|
|
||||||
* Fourth item |
|
||||||
|
|
||||||
|
|
||||||
@ -1,27 +0,0 @@ |
|||||||
import { convertToAsciidoc } from './src/converters/to-asciidoc'; |
|
||||||
import { detectFormat } from './src/detector'; |
|
||||||
import * as fs from 'fs'; |
|
||||||
import * as path from 'path'; |
|
||||||
|
|
||||||
// Read just the list section from markdown test doc
|
|
||||||
const markdownContent = fs.readFileSync( |
|
||||||
path.join(__dirname, 'markdown_testdoc.md'), |
|
||||||
'utf-8' |
|
||||||
); |
|
||||||
|
|
||||||
// Extract just the list sections
|
|
||||||
const listSection = markdownContent.split('## Bullet list')[1]?.split('##')[0] || markdownContent; |
|
||||||
|
|
||||||
console.log('=== ORIGINAL MARKDOWN ==='); |
|
||||||
console.log(listSection); |
|
||||||
console.log('\n=== DETECTED FORMAT ==='); |
|
||||||
const format = detectFormat(listSection); |
|
||||||
console.log(format); |
|
||||||
|
|
||||||
console.log('\n=== CONVERTED ASCIIDOC ==='); |
|
||||||
const asciidoc = convertToAsciidoc(listSection, format, '', {}); |
|
||||||
console.log(asciidoc); |
|
||||||
|
|
||||||
// Write to file for inspection
|
|
||||||
fs.writeFileSync(path.join(__dirname, 'debug-asciidoc-output.adoc'), asciidoc); |
|
||||||
console.log('\n=== Written to debug-asciidoc-output.adoc ==='); |
|
||||||
@ -1,55 +0,0 @@ |
|||||||
#!/usr/bin/env node
|
|
||||||
|
|
||||||
/** |
|
||||||
* Example usage of gc-parser |
|
||||||
* This can be called from Go or used directly in Node.js |
|
||||||
*/ |
|
||||||
|
|
||||||
const { Parser, defaultOptions } = require('./dist/index.js'); |
|
||||||
|
|
||||||
async function main() { |
|
||||||
// Create parser with default options
|
|
||||||
const opts = defaultOptions(); |
|
||||||
opts.linkBaseURL = process.env.LINK_BASE_URL || 'https://example.com'; |
|
||||||
|
|
||||||
const parser = new Parser(opts); |
|
||||||
|
|
||||||
// Get content from command line argument or stdin
|
|
||||||
let content = ''; |
|
||||||
if (process.argv[2]) { |
|
||||||
content = process.argv[2]; |
|
||||||
} else { |
|
||||||
// Read from stdin
|
|
||||||
const readline = require('readline'); |
|
||||||
const rl = readline.createInterface({ |
|
||||||
input: process.stdin, |
|
||||||
output: process.stdout, |
|
||||||
terminal: false |
|
||||||
}); |
|
||||||
|
|
||||||
for await (const line of rl) { |
|
||||||
content += line + '\n'; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if (!content) { |
|
||||||
console.error('No content provided'); |
|
||||||
process.exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
try { |
|
||||||
const result = await parser.process(content); |
|
||||||
|
|
||||||
// Output as JSON for easy parsing
|
|
||||||
console.log(JSON.stringify(result, null, 2)); |
|
||||||
} catch (error) { |
|
||||||
console.error('Error processing content:', error); |
|
||||||
process.exit(1); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if (require.main === module) { |
|
||||||
main(); |
|
||||||
} |
|
||||||
|
|
||||||
module.exports = { main }; |
|
||||||
@ -1,2 +0,0 @@ |
|||||||
export {}; |
|
||||||
//# sourceMappingURL=generate-test-report.d.ts.map
|
|
||||||
@ -1 +0,0 @@ |
|||||||
{"version":3,"file":"generate-test-report.d.ts","sourceRoot":"","sources":["generate-test-report.ts"],"names":[],"mappings":""} |
|
||||||
@ -1,91 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { |
|
||||||
if (k2 === undefined) k2 = k; |
|
||||||
var desc = Object.getOwnPropertyDescriptor(m, k); |
|
||||||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { |
|
||||||
desc = { enumerable: true, get: function() { return m[k]; } }; |
|
||||||
} |
|
||||||
Object.defineProperty(o, k2, desc); |
|
||||||
}) : (function(o, m, k, k2) { |
|
||||||
if (k2 === undefined) k2 = k; |
|
||||||
o[k2] = m[k]; |
|
||||||
})); |
|
||||||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { |
|
||||||
Object.defineProperty(o, "default", { enumerable: true, value: v }); |
|
||||||
}) : function(o, v) { |
|
||||||
o["default"] = v; |
|
||||||
}); |
|
||||||
var __importStar = (this && this.__importStar) || (function () { |
|
||||||
var ownKeys = function(o) { |
|
||||||
ownKeys = Object.getOwnPropertyNames || function (o) { |
|
||||||
var ar = []; |
|
||||||
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; |
|
||||||
return ar; |
|
||||||
}; |
|
||||||
return ownKeys(o); |
|
||||||
}; |
|
||||||
return function (mod) { |
|
||||||
if (mod && mod.__esModule) return mod; |
|
||||||
var result = {}; |
|
||||||
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); |
|
||||||
__setModuleDefault(result, mod); |
|
||||||
return result; |
|
||||||
}; |
|
||||||
})(); |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
const parser_1 = require("./src/parser"); |
|
||||||
const report_generator_1 = require("./src/utils/report-generator"); |
|
||||||
const fs = __importStar(require("fs")); |
|
||||||
const path = __importStar(require("path")); |
|
||||||
/** |
|
||||||
* Standalone script to generate HTML test report |
|
||||||
* Run with: npm run test:report |
|
||||||
*/ |
|
||||||
async function main() { |
|
||||||
console.log('📝 Generating test report...\n'); |
|
||||||
// Initialize parser
|
|
||||||
const parser = new parser_1.Parser({ |
|
||||||
linkBaseURL: 'https://example.com', |
|
||||||
wikilinkUrl: '/events?d={dtag}', |
|
||||||
hashtagUrl: '/notes?t={topic}', |
|
||||||
}); |
|
||||||
// Read test documents
|
|
||||||
const markdownPath = path.join(__dirname, 'markdown_testdoc.md'); |
|
||||||
const asciidocPath = path.join(__dirname, 'asciidoc_testdoc.adoc'); |
|
||||||
if (!fs.existsSync(markdownPath)) { |
|
||||||
console.error(`❌ Error: ${markdownPath} not found`); |
|
||||||
process.exit(1); |
|
||||||
} |
|
||||||
if (!fs.existsSync(asciidocPath)) { |
|
||||||
console.error(`❌ Error: ${asciidocPath} not found`); |
|
||||||
process.exit(1); |
|
||||||
} |
|
||||||
const markdownContent = fs.readFileSync(markdownPath, 'utf-8'); |
|
||||||
const asciidocContent = fs.readFileSync(asciidocPath, 'utf-8'); |
|
||||||
console.log('📄 Parsing markdown document...'); |
|
||||||
const markdownResult = await parser.process(markdownContent); |
|
||||||
console.log('📄 Parsing asciidoc document...'); |
|
||||||
const asciidocResult = await parser.process(asciidocContent); |
|
||||||
console.log('🎨 Generating HTML report...'); |
|
||||||
const htmlReport = (0, report_generator_1.generateHTMLReport)({ |
|
||||||
markdown: { |
|
||||||
original: markdownContent, |
|
||||||
result: markdownResult, |
|
||||||
}, |
|
||||||
asciidoc: { |
|
||||||
original: asciidocContent, |
|
||||||
result: asciidocResult, |
|
||||||
}, |
|
||||||
}); |
|
||||||
// Write HTML report to file
|
|
||||||
const reportPath = path.join(__dirname, 'test-report.html'); |
|
||||||
fs.writeFileSync(reportPath, htmlReport, 'utf-8'); |
|
||||||
console.log(`\n✅ Test report generated: ${reportPath}`); |
|
||||||
console.log(` Open this file in your browser to view the results.\n`); |
|
||||||
} |
|
||||||
// Run the script
|
|
||||||
main().catch((error) => { |
|
||||||
console.error('❌ Error generating test report:', error); |
|
||||||
process.exit(1); |
|
||||||
}); |
|
||||||
//# sourceMappingURL=generate-test-report.js.map
|
|
||||||
@ -1 +0,0 @@ |
|||||||
{"version":3,"file":"generate-test-report.js","sourceRoot":"","sources":["generate-test-report.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,yCAAsC;AACtC,mEAA8E;AAC9E,uCAAyB;AACzB,2CAA6B;AAE7B;;;GAGG;AAEH,KAAK,UAAU,IAAI;IACjB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;IAE9C,oBAAoB;IACpB,MAAM,MAAM,GAAG,IAAI,eAAM,CAAC;QACxB,WAAW,EAAE,qBAAqB;QAClC,WAAW,EAAE,kBAAkB;QAC/B,UAAU,EAAE,kBAAkB;KAC/B,CAAC,CAAC;IAEH,sBAAsB;IACtB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,qBAAqB,CAAC,CAAC;IACjE,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,uBAAuB,CAAC,CAAC;IAEnE,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,YAAY,YAAY,YAAY,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,YAAY,YAAY,YAAY,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,eAAe,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,MAAM,eAAe,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAE/D,OAAO,CAAC,GAAG,CAAC,iCAAiC,CAAC,CAAC;IAC/C,MAAM,cAAc,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7D,OAAO,CAAC,GAAG,CAAC,iCAAiC,CAAC,CAAC;IAC/C,MAAM,cAAc,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7D,OAAO,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC;IAC5C,MAAM,UAAU,GAAG,IAAA,qCAAkB,EAAC;QACpC,QAAQ,EAAE;YACR,QAAQ,EAAE,eAAe;YACzB,MAAM,EAAE,cAAc;SACvB;QACD,QAAQ,EAAE;YACR,QAAQ,EAAE,eAAe;YACzB,MAAM,EAAE,cAAc;SACvB;KACF,CAAC,CAAC;IAEH,4BAA4B;IAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,kBAAkB,CAAC,CAAC;IAC5D,EAAE,CAAC,aAAa,CAAC,UAAU,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC;IAElD,OAAO,CAAC,GAAG,CAAC,8BAA8B,UAAU,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;AAC1E,CAAC;AAED,iBAAiB;AACjB,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACrB,OAAO,CAAC,KAAK,CAAC,iCAAiC,EAAE,KAAK,CAAC,CAAC;IACxD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"} |
|
||||||
@ -1,71 +0,0 @@ |
|||||||
// Import from source files - this script should be run with ts-node or similar
|
|
||||||
// from the project root, not from dist/
|
|
||||||
import { Parser } from './src/parser'; |
|
||||||
import { generateHTMLReport } from './src/utils/report-generator'; |
|
||||||
import * as fs from 'fs'; |
|
||||||
import * as path from 'path'; |
|
||||||
|
|
||||||
/** |
|
||||||
* Standalone script to generate HTML test report |
|
||||||
* Run with: npm run test:report |
|
||||||
*/ |
|
||||||
|
|
||||||
async function main() { |
|
||||||
console.log('📝 Generating test report...\n'); |
|
||||||
|
|
||||||
// Initialize parser
|
|
||||||
const parser = new Parser({ |
|
||||||
linkBaseURL: 'https://example.com', |
|
||||||
wikilinkUrl: '/events?d={dtag}', |
|
||||||
hashtagUrl: '/notes?t={topic}', |
|
||||||
}); |
|
||||||
|
|
||||||
// Read test documents from project root
|
|
||||||
const baseDir = __dirname.includes('dist') ? path.join(__dirname, '..') : __dirname; |
|
||||||
const markdownPath = path.join(baseDir, 'markdown_testdoc.md'); |
|
||||||
const asciidocPath = path.join(baseDir, 'asciidoc_testdoc.adoc'); |
|
||||||
|
|
||||||
if (!fs.existsSync(markdownPath)) { |
|
||||||
console.error(`❌ Error: ${markdownPath} not found`); |
|
||||||
process.exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
if (!fs.existsSync(asciidocPath)) { |
|
||||||
console.error(`❌ Error: ${asciidocPath} not found`); |
|
||||||
process.exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
const markdownContent = fs.readFileSync(markdownPath, 'utf-8'); |
|
||||||
const asciidocContent = fs.readFileSync(asciidocPath, 'utf-8'); |
|
||||||
|
|
||||||
console.log('📄 Parsing markdown document...'); |
|
||||||
const markdownResult = await parser.process(markdownContent); |
|
||||||
|
|
||||||
console.log('📄 Parsing asciidoc document...'); |
|
||||||
const asciidocResult = await parser.process(asciidocContent); |
|
||||||
|
|
||||||
console.log('🎨 Generating HTML report...'); |
|
||||||
const htmlReport = generateHTMLReport({ |
|
||||||
markdown: { |
|
||||||
original: markdownContent, |
|
||||||
result: markdownResult, |
|
||||||
}, |
|
||||||
asciidoc: { |
|
||||||
original: asciidocContent, |
|
||||||
result: asciidocResult, |
|
||||||
}, |
|
||||||
}); |
|
||||||
|
|
||||||
// Write HTML report to file (adjust path based on where script is run from)
|
|
||||||
const reportPath = path.join(baseDir, 'test-report.html'); |
|
||||||
fs.writeFileSync(reportPath, htmlReport, 'utf-8'); |
|
||||||
|
|
||||||
console.log(`\n✅ Test report generated: ${reportPath}`); |
|
||||||
console.log(` Open this file in your browser to view the results.\n`); |
|
||||||
} |
|
||||||
|
|
||||||
// Run the script
|
|
||||||
main().catch((error) => { |
|
||||||
console.error('❌ Error generating test report:', error); |
|
||||||
process.exit(1); |
|
||||||
}); |
|
||||||
@ -1,24 +1,23 @@ |
|||||||
module.exports = { |
module.exports = { |
||||||
preset: 'ts-jest', |
preset: 'ts-jest', |
||||||
testEnvironment: 'node', |
testEnvironment: 'node', |
||||||
roots: ['<rootDir>'], |
roots: ['<rootDir>/src'], |
||||||
testMatch: ['**/*.test.ts'], |
testMatch: ['**/__tests__/**/*.test.ts', '**/?(*.)+(spec|test).ts'], |
||||||
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'], |
testPathIgnorePatterns: ['/node_modules/', '/dist/', 'asciidoc.test.ts'], |
||||||
collectCoverageFrom: [ |
|
||||||
'src/**/*.ts', |
|
||||||
'!src/**/*.d.ts', |
|
||||||
], |
|
||||||
transform: { |
transform: { |
||||||
'^.+\\.ts$': ['ts-jest', { |
'^.+\\.ts$': ['ts-jest', { |
||||||
tsconfig: 'tsconfig.test.json', |
tsconfig: { |
||||||
|
esModuleInterop: true, |
||||||
|
}, |
||||||
}], |
}], |
||||||
|
'^.+\\.js$': 'babel-jest', |
||||||
}, |
}, |
||||||
// Don't transform AsciiDoctor packages - they use Opal runtime which breaks with Jest transformation
|
moduleFileExtensions: ['ts', 'js', 'json'], |
||||||
// AsciiDoctor uses CommonJS and Opal runtime, so we need to exclude it from transformation
|
moduleNameMapper: { |
||||||
// The pattern matches paths to ignore (not transform)
|
'^marked$': '<rootDir>/node_modules/marked/lib/marked.umd.js', |
||||||
transformIgnorePatterns: [ |
}, |
||||||
'node_modules/(?!(@asciidoctor)/)', |
collectCoverageFrom: [ |
||||||
|
'src/**/*.ts', |
||||||
|
'!src/**/*.d.ts', |
||||||
], |
], |
||||||
// Ensure CommonJS modules are handled correctly
|
|
||||||
moduleNameMapper: {}, |
|
||||||
}; |
}; |
||||||
|
|||||||
@ -0,0 +1,353 @@ |
|||||||
|
import { Parser } from '../parser'; |
||||||
|
import { readFileSync, writeFileSync, mkdirSync } from 'fs'; |
||||||
|
import { join } from 'path'; |
||||||
|
|
||||||
|
/** |
||||||
|
* Simple test runner for AsciiDoc tests (separate from Jest due to Opal compatibility issues) |
||||||
|
*/ |
||||||
|
async function runAsciiDocTests() { |
||||||
|
console.log('Running AsciiDoc tests...\n'); |
||||||
|
|
||||||
|
const asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8'); |
||||||
|
const parser = new Parser({ |
||||||
|
linkBaseURL: 'https://example.com', |
||||||
|
enableNostrAddresses: true, |
||||||
|
wikilinkUrl: '/events?d={dtag}', |
||||||
|
hashtagUrl: '/hashtag/{topic}' |
||||||
|
}); |
||||||
|
|
||||||
|
let passed = 0; |
||||||
|
let failed = 0; |
||||||
|
const failures: string[] = []; |
||||||
|
|
||||||
|
const testPromises: Promise<void>[] = []; |
||||||
|
|
||||||
|
function test(name: string, fn: () => void | Promise<void>) { |
||||||
|
const testPromise = (async () => { |
||||||
|
try { |
||||||
|
const result = fn(); |
||||||
|
if (result instanceof Promise) { |
||||||
|
await result; |
||||||
|
} |
||||||
|
passed++; |
||||||
|
console.log(`✓ ${name}`); |
||||||
|
} catch (error: any) { |
||||||
|
failed++; |
||||||
|
failures.push(`${name}: ${error.message}`); |
||||||
|
console.error(`✗ ${name}: ${error.message}`); |
||||||
|
} |
||||||
|
})(); |
||||||
|
testPromises.push(testPromise); |
||||||
|
} |
||||||
|
|
||||||
|
function expect(actual: any) { |
||||||
|
return { |
||||||
|
toBeDefined: () => { |
||||||
|
if (actual === undefined || actual === null) { |
||||||
|
throw new Error(`Expected value to be defined, but got ${actual}`); |
||||||
|
} |
||||||
|
}, |
||||||
|
toBe: (expected: any) => { |
||||||
|
if (actual !== expected) { |
||||||
|
throw new Error(`Expected ${expected}, but got ${actual}`); |
||||||
|
} |
||||||
|
}, |
||||||
|
toContain: (substring: string) => { |
||||||
|
if (typeof actual === 'string' && !actual.includes(substring)) { |
||||||
|
throw new Error(`Expected string to contain "${substring}"`); |
||||||
|
} |
||||||
|
}, |
||||||
|
toMatch: (regex: RegExp) => { |
||||||
|
if (typeof actual === 'string' && !regex.test(actual)) { |
||||||
|
throw new Error(`Expected string to match ${regex}`); |
||||||
|
} |
||||||
|
}, |
||||||
|
toHaveProperty: (prop: string) => { |
||||||
|
if (!(prop in actual)) { |
||||||
|
throw new Error(`Expected object to have property "${prop}"`); |
||||||
|
} |
||||||
|
}, |
||||||
|
toBeGreaterThan: (value: number) => { |
||||||
|
if (typeof actual !== 'number' || actual <= value) { |
||||||
|
throw new Error(`Expected ${actual} to be greater than ${value}`); |
||||||
|
} |
||||||
|
}, |
||||||
|
length: { |
||||||
|
toBeGreaterThan: (value: number) => { |
||||||
|
if (!Array.isArray(actual) || actual.length <= value) { |
||||||
|
throw new Error(`Expected array length to be greater than ${value}, but got ${actual.length}`); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
}; |
||||||
|
} |
||||||
|
|
||||||
|
// Run tests
|
||||||
|
const result = await parser.process(asciidocContent); |
||||||
|
|
||||||
|
// Write HTML output to file for inspection
|
||||||
|
const outputDir = join(__dirname, '../../test-output'); |
||||||
|
try { |
||||||
|
mkdirSync(outputDir, { recursive: true }); |
||||||
|
} catch (e) { |
||||||
|
// Directory might already exist
|
||||||
|
} |
||||||
|
|
||||||
|
const htmlOutput = `<!DOCTYPE html>
|
||||||
|
<html lang="en"> |
||||||
|
<head> |
||||||
|
<meta charset="UTF-8"> |
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
||||||
|
<meta name="referrer" content="strict-origin-when-cross-origin"> |
||||||
|
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'unsafe-inline' 'unsafe-eval' https://www.youtube.com https://s.ytimg.com https://www.gstatic.com https://*.googlevideo.com; frame-src https://www.youtube.com https://youtube.com https://open.spotify.com https://*.googlevideo.com; style-src 'unsafe-inline'; img-src 'self' data: https:; media-src 'self' https:; connect-src https:; child-src https://www.youtube.com https://youtube.com;"> |
||||||
|
<title>AsciiDoc Test Output</title> |
||||||
|
<style> |
||||||
|
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; } |
||||||
|
.hashtag { color: #1da1f2; font-weight: 500; } |
||||||
|
.wikilink { color: #0066cc; text-decoration: underline; } |
||||||
|
.nostr-link { color: #8b5cf6; text-decoration: underline; } |
||||||
|
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; } |
||||||
|
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; } |
||||||
|
.line-through { text-decoration: line-through; } |
||||||
|
.highlight { background-color: #ffeb3b; padding: 2px 4px; border-radius: 3px; } |
||||||
|
.bare-image { max-width: 100%; width: auto; height: auto; margin: 10px 0; display: block; } |
||||||
|
.bare-video, .bare-audio { width: 100%; max-width: 800px; margin: 10px 0; display: block; } |
||||||
|
.youtube-embed, .spotify-embed { max-width: 100%; margin: 10px 0; border-radius: 8px; display: block; } |
||||||
|
.youtube-embed { width: 100%; max-width: 640px; height: auto; aspect-ratio: 16/9; border: 0; display: block; } |
||||||
|
.spotify-embed { width: 100%; max-width: 800px; } |
||||||
|
/* Table styles */ |
||||||
|
table { border-collapse: collapse; width: 100%; margin: 1em 0; } |
||||||
|
table thead { background-color: #f2f2f2; } |
||||||
|
table th { font-weight: bold; padding: 8px; border: 1px solid #ddd; background-color: #f2f2f2; } |
||||||
|
table td { padding: 8px; border: 1px solid #ddd; } |
||||||
|
/* Alignment classes - AsciiDoc uses halign-* and valign-* classes */ |
||||||
|
.halign-left { text-align: left !important; } |
||||||
|
.halign-center { text-align: center !important; } |
||||||
|
.halign-right { text-align: right !important; } |
||||||
|
.valign-top { vertical-align: top !important; } |
||||||
|
.valign-middle { vertical-align: middle !important; } |
||||||
|
.valign-bottom { vertical-align: bottom !important; } |
||||||
|
/* Also handle tableblock classes */ |
||||||
|
.tableblock.halign-left { text-align: left !important; } |
||||||
|
.tableblock.halign-center { text-align: center !important; } |
||||||
|
.tableblock.halign-right { text-align: right !important; } |
||||||
|
.tableblock.valign-top { vertical-align: top !important; } |
||||||
|
.tableblock.valign-middle { vertical-align: middle !important; } |
||||||
|
.tableblock.valign-bottom { vertical-align: bottom !important; } |
||||||
|
/* Task list styles */ |
||||||
|
.checklist { list-style: none; padding-left: 0; } |
||||||
|
.checklist li { padding-left: 1.5em; position: relative; margin: 0.5em 0; } |
||||||
|
.checklist li i.fa-check-square-o::before { content: "☑ "; font-style: normal; font-family: sans-serif; } |
||||||
|
.checklist li i.fa-square-o::before { content: "☐ "; font-style: normal; font-family: sans-serif; } |
||||||
|
.checklist li i { position: absolute; left: 0; font-style: normal; } |
||||||
|
/* Fallback if Font Awesome doesn't load */ |
||||||
|
.checklist li i.fa-check-square-o { display: inline-block; width: 1em; } |
||||||
|
.checklist li i.fa-check-square-o:before { content: "☑"; } |
||||||
|
.checklist li i.fa-square-o { display: inline-block; width: 1em; } |
||||||
|
.checklist li i.fa-square-o:before { content: "☐"; } |
||||||
|
/* AsciiDoc specific styles */ |
||||||
|
.sect1, .sect2, .sect3, .sect4, .sect5 { margin-top: 1.5em; margin-bottom: 1em; } |
||||||
|
.paragraph { margin: 1em 0; } |
||||||
|
table { border-collapse: collapse; width: 100%; margin: 1em 0; } |
||||||
|
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; } |
||||||
|
table th { background-color: #f2f2f2; } |
||||||
|
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; } |
||||||
|
</style> |
||||||
|
</head> |
||||||
|
<body> |
||||||
|
<h1>AsciiDoc Test Document - Parsed Output</h1> |
||||||
|
<hr> |
||||||
|
${result.content} |
||||||
|
<hr> |
||||||
|
<h2>Metadata</h2> |
||||||
|
<pre>${JSON.stringify({ |
||||||
|
hasLaTeX: result.hasLaTeX, |
||||||
|
hasMusicalNotation: result.hasMusicalNotation, |
||||||
|
nostrLinks: result.nostrLinks, |
||||||
|
wikilinks: result.wikilinks, |
||||||
|
hashtags: result.hashtags, |
||||||
|
links: result.links, |
||||||
|
media: result.media |
||||||
|
}, null, 2)}</pre> |
||||||
|
</body> |
||||||
|
</html>`;
|
||||||
|
|
||||||
|
const outputPath = join(outputDir, 'asciidoc-output.html'); |
||||||
|
writeFileSync(outputPath, htmlOutput, 'utf-8'); |
||||||
|
console.log(`\n📄 HTML output written to: ${outputPath}\n`); |
||||||
|
|
||||||
|
test('should parse AsciiDoc content', () => { |
||||||
|
expect(result).toBeDefined(); |
||||||
|
expect(result.content).toBeDefined(); |
||||||
|
expect(typeof result.content).toBe('string'); |
||||||
|
expect(result.content.length).toBeGreaterThan(0); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should have HTML content', () => { |
||||||
|
expect(result.content).toContain('<'); |
||||||
|
expect(result.content).toContain('>'); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should extract table of contents', () => { |
||||||
|
expect(result.tableOfContents).toBeDefined(); |
||||||
|
expect(typeof result.tableOfContents).toBe('string'); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should detect LaTeX', () => { |
||||||
|
expect(result.hasLaTeX).toBeDefined(); |
||||||
|
expect(typeof result.hasLaTeX).toBe('boolean'); |
||||||
|
expect(result.hasLaTeX).toBe(true); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should detect musical notation', () => { |
||||||
|
expect(result.hasMusicalNotation).toBeDefined(); |
||||||
|
expect(typeof result.hasMusicalNotation).toBe('boolean'); |
||||||
|
expect(result.hasMusicalNotation).toBe(true); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should extract nostr links', () => { |
||||||
|
expect(result.nostrLinks).toBeDefined(); |
||||||
|
expect(Array.isArray(result.nostrLinks)).toBe(true); |
||||||
|
expect(result.nostrLinks.length).toBeGreaterThan(0); |
||||||
|
|
||||||
|
const nostrLink = result.nostrLinks[0]; |
||||||
|
expect(nostrLink).toHaveProperty('type'); |
||||||
|
expect(nostrLink).toHaveProperty('id'); |
||||||
|
expect(nostrLink).toHaveProperty('text'); |
||||||
|
expect(nostrLink).toHaveProperty('bech32'); |
||||||
|
const validTypes = ['npub', 'nprofile', 'nevent', 'naddr', 'note']; |
||||||
|
if (!validTypes.includes(nostrLink.type)) { |
||||||
|
throw new Error(`Invalid nostr type: ${nostrLink.type}`); |
||||||
|
} |
||||||
|
}); |
||||||
|
|
||||||
|
test('should extract wikilinks', () => { |
||||||
|
expect(result.wikilinks).toBeDefined(); |
||||||
|
expect(Array.isArray(result.wikilinks)).toBe(true); |
||||||
|
expect(result.wikilinks.length).toBeGreaterThan(0); |
||||||
|
|
||||||
|
const wikilink = result.wikilinks[0]; |
||||||
|
expect(wikilink).toHaveProperty('dtag'); |
||||||
|
expect(wikilink).toHaveProperty('display'); |
||||||
|
expect(wikilink).toHaveProperty('original'); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should extract hashtags', () => { |
||||||
|
expect(result.hashtags).toBeDefined(); |
||||||
|
expect(Array.isArray(result.hashtags)).toBe(true); |
||||||
|
expect(result.hashtags.length).toBeGreaterThan(0); |
||||||
|
|
||||||
|
result.hashtags.forEach((tag: string) => { |
||||||
|
if (tag.includes('#')) { |
||||||
|
throw new Error(`Hashtag should not include #: ${tag}`); |
||||||
|
} |
||||||
|
}); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should extract regular links', () => { |
||||||
|
expect(result.links).toBeDefined(); |
||||||
|
expect(Array.isArray(result.links)).toBe(true); |
||||||
|
|
||||||
|
if (result.links.length > 0) { |
||||||
|
const link = result.links[0]; |
||||||
|
expect(link).toHaveProperty('url'); |
||||||
|
expect(link).toHaveProperty('text'); |
||||||
|
expect(link).toHaveProperty('isExternal'); |
||||||
|
expect(typeof link.isExternal).toBe('boolean'); |
||||||
|
} |
||||||
|
}); |
||||||
|
|
||||||
|
test('should extract media URLs', () => { |
||||||
|
expect(result.media).toBeDefined(); |
||||||
|
expect(Array.isArray(result.media)).toBe(true); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should process nostr: addresses in HTML', () => { |
||||||
|
const nostrAddresses = result.nostrLinks; |
||||||
|
expect(nostrAddresses.length).toBeGreaterThan(0); |
||||||
|
|
||||||
|
nostrAddresses.forEach((link: any) => { |
||||||
|
if (!result.content.includes(`data-nostr-type="${link.type}"`)) { |
||||||
|
throw new Error(`Missing nostr type attribute for ${link.type}`); |
||||||
|
} |
||||||
|
if (!result.content.includes(`data-nostr-id="${link.bech32}"`)) { |
||||||
|
throw new Error(`Missing nostr id attribute for ${link.bech32}`); |
||||||
|
} |
||||||
|
}); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should process wikilinks in HTML', () => { |
||||||
|
const wikilinks = result.wikilinks; |
||||||
|
expect(wikilinks.length).toBeGreaterThan(0); |
||||||
|
|
||||||
|
wikilinks.forEach((wikilink: any) => { |
||||||
|
if (!result.content.includes(`class="wikilink"`)) { |
||||||
|
throw new Error('Missing wikilink class'); |
||||||
|
} |
||||||
|
if (!result.content.includes(`data-dtag="${wikilink.dtag}"`)) { |
||||||
|
throw new Error(`Missing dtag attribute for ${wikilink.dtag}`); |
||||||
|
} |
||||||
|
}); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should process hashtags in HTML', () => { |
||||||
|
const hashtags = result.hashtags; |
||||||
|
expect(hashtags.length).toBeGreaterThan(0); |
||||||
|
|
||||||
|
hashtags.forEach((tag: string) => { |
||||||
|
if (!result.content.includes(`data-topic="${tag}"`)) { |
||||||
|
throw new Error(`Missing topic attribute for ${tag}`); |
||||||
|
} |
||||||
|
if (!result.content.includes('class="hashtag"')) { |
||||||
|
throw new Error('Missing hashtag class'); |
||||||
|
} |
||||||
|
}); |
||||||
|
}); |
||||||
|
|
||||||
|
test('should contain expected content sections', () => { |
||||||
|
if (!/Bullet list|bullet/i.test(result.content)) { |
||||||
|
throw new Error('Missing bullet list section'); |
||||||
|
} |
||||||
|
if (!/Headers|header/i.test(result.content)) { |
||||||
|
throw new Error('Missing headers section'); |
||||||
|
} |
||||||
|
if (!/Media and Links|media|links/i.test(result.content)) { |
||||||
|
throw new Error('Missing media and links section'); |
||||||
|
} |
||||||
|
}); |
||||||
|
|
||||||
|
test('should return consistent structure', () => { |
||||||
|
expect(result).toHaveProperty('content'); |
||||||
|
expect(result).toHaveProperty('tableOfContents'); |
||||||
|
expect(result).toHaveProperty('hasLaTeX'); |
||||||
|
expect(result).toHaveProperty('hasMusicalNotation'); |
||||||
|
expect(result).toHaveProperty('nostrLinks'); |
||||||
|
expect(result).toHaveProperty('wikilinks'); |
||||||
|
expect(result).toHaveProperty('hashtags'); |
||||||
|
expect(result).toHaveProperty('links'); |
||||||
|
expect(result).toHaveProperty('media'); |
||||||
|
}); |
||||||
|
|
||||||
|
// Wait for all tests to complete
|
||||||
|
await Promise.all(testPromises); |
||||||
|
|
||||||
|
// Print summary
|
||||||
|
console.log(`\n${'='.repeat(50)}`); |
||||||
|
console.log(`Tests passed: ${passed}`); |
||||||
|
console.log(`Tests failed: ${failed}`); |
||||||
|
|
||||||
|
if (failures.length > 0) { |
||||||
|
console.log('\nFailures:'); |
||||||
|
failures.forEach(f => console.error(` - ${f}`)); |
||||||
|
process.exit(1); |
||||||
|
} else { |
||||||
|
console.log('\nAll tests passed!'); |
||||||
|
process.exit(0); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Run tests
|
||||||
|
runAsciiDocTests().catch(error => { |
||||||
|
console.error('Test runner error:', error); |
||||||
|
process.exit(1); |
||||||
|
}); |
||||||
@ -0,0 +1,238 @@ |
|||||||
|
import { Parser } from '../parser'; |
||||||
|
import { readFileSync, writeFileSync, mkdirSync } from 'fs'; |
||||||
|
import { join } from 'path'; |
||||||
|
|
||||||
|
// End-to-end suite for Parser: feeds the Markdown fixture document through
// the full pipeline once and asserts on the returned structure and HTML.
describe('Parser', () => {
  let asciidocContent: string;
  // NOTE(review): asciidocContent is loaded but not used in this suite —
  // the AsciiDoc tests run out-of-process (see comment below); confirm it
  // is still needed here.
  let markdownContent: string;

  // Load the shared fixture documents once for the whole suite.
  beforeAll(() => {
    asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8');
    markdownContent = readFileSync(join(__dirname, '../../markdown_testdoc.md'), 'utf-8');
  });

  // AsciiDoc tests are run separately using a Node.js script (asciidoc.test.ts)
  // due to Jest/Opal runtime compatibility issues
  // Run with: npm run test:asciidoc

  describe('Markdown Test Document', () => {
    // Shared parse result; populated once in beforeAll and read by every `it`.
    let result: any;

    beforeAll(async () => {
      const parser = new Parser({
        linkBaseURL: 'https://example.com',
        enableNostrAddresses: true,
        wikilinkUrl: '/events?d={dtag}',
        hashtagUrl: '/hashtag/{topic}'
      });
      result = await parser.process(markdownContent);

      // Write HTML output to file for inspection
      const outputDir = join(__dirname, '../../test-output');
      try {
        mkdirSync(outputDir, { recursive: true });
      } catch (e) {
        // Directory might already exist
      }

      // Wrap the parsed HTML in a minimal self-contained page so a human
      // can eyeball the rendering; the metadata dump goes in a <pre> below.
      const htmlOutput = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Markdown Test Output</title>
<style>
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; }
.hashtag { color: #1da1f2; font-weight: 500; }
.wikilink { color: #0066cc; text-decoration: underline; }
.nostr-link { color: #8b5cf6; text-decoration: underline; }
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; }
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; }
.bare-image, .bare-video, .bare-audio { max-width: 100%; margin: 10px 0; }
.bare-video, .bare-audio { width: 100%; max-width: 600px; }
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
table th { background-color: #f2f2f2; }
</style>
</head>
<body>
<h1>Markdown Test Document - Parsed Output</h1>
<hr>
${result.content}
<hr>
<h2>Metadata</h2>
<pre>${JSON.stringify({
frontmatter: result.frontmatter,
hasLaTeX: result.hasLaTeX,
hasMusicalNotation: result.hasMusicalNotation,
nostrLinks: result.nostrLinks,
wikilinks: result.wikilinks,
hashtags: result.hashtags,
links: result.links,
media: result.media
}, null, 2)}</pre>
</body>
</html>`;

      const outputPath = join(outputDir, 'markdown-output.html');
      writeFileSync(outputPath, htmlOutput, 'utf-8');
      // Use console.info to ensure it shows in Jest output
      console.info(`\n📄 HTML output written to: ${outputPath}\n`);
    });

    it('should parse Markdown content', () => {
      expect(result).toBeDefined();
      expect(result.content).toBeDefined();
      expect(typeof result.content).toBe('string');
      expect(result.content.length).toBeGreaterThan(0);
    });

    it('should have HTML content', () => {
      expect(result.content).toContain('<');
      expect(result.content).toContain('>');
    });

    // Values below ('James Smith', 'This is a summary') are pinned to the
    // YAML front matter of markdown_testdoc.md.
    it('should extract frontmatter', () => {
      expect(result.frontmatter).toBeDefined();
      expect(typeof result.frontmatter).toBe('object');
      expect(result.frontmatter).toHaveProperty('author');
      expect(result.frontmatter.author).toBe('James Smith');
      expect(result.frontmatter).toHaveProperty('summary');
      expect(result.frontmatter.summary).toBe('This is a summary');
    });

    it('should detect LaTeX', () => {
      expect(result.hasLaTeX).toBeDefined();
      expect(typeof result.hasLaTeX).toBe('boolean');
      // The test doc has LaTeX, so it should be true
      expect(result.hasLaTeX).toBe(true);
    });

    it('should detect musical notation', () => {
      expect(result.hasMusicalNotation).toBeDefined();
      expect(typeof result.hasMusicalNotation).toBe('boolean');
    });

    it('should extract nostr links', () => {
      expect(result.nostrLinks).toBeDefined();
      expect(Array.isArray(result.nostrLinks)).toBe(true);
      expect(result.nostrLinks.length).toBeGreaterThan(0);

      // Check that nostr: addresses are extracted
      const nostrLink = result.nostrLinks[0];
      expect(nostrLink).toHaveProperty('type');
      expect(nostrLink).toHaveProperty('id');
      expect(nostrLink).toHaveProperty('text');
      expect(nostrLink).toHaveProperty('bech32');
      expect(['npub', 'nprofile', 'nevent', 'naddr', 'note']).toContain(nostrLink.type);
    });

    it('should extract wikilinks', () => {
      expect(result.wikilinks).toBeDefined();
      expect(Array.isArray(result.wikilinks)).toBe(true);
      expect(result.wikilinks.length).toBeGreaterThan(0);

      // Check wikilink structure
      const wikilink = result.wikilinks[0];
      expect(wikilink).toHaveProperty('dtag');
      expect(wikilink).toHaveProperty('display');
      expect(wikilink).toHaveProperty('original');
    });

    it('should extract hashtags', () => {
      expect(result.hashtags).toBeDefined();
      expect(Array.isArray(result.hashtags)).toBe(true);
      expect(result.hashtags.length).toBeGreaterThan(0);

      // Hashtags should not include the # symbol
      result.hashtags.forEach((tag: string) => {
        expect(tag).not.toContain('#');
      });
    });

    it('should extract regular links', () => {
      expect(result.links).toBeDefined();
      expect(Array.isArray(result.links)).toBe(true);

      // Structural checks only run when the fixture produced at least one link.
      if (result.links.length > 0) {
        const link = result.links[0];
        expect(link).toHaveProperty('url');
        expect(link).toHaveProperty('text');
        expect(link).toHaveProperty('isExternal');
        expect(typeof link.isExternal).toBe('boolean');
      }
    });

    it('should extract media URLs', () => {
      expect(result.media).toBeDefined();
      expect(Array.isArray(result.media)).toBe(true);
    });

    it('should process nostr: addresses in HTML', () => {
      // Check that nostr: addresses are converted to links
      const nostrAddresses = result.nostrLinks;
      expect(nostrAddresses.length).toBeGreaterThan(0);

      // Check that HTML contains links for nostr addresses
      nostrAddresses.forEach((link: any) => {
        expect(result.content).toContain(`data-nostr-type="${link.type}"`);
        expect(result.content).toContain(`data-nostr-id="${link.bech32}"`);
      });
    });

    it('should process wikilinks in HTML', () => {
      // Check that wikilinks are converted to links
      const wikilinks = result.wikilinks;
      expect(wikilinks.length).toBeGreaterThan(0);

      wikilinks.forEach((wikilink: any) => {
        expect(result.content).toContain(`class="wikilink"`);
        expect(result.content).toContain(`data-dtag="${wikilink.dtag}"`);
      });
    });

    it('should process hashtags in HTML', () => {
      // Check that hashtags are processed
      const hashtags = result.hashtags;
      expect(hashtags.length).toBeGreaterThan(0);

      hashtags.forEach((tag: string) => {
        expect(result.content).toContain(`data-topic="${tag}"`);
        expect(result.content).toMatch(new RegExp(`class="hashtag"`));
      });
    });

    it('should contain expected content sections', () => {
      // Check for some expected content from the test doc
      expect(result.content).toMatch(/Bullet list|bullet/i);
      expect(result.content).toMatch(/Headers|header/i);
      expect(result.content).toMatch(/Media and Links|media|links/i);
    });

    it('should have empty table of contents for markdown', () => {
      // Markdown doesn't generate TOC by default
      expect(result.tableOfContents).toBeDefined();
      expect(typeof result.tableOfContents).toBe('string');
    });
  });

  describe('Result structure validation', () => {

    it('should return consistent structure for Markdown', async () => {
      const parser = new Parser();
      const result = await parser.process(markdownContent);

      // Check all required fields
      expect(result).toHaveProperty('content');
      expect(result).toHaveProperty('tableOfContents');
      expect(result).toHaveProperty('hasLaTeX');
      expect(result).toHaveProperty('hasMusicalNotation');
      expect(result).toHaveProperty('nostrLinks');
      expect(result).toHaveProperty('wikilinks');
      expect(result).toHaveProperty('hashtags');
      expect(result).toHaveProperty('links');
      expect(result).toHaveProperty('media');
    });
  });
});
||||||
@ -1,330 +0,0 @@ |
|||||||
import { ContentFormat } from '../types'; |
|
||||||
|
|
||||||
/** Options accepted by convertToAsciidoc. */
export interface ConvertOptions {
  // Opt-out flag: nostr: address conversion runs unless this is exactly false
  // (undefined counts as enabled — see convertToAsciidoc).
  enableNostrAddresses?: boolean;
}
|
||||||
|
|
||||||
/**
 * Converts content from various formats (Markdown, Wikipedia, Plain) to AsciiDoc.
 *
 * Processing order:
 * 1. Convert special syntax (wikilinks, hashtags, nostr links) to placeholders
 * 2. Process media URLs (YouTube, Spotify, video, audio)
 * 3. Process images (Markdown and bare URLs)
 * 4. Process links (Markdown and bare URLs)
 * 5. Clean URLs (remove tracking parameters)
 *
 * @param content - Raw input text in the given format.
 * @param format - Source format; gates bare-URL handling and steps 5's branch.
 * @param linkBaseURL - NOTE(review): unused in this function body — confirm
 *   whether callers rely on it or it can be removed from the signature.
 * @param options - See ConvertOptions; nostr conversion is on unless
 *   enableNostrAddresses is exactly false.
 * @returns The content rewritten with AsciiDoc macros and placeholders.
 */
export function convertToAsciidoc(
  content: string,
  format: ContentFormat,
  linkBaseURL?: string,
  options: ConvertOptions = {}
): string {
  let processed = content;

  // Step 1: Convert special syntax to placeholders (before other processing)
  processed = convertWikilinks(processed);
  processed = convertHashtags(processed);

  // Opt-out check: any value other than `false` (including undefined) enables it.
  if (options.enableNostrAddresses !== false) {
    processed = convertNostrLinks(processed);
  }

  // Step 2: Process media URLs (before link processing to avoid conflicts)
  processed = processMediaUrls(processed);

  // Step 3: Process images (before links to avoid conflicts)
  processed = processImages(processed, format);

  // Step 4: Process links (Markdown and bare URLs)
  processed = processLinks(processed, format);

  // Step 5: Convert format-specific syntax
  if (format === ContentFormat.Markdown) {
    processed = convertMarkdownToAsciidoc(processed);
  } else if (format === ContentFormat.Wikipedia) {
    processed = convertWikipediaToAsciidoc(processed);
  }

  return processed;
}
|
||||||
|
|
||||||
/** |
|
||||||
* Convert wikilinks [[target]] or [[target|display]] to WIKILINK:dtag|display |
|
||||||
*/ |
|
||||||
function convertWikilinks(content: string): string { |
|
||||||
return content.replace(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g, (_match, target, display) => { |
|
||||||
const dtag = normalizeDtag(target.trim()); |
|
||||||
const displayText = display ? display.trim() : target.trim(); |
|
||||||
return `WIKILINK:${dtag}|${displayText}`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Normalize dtag (lowercase, replace spaces with hyphens) |
|
||||||
*/ |
|
||||||
function normalizeDtag(dtag: string): string { |
|
||||||
return dtag.toLowerCase().replace(/\s+/g, '-'); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Convert hashtags #topic to hashtag:topic[topic] |
|
||||||
* Skip hashtags in URLs, code blocks, and inline code |
|
||||||
*/ |
|
||||||
function convertHashtags(content: string): string { |
|
||||||
// Protect code blocks
|
|
||||||
const codeBlocks: string[] = []; |
|
||||||
content = content.replace(/```[\s\S]*?```/g, (match) => { |
|
||||||
const placeholder = `__CODEBLOCK_${codeBlocks.length}__`; |
|
||||||
codeBlocks.push(match); |
|
||||||
return placeholder; |
|
||||||
}); |
|
||||||
|
|
||||||
// Protect inline code
|
|
||||||
const inlineCode: string[] = []; |
|
||||||
content = content.replace(/`[^`]+`/g, (match) => { |
|
||||||
const placeholder = `__INLINECODE_${inlineCode.length}__`; |
|
||||||
inlineCode.push(match); |
|
||||||
return placeholder; |
|
||||||
}); |
|
||||||
|
|
||||||
// Convert hashtags (not in URLs)
|
|
||||||
content = content.replace(/(?<!https?:\/\/[^\s]*)#([a-zA-Z0-9_]+)/g, (_match, topic) => { |
|
||||||
const normalized = topic.toLowerCase(); |
|
||||||
return `hashtag:${normalized}[#${topic}]`; |
|
||||||
}); |
|
||||||
|
|
||||||
// Restore inline code
|
|
||||||
inlineCode.forEach((code, index) => { |
|
||||||
content = content.replace(`__INLINECODE_${index}__`, code); |
|
||||||
}); |
|
||||||
|
|
||||||
// Restore code blocks
|
|
||||||
codeBlocks.forEach((block, index) => { |
|
||||||
content = content.replace(`__CODEBLOCK_${index}__`, block); |
|
||||||
}); |
|
||||||
|
|
||||||
return content; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Convert nostr: links to link:nostr:...[...] |
|
||||||
*/ |
|
||||||
function convertNostrLinks(content: string): string { |
|
||||||
// Match nostr:npub1..., nostr:note1..., etc.
|
|
||||||
return content.replace(/nostr:([a-z0-9]+[a-z0-9]{50,})/gi, (match, bech32Id) => { |
|
||||||
// Extract display text (first few chars)
|
|
||||||
const display = bech32Id.substring(0, 8) + '...'; |
|
||||||
return `link:nostr:${bech32Id}[${display}]`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Process media URLs and convert to MEDIA: placeholders |
|
||||||
*/ |
|
||||||
function processMediaUrls(content: string): string { |
|
||||||
let processed = content; |
|
||||||
|
|
||||||
// YouTube URLs
|
|
||||||
processed = processed.replace( |
|
||||||
/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]+)/g, |
|
||||||
(_match, videoId) => `MEDIA:youtube:${videoId}` |
|
||||||
); |
|
||||||
|
|
||||||
// Spotify URLs
|
|
||||||
processed = processed.replace( |
|
||||||
/(?:https?:\/\/)?(?:open\.)?spotify\.com\/(track|album|playlist|artist|episode|show)\/([a-zA-Z0-9]+)/g, |
|
||||||
(_match, type, id) => `MEDIA:spotify:${type}:${id}` |
|
||||||
); |
|
||||||
|
|
||||||
// Video files
|
|
||||||
processed = processed.replace( |
|
||||||
/(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv))/gi, |
|
||||||
(_match, url) => `MEDIA:video:${url}` |
|
||||||
); |
|
||||||
|
|
||||||
// Audio files
|
|
||||||
processed = processed.replace( |
|
||||||
/(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp3|m4a|wav|flac|aac|opus|wma|ogg))/gi, |
|
||||||
(_match, url) => `MEDIA:audio:${url}` |
|
||||||
); |
|
||||||
|
|
||||||
return processed; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Process images (Markdown syntax and bare URLs) |
|
||||||
*/ |
|
||||||
function processImages(content: string, format: ContentFormat): string { |
|
||||||
let processed = content; |
|
||||||
|
|
||||||
// Markdown image syntax: 
|
|
||||||
processed = processed.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => { |
|
||||||
const cleanedUrl = cleanUrl(url); |
|
||||||
const cleanAlt = alt.trim(); |
|
||||||
return `image::${cleanedUrl}[${cleanAlt ? cleanAlt + ',' : ''}width=100%]`; |
|
||||||
}); |
|
||||||
|
|
||||||
// Bare image URLs (only if not already in a link or image tag)
|
|
||||||
if (format === ContentFormat.Markdown || format === ContentFormat.Plain) { |
|
||||||
const imageUrlPattern = /(?<!\]\()(?<!image::)(?<!link:)(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(jpeg|jpg|png|gif|webp|svg))/gi; |
|
||||||
processed = processed.replace(imageUrlPattern, (match, url) => { |
|
||||||
const cleanedUrl = cleanUrl(url); |
|
||||||
return `image::${cleanedUrl}[width=100%]`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
return processed; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Process links (Markdown syntax and bare URLs) |
|
||||||
*/ |
|
||||||
function processLinks(content: string, format: ContentFormat): string { |
|
||||||
let processed = content; |
|
||||||
|
|
||||||
// Markdown link syntax: [text](url)
|
|
||||||
processed = processed.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => { |
|
||||||
// Skip if this is already processed as an image
|
|
||||||
if (text.startsWith('!')) { |
|
||||||
return _match; |
|
||||||
} |
|
||||||
const cleanedUrl = cleanUrl(url); |
|
||||||
return `link:${cleanedUrl}[${text}]`; |
|
||||||
}); |
|
||||||
|
|
||||||
// Bare URLs (only for Markdown and Plain formats)
|
|
||||||
if (format === ContentFormat.Markdown || format === ContentFormat.Plain) { |
|
||||||
processed = processBareUrls(processed); |
|
||||||
} |
|
||||||
|
|
||||||
return processed; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Process bare URLs and convert to link: macros |
|
||||||
* Handles http://, https://, www., and wss:// URLs
|
|
||||||
*/ |
|
||||||
function processBareUrls(content: string): string { |
|
||||||
// URL pattern: matches http://, https://, www., and wss://
|
|
||||||
// Negative lookbehind to avoid matching URLs after ":" (e.g., "hyperlink: www.example.com")
|
|
||||||
const urlPattern = /(?<!:\s)(?<!\]\()\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+|wss:\/\/[^\s<>"{}|\\^`\[\]()]+|www\.[^\s<>"{}|\\^`\[\]()]+)/gi; |
|
||||||
|
|
||||||
return content.replace(urlPattern, (match, url) => { |
|
||||||
// Skip if already in a link or image macro
|
|
||||||
if (match.includes('link:') || match.includes('image::')) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
|
|
||||||
let fullUrl = url; |
|
||||||
let displayText = url; |
|
||||||
|
|
||||||
// Handle www. URLs
|
|
||||||
if (url.startsWith('www.')) { |
|
||||||
fullUrl = 'https://' + url; |
|
||||||
displayText = url; |
|
||||||
} |
|
||||||
// Handle wss:// URLs - convert to https:// for the link, but keep wss:// in display
|
|
||||||
else if (url.startsWith('wss://')) { |
|
||||||
fullUrl = url.replace(/^wss:\/\//, 'https://'); |
|
||||||
displayText = url; // Keep wss:// in display text
|
|
||||||
} |
|
||||||
|
|
||||||
// Clean the URL (remove tracking parameters)
|
|
||||||
fullUrl = cleanUrl(fullUrl); |
|
||||||
|
|
||||||
// Create AsciiDoc link macro
|
|
||||||
return `link:${fullUrl}[${displayText}]`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Clean URL by removing tracking parameters |
|
||||||
*/ |
|
||||||
function cleanUrl(url: string): string { |
|
||||||
try { |
|
||||||
const parsedUrl = new URL(url); |
|
||||||
|
|
||||||
// List of tracking parameters to remove
|
|
||||||
const trackingParams = [ |
|
||||||
// Google Analytics & Ads
|
|
||||||
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', |
|
||||||
'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic', |
|
||||||
'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid', |
|
||||||
|
|
||||||
// Facebook
|
|
||||||
'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref', |
|
||||||
|
|
||||||
// Twitter/X
|
|
||||||
'twclid', 'twsrc', |
|
||||||
|
|
||||||
// Microsoft/Bing
|
|
||||||
'msclkid', 'mc_cid', 'mc_eid', |
|
||||||
|
|
||||||
// Adobe
|
|
||||||
'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid', |
|
||||||
|
|
||||||
// Mailchimp
|
|
||||||
'mc_cid', 'mc_eid', |
|
||||||
|
|
||||||
// HubSpot
|
|
||||||
'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver', |
|
||||||
|
|
||||||
// Marketo
|
|
||||||
'mkt_tok', |
|
||||||
|
|
||||||
// YouTube
|
|
||||||
'si', 'feature', 'kw', 'pp', |
|
||||||
|
|
||||||
// Other common tracking
|
|
||||||
'ref', 'referrer', 'source', 'campaign', 'medium', 'content', |
|
||||||
'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd', |
|
||||||
|
|
||||||
// Mobile app tracking
|
|
||||||
'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative', |
|
||||||
|
|
||||||
// Amazon
|
|
||||||
'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag', |
|
||||||
|
|
||||||
// Affiliate tracking
|
|
||||||
'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer', |
|
||||||
|
|
||||||
// Social media share tracking
|
|
||||||
'share', 'shared', 'sharesource' |
|
||||||
]; |
|
||||||
|
|
||||||
// Remove all tracking parameters
|
|
||||||
trackingParams.forEach(param => { |
|
||||||
parsedUrl.searchParams.delete(param); |
|
||||||
}); |
|
||||||
|
|
||||||
// Remove any parameter that starts with utm_ or _
|
|
||||||
Array.from(parsedUrl.searchParams.keys()).forEach(key => { |
|
||||||
if (key.startsWith('utm_') || key.startsWith('_')) { |
|
||||||
parsedUrl.searchParams.delete(key); |
|
||||||
} |
|
||||||
}); |
|
||||||
|
|
||||||
return parsedUrl.toString(); |
|
||||||
} catch { |
|
||||||
// If URL parsing fails, return original URL
|
|
||||||
return url; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Convert Markdown-specific syntax to AsciiDoc |
|
||||||
*/ |
|
||||||
function convertMarkdownToAsciidoc(content: string): string { |
|
||||||
// Most Markdown syntax is handled by AsciiDoctor's markdown support
|
|
||||||
// This function can be extended for additional conversions if needed
|
|
||||||
return content; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Convert Wikipedia-specific syntax to AsciiDoc |
|
||||||
*/ |
|
||||||
function convertWikipediaToAsciidoc(content: string): string { |
|
||||||
// Wikipedia-specific conversions can be added here
|
|
||||||
return content; |
|
||||||
} |
|
||||||
@ -1,70 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.detectFormat = detectFormat; |
|
||||||
const types_1 = require("./types"); |
|
||||||
/**
 * Detects the content format based on content patterns.
 *
 * Scoring: each family of syntax markers found bumps that format's score.
 * Wikipedia wins with >= 2 hits; otherwise AsciiDoc if it beats Markdown
 * with >= 2 hits; otherwise Markdown on any hit; else Plain.
 */
function detectFormat(content) {
    // Check for AsciiDoc indicators
    const asciidocIndicators = [
        '= ', // Title
        '== ', // Section
        '=== ', // Subsection
        'include::', // Include directive
        'image::', // Image block
        '[source', // Source block
        '----', // Listing block
        '....', // Literal block
        '|===', // Table
        'link:', // AsciiDoc link format
        'wikilink:', // Wikilink macro
        'hashtag:', // Hashtag macro
    ];
    let asciidocScore = 0;
    for (const indicator of asciidocIndicators) {
        if (content.includes(indicator)) {
            asciidocScore++;
        }
    }
    // Check for Wikipedia markup indicators (== Heading == format).
    // FIX: MediaWiki emphasis is ''italic'' (two quotes) and '''bold'''
    // (three quotes). The previous patterns treated ''…'' as bold and a
    // single-quote pair '…' as italic; the latter matched ordinary
    // apostrophes in plain prose and inflated the Wikipedia score,
    // misclassifying non-wiki text.
    const wikipediaIndicators = [
        /^==+\s+.+?\s+==+$/m, // Wikipedia headings: == Heading ==
        /\[\[[^\]]+\]\]/, // Wikipedia links: [[Page]]
        /'''[^']+'''/, // Wikipedia bold: '''text'''
        /''[^']+''/, // Wikipedia italic: ''text''
    ];
    let wikipediaScore = 0;
    for (const indicator of wikipediaIndicators) {
        if (indicator.test(content)) {
            wikipediaScore++;
        }
    }
    // Check for Markdown indicators (more specific patterns to avoid false positives)
    const markdownIndicators = [
        /^#{1,6}\s+/m, // Heading at start of line
        /```[\s\S]*?```/, // Code block
        /\*\*[^*]+\*\*/, // Bold text
        /^[-*+]\s+/m, // List item at start of line
        /!\[[^\]]*\]\([^)]+\)/, // Image syntax
        /\[[^\]]+\]\([^)]+\)/, // Link syntax
    ];
    let markdownScore = 0;
    for (const indicator of markdownIndicators) {
        if (indicator.test(content)) {
            markdownScore++;
        }
    }
    // Determine format based on scores
    // Wikipedia format takes precedence if detected (it's more specific)
    if (wikipediaScore > 0 && wikipediaScore >= 2) {
        return types_1.ContentFormat.Wikipedia;
    }
    else if (asciidocScore > markdownScore && asciidocScore >= 2) {
        return types_1.ContentFormat.AsciiDoc;
    }
    else if (markdownScore > 0) {
        return types_1.ContentFormat.Markdown;
    }
    return types_1.ContentFormat.Plain;
}
|
||||||
@ -1,160 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.extractFrontmatter = extractFrontmatter; |
|
||||||
/**
 * Extracts front matter from content.
 * Handles both YAML front matter (--- ... ---) and AsciiDoc document header
 * attributes (:key: value).
 * Returns the front matter object and the content.
 * For YAML: removes front matter from content.
 * For AsciiDoc: removes header from content and extracts it as metadata
 * (prevents the header from appearing in rendered output).
 *
 * @param {string} content - Raw document text.
 * @returns {{frontmatter?: Object, content: string}} frontmatter is omitted
 *   when no metadata was found.
 */
function extractFrontmatter(content) {
    // First, try to match YAML front matter: ---\n...\n---
    const yamlFrontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/;
    const yamlMatch = content.match(yamlFrontmatterRegex);
    if (yamlMatch) {
        const yamlContent = yamlMatch[1];
        const contentWithoutFrontmatter = yamlMatch[2];
        // Simple YAML parser for basic key-value pairs and arrays.
        // This is a basic implementation - for complex YAML (nesting, multi-line
        // scalars), consider using a library.
        const frontmatter = {};
        const lines = yamlContent.split('\n');
        // currentKey: the most recent scalar key seen — a following "- item"
        // line turns it into an array key.
        let currentKey = null;
        let inArray = false;
        let arrayKey = null;
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            const trimmed = line.trim();
            // Skip empty lines and comments
            if (!trimmed || trimmed.startsWith('#')) {
                if (inArray && trimmed === '') {
                    // Empty line might end the array
                    inArray = false;
                    arrayKey = null;
                }
                continue;
            }
            // Array item (line starting with -)
            if (trimmed.startsWith('- ')) {
                const item = trimmed.substring(2).trim();
                // Strip a single leading/trailing quote character, if any.
                const cleanItem = item.replace(/^["']|["']$/g, '');
                if (arrayKey && frontmatter[arrayKey]) {
                    frontmatter[arrayKey].push(cleanItem);
                }
                else if (currentKey) {
                    // Start new array under the key seen on the previous line.
                    arrayKey = currentKey;
                    inArray = true;
                    frontmatter[currentKey] = [cleanItem];
                }
                continue;
            }
            // Key-value pair (word-character keys only).
            const keyValueMatch = trimmed.match(/^(\w+):\s*(.+)$/);
            if (keyValueMatch) {
                const key = keyValueMatch[1];
                let value = keyValueMatch[2].trim();
                // Remove quotes if present
                if ((value.startsWith('"') && value.endsWith('"')) ||
                    (value.startsWith("'") && value.endsWith("'"))) {
                    value = value.slice(1, -1);
                }
                frontmatter[key] = value;
                currentKey = key;
                inArray = false;
                arrayKey = null;
                continue;
            }
        }
        return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutFrontmatter };
    }
    // If no YAML front matter, try to extract AsciiDoc document header attributes.
    // AsciiDoc format: = Title\nAuthor\nRevision\n:attribute: value\n...
    // Match header lines until we hit a blank line (which separates header from body).
    // The header consists of: title line, optional author/revision lines, and attribute lines.
    const lines = content.split('\n');
    let headerEndIndex = 0;
    // Find where the header ends (first blank line after title/attributes)
    if (lines[0] && lines[0].match(/^=+\s+/)) {
        // We have a title line, now find where header ends
        let i = 1;
        // Skip author and revision lines (non-empty lines that don't start with :)
        while (i < lines.length && lines[i].trim() && !lines[i].trim().startsWith(':')) {
            i++;
        }
        // Now skip attribute lines (lines starting with :)
        while (i < lines.length && lines[i].trim().startsWith(':')) {
            i++;
        }
        // Skip the blank line that separates header from body
        if (i < lines.length && lines[i].trim() === '') {
            i++;
        }
        headerEndIndex = i;
    }
    // If we found a header, extract it
    if (headerEndIndex > 0) {
        const headerLines = lines.slice(0, headerEndIndex);
        const headerContent = headerLines.join('\n');
        const contentWithoutHeader = lines.slice(headerEndIndex).join('\n');
        const frontmatter = {};
        const headerLinesArray = headerContent.split('\n');
        // Extract title (first line starting with =)
        const titleMatch = headerLinesArray[0].match(/^=+\s+(.+)$/);
        if (titleMatch) {
            frontmatter.title = titleMatch[1].trim();
        }
        // Extract author (line after title, if it doesn't start with :)
        if (headerLinesArray.length > 1 && !headerLinesArray[1].trim().startsWith(':')) {
            const authorLine = headerLinesArray[1].trim();
            if (authorLine && !authorLine.match(/^[\d.,\s:]+$/)) {
                // Not a revision line (which has numbers, commas, colons)
                frontmatter.author = authorLine;
            }
        }
        // Extract revision (line with version, date, remark format: "2.9, October 31, 2021: Fall incarnation")
        // NOTE(review): the /^[\d.,\s:]+$/ gate only admits lines made of
        // digits/punctuation, so a revision remark containing letters (as in
        // the example above) would not match here — confirm intended.
        for (let i = 1; i < headerLinesArray.length; i++) {
            const line = headerLinesArray[i].trim();
            if (line.match(/^[\d.,\s:]+$/)) {
                // This looks like a revision line
                const revisionMatch = line.match(/^([^,]+),\s*([^:]+)(?::\s*(.+))?$/);
                if (revisionMatch) {
                    frontmatter.version = revisionMatch[1].trim();
                    frontmatter.date = revisionMatch[2].trim();
                    if (revisionMatch[3]) {
                        frontmatter.revision = revisionMatch[3].trim();
                    }
                }
                break;
            }
        }
        // Extract AsciiDoc attributes (:key: value)
        for (const line of headerLinesArray) {
            const trimmed = line.trim();
            if (trimmed.startsWith(':') && trimmed.includes(':')) {
                const attrMatch = trimmed.match(/^:([^:]+):\s*(.+)$/);
                if (attrMatch) {
                    const key = attrMatch[1].trim();
                    let value = attrMatch[2].trim();
                    // Remove quotes if present
                    if ((value.startsWith('"') && value.endsWith('"')) ||
                        (value.startsWith("'") && value.endsWith("'"))) {
                        value = value.slice(1, -1);
                    }
                    // Handle comma-separated values (like keywords); only when the
                    // value has no spaces, to avoid splitting prose on commas.
                    if (value.includes(',') && !value.includes(' ')) {
                        frontmatter[key] = value.split(',').map((v) => v.trim());
                    }
                    else {
                        frontmatter[key] = value;
                    }
                }
            }
        }
        // For AsciiDoc, remove the header from content to prevent it from appearing in rendered output.
        // AsciiDoctor can work without the header, and we've already extracted the metadata.
        return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutHeader };
    }
    // No front matter found
    return { content };
}
|
||||||
@ -1,177 +0,0 @@ |
|||||||
/** |
|
||||||
* Extracts front matter from content |
|
||||||
* Handles both YAML front matter (--- ... ---) and AsciiDoc document header attributes (:key: value) |
|
||||||
* Returns the front matter object and the content |
|
||||||
* For YAML: removes front matter from content |
|
||||||
* For AsciiDoc: removes header from content and extracts as metadata (prevents header from appearing in rendered output) |
|
||||||
*/ |
|
||||||
export function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } { |
|
||||||
// First, try to match YAML front matter: ---\n...\n---
|
|
||||||
const yamlFrontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/; |
|
||||||
const yamlMatch = content.match(yamlFrontmatterRegex); |
|
||||||
|
|
||||||
if (yamlMatch) { |
|
||||||
const yamlContent = yamlMatch[1]; |
|
||||||
const contentWithoutFrontmatter = yamlMatch[2]; |
|
||||||
|
|
||||||
// Simple YAML parser for basic key-value pairs and arrays
|
|
||||||
// This is a basic implementation - for complex YAML, consider using a library
|
|
||||||
const frontmatter: Record<string, any> = {}; |
|
||||||
const lines = yamlContent.split('\n'); |
|
||||||
let currentKey: string | null = null; |
|
||||||
let inArray = false; |
|
||||||
let arrayKey: string | null = null; |
|
||||||
|
|
||||||
for (let i = 0; i < lines.length; i++) { |
|
||||||
const line = lines[i]; |
|
||||||
const trimmed = line.trim(); |
|
||||||
|
|
||||||
// Skip empty lines and comments
|
|
||||||
if (!trimmed || trimmed.startsWith('#')) { |
|
||||||
if (inArray && trimmed === '') { |
|
||||||
// Empty line might end the array
|
|
||||||
inArray = false; |
|
||||||
arrayKey = null; |
|
||||||
} |
|
||||||
continue; |
|
||||||
} |
|
||||||
|
|
||||||
// Array item (line starting with -)
|
|
||||||
if (trimmed.startsWith('- ')) { |
|
||||||
const item = trimmed.substring(2).trim(); |
|
||||||
const cleanItem = item.replace(/^["']|["']$/g, ''); |
|
||||||
|
|
||||||
if (arrayKey && frontmatter[arrayKey]) { |
|
||||||
frontmatter[arrayKey].push(cleanItem); |
|
||||||
} else if (currentKey) { |
|
||||||
// Start new array
|
|
||||||
arrayKey = currentKey; |
|
||||||
inArray = true; |
|
||||||
frontmatter[currentKey] = [cleanItem]; |
|
||||||
} |
|
||||||
continue; |
|
||||||
} |
|
||||||
|
|
||||||
// Key-value pair
|
|
||||||
const keyValueMatch = trimmed.match(/^(\w+):\s*(.+)$/); |
|
||||||
if (keyValueMatch) { |
|
||||||
const key = keyValueMatch[1]; |
|
||||||
let value = keyValueMatch[2].trim(); |
|
||||||
|
|
||||||
// Remove quotes if present
|
|
||||||
if ((value.startsWith('"') && value.endsWith('"')) ||
|
|
||||||
(value.startsWith("'") && value.endsWith("'"))) { |
|
||||||
value = value.slice(1, -1); |
|
||||||
} |
|
||||||
|
|
||||||
frontmatter[key] = value; |
|
||||||
currentKey = key; |
|
||||||
inArray = false; |
|
||||||
arrayKey = null; |
|
||||||
continue; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutFrontmatter }; |
|
||||||
} |
|
||||||
|
|
||||||
// If no YAML front matter, try to extract AsciiDoc document header attributes
|
|
||||||
// AsciiDoc format: = Title\nAuthor\nRevision\n:attribute: value\n...
|
|
||||||
// Match header lines until we hit a blank line (which separates header from body)
|
|
||||||
// The header consists of: title line, optional author/revision lines, and attribute lines
|
|
||||||
const lines = content.split('\n'); |
|
||||||
let headerEndIndex = 0; |
|
||||||
|
|
||||||
// Find where the header ends (first blank line after title/attributes)
|
|
||||||
if (lines[0] && lines[0].match(/^=+\s+/)) { |
|
||||||
// We have a title line, now find where header ends
|
|
||||||
let i = 1; |
|
||||||
// Skip author and revision lines (non-empty lines that don't start with :)
|
|
||||||
while (i < lines.length && lines[i].trim() && !lines[i].trim().startsWith(':')) { |
|
||||||
i++; |
|
||||||
} |
|
||||||
// Now skip attribute lines (lines starting with :)
|
|
||||||
while (i < lines.length && lines[i].trim().startsWith(':')) { |
|
||||||
i++; |
|
||||||
} |
|
||||||
// Skip the blank line that separates header from body
|
|
||||||
if (i < lines.length && lines[i].trim() === '') { |
|
||||||
i++; |
|
||||||
} |
|
||||||
headerEndIndex = i; |
|
||||||
} |
|
||||||
|
|
||||||
// If we found a header, extract it
|
|
||||||
if (headerEndIndex > 0) { |
|
||||||
const headerLines = lines.slice(0, headerEndIndex); |
|
||||||
const headerContent = headerLines.join('\n'); |
|
||||||
const contentWithoutHeader = lines.slice(headerEndIndex).join('\n'); |
|
||||||
|
|
||||||
const frontmatter: Record<string, any> = {}; |
|
||||||
const headerLinesArray = headerContent.split('\n'); |
|
||||||
|
|
||||||
// Extract title (first line starting with =)
|
|
||||||
const titleMatch = headerLinesArray[0].match(/^=+\s+(.+)$/); |
|
||||||
if (titleMatch) { |
|
||||||
frontmatter.title = titleMatch[1].trim(); |
|
||||||
} |
|
||||||
|
|
||||||
// Extract author (line after title, if it doesn't start with :)
|
|
||||||
if (headerLinesArray.length > 1 && !headerLinesArray[1].trim().startsWith(':')) { |
|
||||||
const authorLine = headerLinesArray[1].trim(); |
|
||||||
if (authorLine && !authorLine.match(/^[\d.,\s:]+$/)) { |
|
||||||
// Not a revision line (which has numbers, commas, colons)
|
|
||||||
frontmatter.author = authorLine; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Extract revision (line with version, date, remark format: "2.9, October 31, 2021: Fall incarnation")
|
|
||||||
for (let i = 1; i < headerLinesArray.length; i++) { |
|
||||||
const line = headerLinesArray[i].trim(); |
|
||||||
if (line.match(/^[\d.,\s:]+$/)) { |
|
||||||
// This looks like a revision line
|
|
||||||
const revisionMatch = line.match(/^([^,]+),\s*([^:]+)(?::\s*(.+))?$/); |
|
||||||
if (revisionMatch) { |
|
||||||
frontmatter.version = revisionMatch[1].trim(); |
|
||||||
frontmatter.date = revisionMatch[2].trim(); |
|
||||||
if (revisionMatch[3]) { |
|
||||||
frontmatter.revision = revisionMatch[3].trim(); |
|
||||||
} |
|
||||||
} |
|
||||||
break; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Extract AsciiDoc attributes (:key: value)
|
|
||||||
for (const line of headerLinesArray) { |
|
||||||
const trimmed = line.trim(); |
|
||||||
if (trimmed.startsWith(':') && trimmed.includes(':')) { |
|
||||||
const attrMatch = trimmed.match(/^:([^:]+):\s*(.+)$/); |
|
||||||
if (attrMatch) { |
|
||||||
const key = attrMatch[1].trim(); |
|
||||||
let value = attrMatch[2].trim(); |
|
||||||
|
|
||||||
// Remove quotes if present
|
|
||||||
if ((value.startsWith('"') && value.endsWith('"')) ||
|
|
||||||
(value.startsWith("'") && value.endsWith("'"))) { |
|
||||||
value = value.slice(1, -1); |
|
||||||
} |
|
||||||
|
|
||||||
// Handle comma-separated values (like keywords)
|
|
||||||
if (value.includes(',') && !value.includes(' ')) { |
|
||||||
frontmatter[key] = value.split(',').map((v: string) => v.trim()); |
|
||||||
} else { |
|
||||||
frontmatter[key] = value; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// For AsciiDoc, remove the header from content to prevent it from appearing in rendered output
|
|
||||||
// AsciiDoctor can work without the header, and we've already extracted the metadata
|
|
||||||
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutHeader }; |
|
||||||
} |
|
||||||
|
|
||||||
// No front matter found
|
|
||||||
return { content }; |
|
||||||
} |
|
||||||
@ -1,243 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.extractMetadata = extractMetadata; |
|
||||||
/** |
|
||||||
* Extracts metadata from content before processing |
|
||||||
*/ |
|
||||||
function extractMetadata(content, linkBaseURL) { |
|
||||||
return { |
|
||||||
nostrLinks: extractNostrLinks(content), |
|
||||||
wikilinks: extractWikilinks(content), |
|
||||||
hashtags: extractHashtags(content), |
|
||||||
links: extractLinks(content, linkBaseURL), |
|
||||||
media: extractMedia(content), |
|
||||||
}; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Extract Nostr links from content |
|
||||||
*/ |
|
||||||
function extractNostrLinks(content) { |
|
||||||
const nostrLinks = []; |
|
||||||
const seen = new Set(); |
|
||||||
// Extract nostr: prefixed links (valid bech32 format)
|
|
||||||
const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || []; |
|
||||||
nostrMatches.forEach(match => { |
|
||||||
const id = match.substring(6); // Remove 'nostr:'
|
|
||||||
const type = getNostrType(id); |
|
||||||
if (type && !seen.has(id)) { |
|
||||||
seen.add(id); |
|
||||||
nostrLinks.push({ |
|
||||||
type, |
|
||||||
id, |
|
||||||
text: match, |
|
||||||
bech32: id, |
|
||||||
}); |
|
||||||
} |
|
||||||
}); |
|
||||||
return nostrLinks; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Extract wikilinks from content |
|
||||||
*/ |
|
||||||
function extractWikilinks(content) { |
|
||||||
const wikilinks = []; |
|
||||||
const seen = new Set(); |
|
||||||
// Match [[target]] or [[target|display]]
|
|
||||||
const wikilinkPattern = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g; |
|
||||||
let match; |
|
||||||
while ((match = wikilinkPattern.exec(content)) !== null) { |
|
||||||
const target = match[1].trim(); |
|
||||||
const display = match[2] ? match[2].trim() : target; |
|
||||||
const dtag = normalizeDtag(target); |
|
||||||
const key = `${dtag}|${display}`; |
|
||||||
if (!seen.has(key)) { |
|
||||||
seen.add(key); |
|
||||||
wikilinks.push({ |
|
||||||
dtag, |
|
||||||
display, |
|
||||||
original: match[0], |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
|
||||||
return wikilinks; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Extract hashtags from content |
|
||||||
* Excludes hashtags in URLs, code blocks, and inline code |
|
||||||
*/ |
|
||||||
function extractHashtags(content) { |
|
||||||
const hashtags = []; |
|
||||||
const seen = new Set(); |
|
||||||
// Remove code blocks first to avoid matching inside them
|
|
||||||
const codeBlockPattern = /```[\s\S]*?```/g; |
|
||||||
const inlineCodePattern = /`[^`]+`/g; |
|
||||||
const urlPattern = /https?:\/\/[^\s<>"']+/g; |
|
||||||
let processedContent = content |
|
||||||
.replace(codeBlockPattern, '') // Remove code blocks
|
|
||||||
.replace(inlineCodePattern, '') // Remove inline code
|
|
||||||
.replace(urlPattern, ''); // Remove URLs
|
|
||||||
// Extract hashtags: #hashtag (word boundary to avoid matching in URLs)
|
|
||||||
const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g; |
|
||||||
let match; |
|
||||||
while ((match = hashtagPattern.exec(processedContent)) !== null) { |
|
||||||
const tag = match[1].toLowerCase(); |
|
||||||
if (!seen.has(tag)) { |
|
||||||
hashtags.push(tag); |
|
||||||
seen.add(tag); |
|
||||||
} |
|
||||||
} |
|
||||||
return hashtags; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Extract regular links from content |
|
||||||
*/ |
|
||||||
function extractLinks(content, linkBaseURL) { |
|
||||||
const links = []; |
|
||||||
const seen = new Set(); |
|
||||||
// Extract markdown links: [text](url) - optimized to avoid double matching
|
|
||||||
const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g; |
|
||||||
let markdownMatch; |
|
||||||
while ((markdownMatch = markdownLinkPattern.exec(content)) !== null) { |
|
||||||
const [, text, url] = markdownMatch; |
|
||||||
if (!seen.has(url) && !isNostrUrl(url)) { |
|
||||||
seen.add(url); |
|
||||||
links.push({ |
|
||||||
url, |
|
||||||
text, |
|
||||||
isExternal: isExternalUrl(url, linkBaseURL), |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
|
||||||
// Extract asciidoc links: link:url[text] - optimized to avoid double matching
|
|
||||||
const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g; |
|
||||||
let asciidocMatch; |
|
||||||
while ((asciidocMatch = asciidocLinkPattern.exec(content)) !== null) { |
|
||||||
const [, url, text] = asciidocMatch; |
|
||||||
if (!seen.has(url) && !isNostrUrl(url)) { |
|
||||||
seen.add(url); |
|
||||||
links.push({ |
|
||||||
url, |
|
||||||
text, |
|
||||||
isExternal: isExternalUrl(url, linkBaseURL), |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
|
||||||
// Extract raw URLs (basic pattern)
|
|
||||||
const urlPattern = /https?:\/\/[^\s<>"']+/g; |
|
||||||
const rawUrls = content.match(urlPattern) || []; |
|
||||||
rawUrls.forEach(url => { |
|
||||||
if (!seen.has(url) && !isNostrUrl(url)) { |
|
||||||
seen.add(url); |
|
||||||
links.push({ |
|
||||||
url, |
|
||||||
text: url, |
|
||||||
isExternal: isExternalUrl(url, linkBaseURL), |
|
||||||
}); |
|
||||||
} |
|
||||||
}); |
|
||||||
return links; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Extract media URLs from content |
|
||||||
*/ |
|
||||||
function extractMedia(content) { |
|
||||||
const media = []; |
|
||||||
const seen = new Set(); |
|
||||||
// Extract markdown images:  - optimized to avoid double matching
|
|
||||||
const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g; |
|
||||||
let markdownImageMatch; |
|
||||||
while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) { |
|
||||||
const url = markdownImageMatch[1]; |
|
||||||
if (url && !seen.has(url)) { |
|
||||||
if (isImageUrl(url) || isVideoUrl(url)) { |
|
||||||
media.push(url); |
|
||||||
seen.add(url); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
// Extract asciidoc images: image::url[alt] - optimized to avoid double matching
|
|
||||||
const asciidocImagePattern = /image::([^\[]+)\[/g; |
|
||||||
let asciidocImageMatch; |
|
||||||
while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) { |
|
||||||
const url = asciidocImageMatch[1]; |
|
||||||
if (url && !seen.has(url)) { |
|
||||||
if (isImageUrl(url) || isVideoUrl(url)) { |
|
||||||
media.push(url); |
|
||||||
seen.add(url); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
// Extract raw image/video URLs
|
|
||||||
const urlPattern = /https?:\/\/[^\s<>"']+/g; |
|
||||||
const rawUrls = content.match(urlPattern) || []; |
|
||||||
rawUrls.forEach(url => { |
|
||||||
if (!seen.has(url) && (isImageUrl(url) || isVideoUrl(url))) { |
|
||||||
media.push(url); |
|
||||||
seen.add(url); |
|
||||||
} |
|
||||||
}); |
|
||||||
return media; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Get Nostr identifier type |
|
||||||
*/ |
|
||||||
function getNostrType(id) { |
|
||||||
if (id.startsWith('npub')) |
|
||||||
return 'npub'; |
|
||||||
if (id.startsWith('nprofile')) |
|
||||||
return 'nprofile'; |
|
||||||
if (id.startsWith('nevent')) |
|
||||||
return 'nevent'; |
|
||||||
if (id.startsWith('naddr')) |
|
||||||
return 'naddr'; |
|
||||||
if (id.startsWith('note')) |
|
||||||
return 'note'; |
|
||||||
return null; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Normalize text to d-tag format |
|
||||||
*/ |
|
||||||
function normalizeDtag(text) { |
|
||||||
return text |
|
||||||
.toLowerCase() |
|
||||||
.replace(/[^a-z0-9]+/g, '-') |
|
||||||
.replace(/^-+|-+$/g, ''); |
|
||||||
} |
|
||||||
/** |
|
||||||
* Check if URL is external |
|
||||||
*/ |
|
||||||
function isExternalUrl(url, linkBaseURL) { |
|
||||||
if (!linkBaseURL) |
|
||||||
return true; |
|
||||||
try { |
|
||||||
// Use a simple string-based check for Node.js compatibility
|
|
||||||
// Extract hostname from URL string
|
|
||||||
const urlMatch = url.match(/^https?:\/\/([^\/]+)/); |
|
||||||
const baseMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/); |
|
||||||
if (urlMatch && baseMatch) { |
|
||||||
return urlMatch[1] !== baseMatch[1]; |
|
||||||
} |
|
||||||
return true; |
|
||||||
} |
|
||||||
catch { |
|
||||||
return true; |
|
||||||
} |
|
||||||
} |
|
||||||
/** |
|
||||||
* Check if URL is a Nostr URL |
|
||||||
*/ |
|
||||||
function isNostrUrl(url) { |
|
||||||
return url.startsWith('nostr:') || getNostrType(url) !== null; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Check if URL is an image |
|
||||||
*/ |
|
||||||
function isImageUrl(url) { |
|
||||||
return /\.(jpeg|jpg|png|gif|webp|svg)$/i.test(url); |
|
||||||
} |
|
||||||
/** |
|
||||||
* Check if URL is a video |
|
||||||
*/ |
|
||||||
function isVideoUrl(url) { |
|
||||||
return /\.(mp4|webm|ogg)$/i.test(url); |
|
||||||
} |
|
||||||
@ -1,396 +0,0 @@ |
|||||||
import { NostrLink, Wikilink } from '../types'; |
|
||||||
|
|
||||||
export interface ExtractedMetadata { |
|
||||||
nostrLinks: NostrLink[]; |
|
||||||
wikilinks: Wikilink[]; |
|
||||||
hashtags: string[]; |
|
||||||
links: Array<{ url: string; text: string; isExternal: boolean }>; |
|
||||||
media: string[]; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Extracts metadata from content before processing |
|
||||||
*/ |
|
||||||
export function extractMetadata(content: string, linkBaseURL: string): ExtractedMetadata { |
|
||||||
return { |
|
||||||
nostrLinks: extractNostrLinks(content), |
|
||||||
wikilinks: extractWikilinks(content), |
|
||||||
hashtags: extractHashtags(content), |
|
||||||
links: extractLinks(content, linkBaseURL), |
|
||||||
media: extractMedia(content), |
|
||||||
}; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Extract Nostr links from content |
|
||||||
*/ |
|
||||||
function extractNostrLinks(content: string): NostrLink[] { |
|
||||||
const nostrLinks: NostrLink[] = []; |
|
||||||
const seen = new Set<string>(); |
|
||||||
|
|
||||||
// Extract nostr: prefixed links (valid bech32 format)
|
|
||||||
const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || []; |
|
||||||
nostrMatches.forEach(match => { |
|
||||||
const id = match.substring(6); // Remove 'nostr:'
|
|
||||||
const type = getNostrType(id); |
|
||||||
if (type && !seen.has(id)) { |
|
||||||
seen.add(id); |
|
||||||
nostrLinks.push({ |
|
||||||
type, |
|
||||||
id, |
|
||||||
text: match, |
|
||||||
bech32: id, |
|
||||||
}); |
|
||||||
} |
|
||||||
}); |
|
||||||
|
|
||||||
return nostrLinks; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Extract wikilinks from content |
|
||||||
*/ |
|
||||||
function extractWikilinks(content: string): Wikilink[] { |
|
||||||
const wikilinks: Wikilink[] = []; |
|
||||||
const seen = new Set<string>(); |
|
||||||
|
|
||||||
// Match [[target]] or [[target|display]]
|
|
||||||
const wikilinkPattern = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g; |
|
||||||
let match; |
|
||||||
|
|
||||||
while ((match = wikilinkPattern.exec(content)) !== null) { |
|
||||||
const target = match[1].trim(); |
|
||||||
const display = match[2] ? match[2].trim() : target; |
|
||||||
const dtag = normalizeDtag(target); |
|
||||||
const key = `${dtag}|${display}`; |
|
||||||
|
|
||||||
if (!seen.has(key)) { |
|
||||||
seen.add(key); |
|
||||||
wikilinks.push({ |
|
||||||
dtag, |
|
||||||
display, |
|
||||||
original: match[0], |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
return wikilinks; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Extract hashtags from content |
|
||||||
* Excludes hashtags in URLs, code blocks, and inline code |
|
||||||
*/ |
|
||||||
function extractHashtags(content: string): string[] { |
|
||||||
const hashtags: string[] = []; |
|
||||||
const seen = new Set<string>(); |
|
||||||
|
|
||||||
// Remove code blocks first to avoid matching inside them
|
|
||||||
const codeBlockPattern = /```[\s\S]*?```/g; |
|
||||||
const inlineCodePattern = /`[^`]+`/g; |
|
||||||
const urlPattern = /https?:\/\/[^\s<>"']+/g; |
|
||||||
|
|
||||||
let processedContent = content |
|
||||||
.replace(codeBlockPattern, '') // Remove code blocks
|
|
||||||
.replace(inlineCodePattern, '') // Remove inline code
|
|
||||||
.replace(urlPattern, ''); // Remove URLs
|
|
||||||
|
|
||||||
// Extract hashtags: #hashtag (word boundary to avoid matching in URLs)
|
|
||||||
const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g; |
|
||||||
let match; |
|
||||||
|
|
||||||
while ((match = hashtagPattern.exec(processedContent)) !== null) { |
|
||||||
const tag = match[1].toLowerCase(); |
|
||||||
if (!seen.has(tag)) { |
|
||||||
hashtags.push(tag); |
|
||||||
seen.add(tag); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
return hashtags; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Extract regular links from content |
|
||||||
*/ |
|
||||||
function extractLinks(content: string, linkBaseURL: string): Array<{ url: string; text: string; isExternal: boolean }> { |
|
||||||
const links: Array<{ url: string; text: string; isExternal: boolean }> = []; |
|
||||||
const seen = new Set<string>(); |
|
||||||
|
|
||||||
// Remove code blocks and inline code to avoid matching URLs inside them
|
|
||||||
const codeBlockPattern = /```[\s\S]*?```/g; |
|
||||||
const inlineCodePattern = /`[^`]+`/g; |
|
||||||
let processedContent = content |
|
||||||
.replace(codeBlockPattern, '') // Remove code blocks
|
|
||||||
.replace(inlineCodePattern, ''); // Remove inline code
|
|
||||||
|
|
||||||
// Extract markdown links: [text](url) - but NOT images 
|
|
||||||
// First, extract nested image links: [](link-url)
|
|
||||||
// These should extract the outer link with the alt text
|
|
||||||
// We also need to mark the inner image URL as seen so it doesn't get extracted as a raw URL
|
|
||||||
const nestedImageLinkPattern = /\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)/g; |
|
||||||
let nestedMatch; |
|
||||||
const nestedImageUrls = new Set<string>(); // Track inner image URLs to exclude them
|
|
||||||
while ((nestedMatch = nestedImageLinkPattern.exec(processedContent)) !== null) { |
|
||||||
const [, altText, imageUrl, linkUrl] = nestedMatch; |
|
||||||
const cleanLinkUrl = linkUrl.trim().replace(/[)\].,;:!?`]+$/, ''); |
|
||||||
const cleanImageUrl = imageUrl.trim().replace(/[)\].,;:!?`]+$/, ''); |
|
||||||
|
|
||||||
// Mark the inner image URL as seen so it doesn't get extracted as a raw URL
|
|
||||||
nestedImageUrls.add(cleanImageUrl); |
|
||||||
// Also mark it in the seen set to prevent it from being extracted as a regular link
|
|
||||||
seen.add(cleanImageUrl); |
|
||||||
|
|
||||||
if (cleanLinkUrl && cleanLinkUrl.match(/^https?:\/\//i) && !isNostrUrl(cleanLinkUrl) && !seen.has(cleanLinkUrl)) { |
|
||||||
seen.add(cleanLinkUrl); |
|
||||||
links.push({ |
|
||||||
url: cleanLinkUrl, |
|
||||||
text: altText.trim() || 'Image link', // Use the alt text from the image (e.g., "Youtube link with pic")
|
|
||||||
isExternal: isExternalUrl(cleanLinkUrl, linkBaseURL), |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Now extract regular markdown links: [text](url) - but NOT images 
|
|
||||||
// Use a pattern that explicitly excludes images by checking before the match
|
|
||||||
const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g; |
|
||||||
let markdownMatch; |
|
||||||
while ((markdownMatch = markdownLinkPattern.exec(processedContent)) !== null) { |
|
||||||
// Check if this is an image (preceded by !)
|
|
||||||
// We need to check the character immediately before the opening bracket
|
|
||||||
const matchIndex = markdownMatch.index; |
|
||||||
if (matchIndex > 0) { |
|
||||||
const charBefore = processedContent[matchIndex - 1]; |
|
||||||
if (charBefore === '!') { |
|
||||||
continue; // Skip images - this is , not [text](url)
|
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
let [, text, url] = markdownMatch; |
|
||||||
|
|
||||||
// Skip if this is a nested image link (we already extracted those above)
|
|
||||||
if (text.trim().startsWith(') { |
|
||||||
continue; // Already handled by nestedImageLinkPattern
|
|
||||||
} |
|
||||||
|
|
||||||
// Handle AsciiDoc image syntax in markdown links: [image::url[alt,width=100%]](link-url)
|
|
||||||
// This happens when AsciiDoc content is converted to markdown-style links
|
|
||||||
if (text.trim().startsWith('image::') || text.trim().startsWith('image:')) { |
|
||||||
// Match image::url[alt,attributes] or image:url[alt,attributes]
|
|
||||||
const imageMatch = text.match(/^image:?:[^\[]+\[([^\],]+)/); |
|
||||||
if (imageMatch) { |
|
||||||
text = imageMatch[1].trim(); // Use just the alt text (e.g., "Youtube link with pic")
|
|
||||||
} else { |
|
||||||
// If we can't extract alt text, use a default
|
|
||||||
text = 'Image link'; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Clean up URL - remove trailing punctuation that might have been captured
|
|
||||||
// But preserve parentheses that are part of the URL (like in query strings)
|
|
||||||
// Only remove trailing punctuation that's clearly not part of the URL
|
|
||||||
url = url.trim(); |
|
||||||
|
|
||||||
// Remove trailing punctuation that's likely not part of the URL
|
|
||||||
// But be careful - URLs can end with ) if they're in markdown like [text](url))
|
|
||||||
// We'll be conservative and only remove if it's clearly punctuation
|
|
||||||
url = url.replace(/[)\].,;:!?`]+$/, ''); |
|
||||||
|
|
||||||
// Clean up text - remove stray punctuation and whitespace
|
|
||||||
text = text.trim(); |
|
||||||
|
|
||||||
// Skip if URL is empty or invalid
|
|
||||||
if (!url || !url.match(/^https?:\/\//i)) { |
|
||||||
continue; |
|
||||||
} |
|
||||||
|
|
||||||
if (!seen.has(url) && !isNostrUrl(url)) { |
|
||||||
seen.add(url); |
|
||||||
links.push({ |
|
||||||
url, |
|
||||||
text, |
|
||||||
isExternal: isExternalUrl(url, linkBaseURL), |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Extract asciidoc links: link:url[text] - optimized to avoid double matching
|
|
||||||
// Handle nested image links: link:url[image::image-url[alt,width=100%]]
|
|
||||||
const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g; |
|
||||||
let asciidocMatch; |
|
||||||
while ((asciidocMatch = asciidocLinkPattern.exec(processedContent)) !== null) { |
|
||||||
let [, url, text] = asciidocMatch; |
|
||||||
|
|
||||||
// Clean up URL
|
|
||||||
url = url.trim(); |
|
||||||
|
|
||||||
// Handle nested image syntax in AsciiDoc: image::url[alt,width=100%]
|
|
||||||
// Extract just the alt text from the image syntax
|
|
||||||
if (text.trim().startsWith('image::') || text.trim().startsWith('image:')) { |
|
||||||
// Match image::url[alt,attributes] or image:url[alt,attributes]
|
|
||||||
const imageMatch = text.match(/^image:?:[^\[]+\[([^\],]+)/); |
|
||||||
if (imageMatch) { |
|
||||||
text = imageMatch[1].trim(); // Use just the alt text
|
|
||||||
} else { |
|
||||||
// If we can't extract alt text, skip this link (it's an image, not a text link)
|
|
||||||
continue; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Clean up text
|
|
||||||
text = text.trim(); |
|
||||||
|
|
||||||
// Skip if URL is empty or invalid
|
|
||||||
if (!url || !url.match(/^https?:\/\//i)) { |
|
||||||
continue; |
|
||||||
} |
|
||||||
|
|
||||||
if (!seen.has(url) && !isNostrUrl(url)) { |
|
||||||
seen.add(url); |
|
||||||
links.push({ |
|
||||||
url, |
|
||||||
text, |
|
||||||
isExternal: isExternalUrl(url, linkBaseURL), |
|
||||||
}); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Extract raw URLs (basic pattern) - but exclude those already in markdown/asciidoc links
|
|
||||||
// More restrictive pattern to avoid capturing trailing punctuation
|
|
||||||
const urlPattern = /https?:\/\/[^\s<>"'`()\[\]]+/g; |
|
||||||
const rawUrls = processedContent.match(urlPattern) || []; |
|
||||||
rawUrls.forEach(url => { |
|
||||||
// Remove trailing punctuation that might have been captured
|
|
||||||
url = url.replace(/[)\].,;:!?`]+$/, ''); |
|
||||||
|
|
||||||
// Skip if URL is too short or invalid
|
|
||||||
if (!url || url.length < 10 || !url.match(/^https?:\/\/[^\s]+$/i)) { |
|
||||||
return; |
|
||||||
} |
|
||||||
|
|
||||||
// Skip if this is an inner image URL from a nested image link
|
|
||||||
if (nestedImageUrls.has(url)) { |
|
||||||
return; |
|
||||||
} |
|
||||||
|
|
||||||
if (!seen.has(url) && !isNostrUrl(url)) { |
|
||||||
seen.add(url); |
|
||||||
links.push({ |
|
||||||
url, |
|
||||||
text: url, |
|
||||||
isExternal: isExternalUrl(url, linkBaseURL), |
|
||||||
}); |
|
||||||
} |
|
||||||
}); |
|
||||||
|
|
||||||
return links; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Extract media URLs from content |
|
||||||
*/ |
|
||||||
function extractMedia(content: string): string[] { |
|
||||||
const media: string[] = []; |
|
||||||
const seen = new Set<string>(); |
|
||||||
|
|
||||||
// Extract markdown images:  - optimized to avoid double matching
|
|
||||||
const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g; |
|
||||||
let markdownImageMatch; |
|
||||||
while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) { |
|
||||||
const url = markdownImageMatch[1]; |
|
||||||
if (url && !seen.has(url)) { |
|
||||||
if (isImageUrl(url) || isVideoUrl(url)) { |
|
||||||
media.push(url); |
|
||||||
seen.add(url); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Extract asciidoc images: image::url[alt] - optimized to avoid double matching
|
|
||||||
const asciidocImagePattern = /image::([^\[]+)\[/g; |
|
||||||
let asciidocImageMatch; |
|
||||||
while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) { |
|
||||||
const url = asciidocImageMatch[1]; |
|
||||||
if (url && !seen.has(url)) { |
|
||||||
if (isImageUrl(url) || isVideoUrl(url)) { |
|
||||||
media.push(url); |
|
||||||
seen.add(url); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Extract raw image/video URLs
|
|
||||||
const urlPattern = /https?:\/\/[^\s<>"']+/g; |
|
||||||
const rawUrls = content.match(urlPattern) || []; |
|
||||||
rawUrls.forEach(url => { |
|
||||||
if (!seen.has(url) && (isImageUrl(url) || isVideoUrl(url))) { |
|
||||||
media.push(url); |
|
||||||
seen.add(url); |
|
||||||
} |
|
||||||
}); |
|
||||||
|
|
||||||
return media; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Get Nostr identifier type |
|
||||||
*/ |
|
||||||
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null { |
|
||||||
if (id.startsWith('npub')) return 'npub'; |
|
||||||
if (id.startsWith('nprofile')) return 'nprofile'; |
|
||||||
if (id.startsWith('nevent')) return 'nevent'; |
|
||||||
if (id.startsWith('naddr')) return 'naddr'; |
|
||||||
if (id.startsWith('note')) return 'note'; |
|
||||||
return null; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Normalize text to d-tag format |
|
||||||
*/ |
|
||||||
function normalizeDtag(text: string): string { |
|
||||||
return text |
|
||||||
.toLowerCase() |
|
||||||
.replace(/[^a-z0-9]+/g, '-') |
|
||||||
.replace(/^-+|-+$/g, ''); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Check if URL is external |
|
||||||
*/ |
|
||||||
function isExternalUrl(url: string, linkBaseURL: string): boolean { |
|
||||||
if (!linkBaseURL) return true; |
|
||||||
try { |
|
||||||
// Use a simple string-based check for Node.js compatibility
|
|
||||||
// Extract hostname from URL string
|
|
||||||
const urlMatch = url.match(/^https?:\/\/([^\/]+)/); |
|
||||||
const baseMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/); |
|
||||||
|
|
||||||
if (urlMatch && baseMatch) { |
|
||||||
return urlMatch[1] !== baseMatch[1]; |
|
||||||
} |
|
||||||
return true; |
|
||||||
} catch { |
|
||||||
return true; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Check if URL is a Nostr URL |
|
||||||
*/ |
|
||||||
function isNostrUrl(url: string): boolean { |
|
||||||
return url.startsWith('nostr:') || getNostrType(url) !== null; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Check if URL is an image |
|
||||||
*/ |
|
||||||
function isImageUrl(url: string): boolean { |
|
||||||
return /\.(jpeg|jpg|png|gif|webp|svg)$/i.test(url); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Check if URL is a video |
|
||||||
*/ |
|
||||||
function isVideoUrl(url: string): boolean { |
|
||||||
return /\.(mp4|webm|ogg)$/i.test(url); |
|
||||||
} |
|
||||||
@ -1,3 +1,3 @@ |
|||||||
export * from './parser'; |
export * from './parser'; |
||||||
export * from './types'; |
export * from './types'; |
||||||
export * from './detector'; |
export * from './detector'; |
||||||
|
|||||||
@ -1,92 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.Parser = void 0; |
|
||||||
exports.defaultOptions = defaultOptions; |
|
||||||
exports.process = process; |
|
||||||
const detector_1 = require("./detector"); |
|
||||||
const to_asciidoc_1 = require("./converters/to-asciidoc"); |
|
||||||
const asciidoc_1 = require("./processors/asciidoc"); |
|
||||||
const metadata_1 = require("./extractors/metadata"); |
|
||||||
const frontmatter_1 = require("./extractors/frontmatter"); |
|
||||||
/**
 * Default parser options: no link base URL and every processing feature
 * switched on. A fresh object is returned on each call so callers may
 * mutate the result without affecting later calls.
 */
function defaultOptions() {
    const allFeaturesOn = {
        enableAsciiDoc: true,
        enableMarkdown: true,
        enableCodeHighlighting: true,
        enableLaTeX: true,
        enableMusicalNotation: true,
        enableNostrAddresses: true,
    };
    return { linkBaseURL: '', ...allFeaturesOn };
}
|
||||||
/**
 * Main parser for Nostr event content
 * Handles multiple content formats: AsciiDoc, Markdown, code syntax,
 * LaTeX, musical notation, and nostr: prefixed addresses
 *
 * Everything is converted to AsciiDoc first, then processed through AsciiDoctor
 */
class Parser {
    /**
     * Build a parser. Any option omitted by the caller falls back to the
     * value from defaultOptions(); the trailing literal fallbacks guard
     * against a defaultOptions() that omits a field.
     */
    constructor(options = {}) {
        const defaults = defaultOptions();
        this.options = {
            linkBaseURL: options.linkBaseURL ?? defaults.linkBaseURL ?? '',
            enableAsciiDoc: options.enableAsciiDoc ?? defaults.enableAsciiDoc ?? true,
            enableMarkdown: options.enableMarkdown ?? defaults.enableMarkdown ?? true,
            enableCodeHighlighting: options.enableCodeHighlighting ?? defaults.enableCodeHighlighting ?? true,
            enableLaTeX: options.enableLaTeX ?? defaults.enableLaTeX ?? true,
            enableMusicalNotation: options.enableMusicalNotation ?? defaults.enableMusicalNotation ?? true,
            enableNostrAddresses: options.enableNostrAddresses ?? defaults.enableNostrAddresses ?? true,
            // NOTE(review): defaultOptions() does not set wikilinkUrl/hashtagUrl,
            // so these remain undefined unless provided by the caller.
            wikilinkUrl: options.wikilinkUrl ?? defaults.wikilinkUrl,
            hashtagUrl: options.hashtagUrl ?? defaults.hashtagUrl,
        };
    }
    /**
     * Process Nostr event content and return HTML
     * Automatically detects the content format and processes accordingly
     * Everything is converted to AsciiDoc first, then processed through AsciiDoctor
     */
    async process(content) {
        // Extract frontmatter first (before any other processing)
        const { frontmatter, content: contentWithoutFrontmatter } = (0, frontmatter_1.extractFrontmatter)(content);
        // Extract metadata from content (after removing frontmatter)
        const metadata = (0, metadata_1.extractMetadata)(contentWithoutFrontmatter, this.options.linkBaseURL);
        // Detect content format (on content without frontmatter)
        const format = (0, detector_1.detectFormat)(contentWithoutFrontmatter);
        // Convert everything to AsciiDoc format first
        const asciidocContent = (0, to_asciidoc_1.convertToAsciidoc)(contentWithoutFrontmatter, format, this.options.linkBaseURL, {
            enableNostrAddresses: this.options.enableNostrAddresses,
        });
        // Process through AsciiDoctor
        const result = await (0, asciidoc_1.processAsciidoc)(asciidocContent, {
            enableCodeHighlighting: this.options.enableCodeHighlighting,
            enableLaTeX: this.options.enableLaTeX,
            enableMusicalNotation: this.options.enableMusicalNotation,
            originalContent: contentWithoutFrontmatter, // Pass original for LaTeX detection
            linkBaseURL: this.options.linkBaseURL, // Pass linkBaseURL for link processing
            wikilinkUrl: this.options.wikilinkUrl, // Pass wikilink URL format
            hashtagUrl: this.options.hashtagUrl, // Pass hashtag URL format
        });
        // Combine with extracted metadata and frontmatter
        return {
            ...result,
            frontmatter,
            nostrLinks: metadata.nostrLinks,
            wikilinks: metadata.wikilinks,
            hashtags: metadata.hashtags,
            links: metadata.links,
            media: metadata.media,
        };
    }
}
|
||||||
exports.Parser = Parser; |
|
||||||
/**
 * Convenience function to process content with default options
 * (constructs a one-shot Parser and delegates to it).
 */
async function process(content, options) {
    return new Parser(options).process(content);
}
|
||||||
@ -0,0 +1,481 @@ |
|||||||
|
import { ParserOptions, NostrLink, Wikilink } from './types'; |
||||||
|
|
||||||
|
/**
 * Extract and process wikilinks, hashtags, and nostr: addresses from HTML
 */
export interface PostProcessResult {
  /** The post-processed HTML string. */
  html: string;
  /** Every nostr: address found, in document order. */
  nostrLinks: NostrLink[];
  /** Every [[wikilink]] found, in document order. */
  wikilinks: Wikilink[];
  /** Unique hashtag topics, without the leading '#'. */
  hashtags: string[];
}
||||||
|
|
||||||
|
/**
 * Post-process HTML to convert wikilinks, hashtags, and nostr: addresses,
 * strip links out of code blocks, and turn bare media URLs (images, video,
 * audio, YouTube, Spotify) into the appropriate embed elements.
 *
 * The function runs a strictly ordered sequence of regex passes over the
 * HTML; several comments below mark places where the ordering is critical
 * and must not be changed.
 *
 * @param html - HTML produced by the upstream converter
 * @param options - parser options controlling nostr/wikilink/hashtag handling
 * @param skipWikilinksAndHashtags - If true, skip processing wikilinks and hashtags (already processed)
 * @returns the rewritten HTML plus the nostr links, wikilinks, and hashtags found
 */
export function postProcess(html: string, options: ParserOptions, skipWikilinksAndHashtags: boolean = false): PostProcessResult {
  let processed = html;
  const nostrLinks: NostrLink[] = [];
  const wikilinks: Wikilink[] = [];
  const hashtags: string[] = [];

  // First, mark code blocks to avoid processing inside them
  // NOTE(review): markers are computed against the ORIGINAL html; after
  // replacements shift offsets they are only approximate for later passes.
  const codeBlockMarkers: Array<{ start: number; end: number }> = [];
  const codeBlockRegex = /<(pre|code)[^>]*>[\s\S]*?<\/\1>/gi;
  let match;
  while ((match = codeBlockRegex.exec(html)) !== null) {
    codeBlockMarkers.push({ start: match.index, end: match.index + match[0].length });
  }

  // True when the given offset falls inside any recorded code block span.
  function isInCodeBlock(index: number): boolean {
    return codeBlockMarkers.some(marker => index >= marker.start && index < marker.end);
  }

  // Process nostr: addresses (but not in code blocks)
  if (options.enableNostrAddresses !== false) {
    const nostrRegex = /nostr:([np][a-z0-9]+1[a-z0-9]+)/gi;
    const replacements: Array<{ match: string; replacement: string; index: number }> = [];

    while ((match = nostrRegex.exec(processed)) !== null) {
      if (isInCodeBlock(match.index)) continue;

      const bech32 = match[1];
      const type = getNostrType(bech32);
      if (!type) continue;

      const link: NostrLink = {
        type,
        id: bech32,
        text: match[0],
        bech32: bech32
      };
      nostrLinks.push(link);

      const url = options.linkBaseURL
        ? `${options.linkBaseURL}/nostr/${bech32}`
        : `#nostr-${bech32}`;

      replacements.push({
        match: match[0],
        replacement: `<a href="${escapeHtml(url)}" class="nostr-link" data-nostr-type="${type}" data-nostr-id="${escapeHtml(bech32)}">${escapeHtml(match[0])}</a>`,
        index: match.index
      });
    }

    // Apply replacements in reverse order to preserve indices
    replacements.reverse().forEach(({ match, replacement, index }) => {
      processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
    });
  }

  // Process wikilinks: [[dtag]] or [[dtag|display]] (but not in code blocks)
  // Skip if already processed (for AsciiDoc)
  if (!skipWikilinksAndHashtags) {
    const wikilinkRegex = /\[\[([^\]]+)\]\]/g;
    const wikilinkReplacements: Array<{ match: string; replacement: string; index: number }> = [];

    while ((match = wikilinkRegex.exec(processed)) !== null) {
      if (isInCodeBlock(match.index)) continue;

      // Skip if already inside a link tag
      const beforeMatch = processed.substring(0, match.index);
      const lastOpenTag = beforeMatch.lastIndexOf('<a');
      const lastCloseTag = beforeMatch.lastIndexOf('</a>');
      if (lastOpenTag > lastCloseTag) continue; // Inside a link

      // Split "dtag|display"; extra '|' characters stay in the display part.
      const content = match[1];
      const parts = content.split('|');
      const dtag = parts[0].trim();
      const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag;

      const wikilink: Wikilink = {
        dtag,
        display,
        original: match[0]
      };
      wikilinks.push(wikilink);

      // URL resolution priority: callback > template string > linkBaseURL > anchor.
      let url: string;
      if (typeof options.wikilinkUrl === 'function') {
        url = options.wikilinkUrl(dtag);
      } else if (typeof options.wikilinkUrl === 'string') {
        url = options.wikilinkUrl.replace('{dtag}', encodeURIComponent(dtag));
      } else {
        url = options.linkBaseURL
          ? `${options.linkBaseURL}/events?d=${encodeURIComponent(dtag)}`
          : `#${encodeURIComponent(dtag)}`;
      }

      wikilinkReplacements.push({
        match: match[0],
        replacement: `<a href="${escapeHtml(url)}" class="wikilink" data-dtag="${escapeHtml(dtag)}">${escapeHtml(display)}</a>`,
        index: match.index
      });
    }

    // Apply wikilink replacements in reverse order
    wikilinkReplacements.reverse().forEach(({ match, replacement, index }) => {
      processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
    });

    // Process hashtags: #hashtag (but not in code blocks or inside HTML tags)
    const hashtagRegex = /(^|\s|>)(#[\w-]+)/g;
    const hashtagReplacements: Array<{ match: string; replacement: string; index: number }> = [];

    while ((match = hashtagRegex.exec(processed)) !== null) {
      if (isInCodeBlock(match.index)) continue;

      // Check if we're inside an HTML tag
      const beforeMatch = processed.substring(0, match.index);
      const lastOpenTag = beforeMatch.lastIndexOf('<');
      const lastCloseTag = beforeMatch.lastIndexOf('>');
      if (lastOpenTag > lastCloseTag) continue; // Inside a tag

      // Skip if already inside a link or span
      const lastLinkOpen = beforeMatch.lastIndexOf('<a');
      const lastLinkClose = beforeMatch.lastIndexOf('</a>');
      const lastSpanOpen = beforeMatch.lastIndexOf('<span');
      const lastSpanClose = beforeMatch.lastIndexOf('</span>');
      if (lastLinkOpen > lastLinkClose || lastSpanOpen > lastSpanClose) continue;

      const hashtag = match[2];
      const prefix = match[1];
      const topic = hashtag.substring(1);

      // Collect unique topics only.
      if (!hashtags.includes(topic)) {
        hashtags.push(topic);
      }

      // A URL is only produced when the caller configured hashtagUrl;
      // otherwise the hashtag becomes a non-clickable span.
      let url: string | undefined;
      if (typeof options.hashtagUrl === 'function') {
        url = options.hashtagUrl(topic);
      } else if (typeof options.hashtagUrl === 'string') {
        url = options.hashtagUrl.replace('{topic}', encodeURIComponent(topic));
      }

      const replacement = url
        ? `${prefix}<a href="${escapeHtml(url)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</a>`
        : `${prefix}<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</span>`;

      hashtagReplacements.push({
        match: match[0],
        replacement,
        index: match.index
      });
    }

    // Apply hashtag replacements in reverse order
    hashtagReplacements.reverse().forEach(({ match, replacement, index }) => {
      processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
    });
  }

  // Extract wikilinks and hashtags from already-processed HTML (for AsciiDoc)
  if (skipWikilinksAndHashtags) {
    // Extract wikilinks from existing links
    const wikilinkLinkRegex = /<a[^>]+class="wikilink"[^>]+data-dtag="([^"]+)"[^>]*>([^<]+)<\/a>/g;
    while ((match = wikilinkLinkRegex.exec(processed)) !== null) {
      wikilinks.push({
        dtag: match[1],
        display: match[2],
        original: match[0]
      });
    }

    // Extract hashtags from existing spans/links
    const hashtagRegex = /<(?:a|span)[^>]+class="hashtag"[^>]+data-topic="([^"]+)"[^>]*>#\1<\/\w+>/g;
    while ((match = hashtagRegex.exec(processed)) !== null) {
      const topic = match[1];
      if (!hashtags.includes(topic)) {
        hashtags.push(topic);
      }
    }
  }

  // Remove links inside code blocks (both <code> and <pre> tags)
  // This ensures URLs in code blocks remain as plain text
  const codeBlockLinkRegex = /(<(?:code|pre)[^>]*>)([\s\S]*?)(<\/(?:code|pre)>)/gi;
  processed = processed.replace(codeBlockLinkRegex, (match, openTag, content, closeTag) => {
    // Remove all <a> tags inside code blocks, keeping only the text content
    const cleanedContent = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
    return openTag + cleanedContent + closeTag;
  });

  // Process YouTube URLs - ORDER IS CRITICAL to avoid double-parsing
  // 1. FIRST: Fix video tags that contain YouTube URLs (before they get processed as bare URLs)
  // AsciiDoc's video:: macro creates <video> tags, but YouTube URLs should be iframes
  const youtubeVideoTagRegex = /<video[^>]+src="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>[\s\S]*?<\/video>/gi;
  processed = processed.replace(youtubeVideoTagRegex, (match, url, videoId) => {
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
  });

  // 2. SECOND: Process YouTube links in <a> tags
  // IMPORTANT: Be very specific with YouTube regex to avoid matching Spotify URLs
  const youtubeLinkRegex = /<a[^>]+href="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>.*?<\/a>/gi;
  processed = processed.replace(youtubeLinkRegex, (match, url, videoId) => {
    if (isInCodeBlock(processed.indexOf(match))) return match;
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
  });

  // 3. THIRD: Fix malformed YouTube iframes from AsciiDoc video:: macro
  // AsciiDoc sometimes creates iframes with malformed YouTube URLs (watch?v= or shorts/ instead of embed/)
  // Match the entire iframe element including closing tag to avoid duplicates
  const malformedYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube[^"]*(?:watch\?v=|shorts\/)([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
  processed = processed.replace(malformedYoutubeIframeRegex, (match, videoId) => {
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
  });

  // 3.5: Fix YouTube iframes with embed URLs but wrong parameters or missing required attributes
  // AsciiDoc's video:: macro creates iframes with ?rel=0 or missing allow/referrerpolicy attributes
  // Match iframes with embed URLs that don't have enablejsapi=1 or are missing required attributes
  const incompleteYoutubeIframeRegex = /<iframe[^>]+src="https?:\/\/(?:www\.)?youtube\.com\/embed\/([a-zA-Z0-9_-]+)(\?[^"]*)?"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
  processed = processed.replace(incompleteYoutubeIframeRegex, (match, videoId, params) => {
    // Check if this iframe already has the correct format (has enablejsapi=1 and required attributes)
    if (match.includes('enablejsapi=1') &&
      match.includes('allow=') &&
      match.includes('referrerpolicy=') &&
      match.includes('class="youtube-embed"')) {
      return match; // Already correct, don't modify
    }
    // Fix the iframe with proper attributes
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
  });

  // 4. FOURTH: Fix any existing YouTube iframes that have malformed embed URLs (AsciiDoc sometimes creates broken embed URLs)
  // Match the entire iframe element including closing tag to avoid duplicates
  const brokenYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube\.com\/embed\/[^"]*watch\?v=([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
  processed = processed.replace(brokenYoutubeIframeRegex, (match, videoId) => {
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
  });

  // 5. LAST: Handle bare YouTube URLs (not in links, video tags, or iframes)
  // IMPORTANT: Match must be specific to youtube.com or youtu.be to avoid matching Spotify
  // This must come AFTER processing video tags and links to avoid double-parsing
  const bareYoutubeRegex = /(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)(?:\?[^"\s<>]*)?)/gi;
  const youtubeReplacements: Array<{ match: string; replacement: string; index: number }> = [];
  while ((match = bareYoutubeRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;

    // Check if it's already in a tag (link, iframe, video, etc.)
    // Simple approach: check if we're inside quotes (attribute value) or between <tag and >
    const before = processed.substring(Math.max(0, match.index - 500), match.index);
    // NOTE(review): `after` is computed but never read below — looks like
    // leftover scaffolding; confirm before removing.
    const after = processed.substring(match.index, match.index + match[0].length + 100);

    // Check if URL is inside quotes (attribute value like src="..." or href="...")
    const beforeContext = before.substring(Math.max(0, before.length - 100));
    if (beforeContext.match(/<(iframe|video|a|img|audio|source)[^>]*\s+(src|href)="[^"]*$/i)) {
      continue; // Inside an attribute value, skip
    }

    // Check if we're between an opening tag and its closing bracket
    const lastOpenTag = before.lastIndexOf('<');
    const lastCloseBracket = before.lastIndexOf('>');
    if (lastOpenTag > lastCloseBracket) {
      // We're inside a tag, check what kind
      const tagContent = before.substring(lastOpenTag);
      if (/<(iframe|video|a|img|audio|source)[^>]*$/i.test(tagContent)) {
        continue; // Skip URLs inside these tags
      }
    }

    const videoId = match[2];
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    youtubeReplacements.push({
      match: match[0],
      replacement: `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`,
      index: match.index
    });
  }
  youtubeReplacements.reverse().forEach(({ match, replacement, index }) => {
    processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
  });

  // Fix double-closed iframes (safety net)
  processed = processed.replace(/<\/iframe><\/iframe>/gi, '</iframe>');

  // Spotify: https://open.spotify.com/episode/ID or https://open.spotify.com/track/ID or https://open.spotify.com/album/ID
  const spotifyLinkRegex = /<a[^>]+href="(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+))[^"]*"[^>]*>.*?<\/a>/gi;
  processed = processed.replace(spotifyLinkRegex, (match, url, type, id) => {
    if (isInCodeBlock(processed.indexOf(match))) return match;
    const embedUrl = `https://open.spotify.com/embed/${type}/${id}`;
    return `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`;
  });

  // Also handle bare Spotify URLs (not in links)
  const bareSpotifyRegex = /(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)(?:\?[^"\s<>]*)?)/gi;
  const spotifyReplacements: Array<{ match: string; replacement: string; index: number }> = [];
  while ((match = bareSpotifyRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    // Check if it's already in a tag
    const before = processed.substring(0, match.index);
    const lastOpenTag = before.lastIndexOf('<');
    const lastCloseTag = before.lastIndexOf('>');
    if (lastOpenTag > lastCloseTag) continue; // Inside a tag

    const type = match[2];
    const id = match[3];
    const embedUrl = `https://open.spotify.com/embed/${type}/${id}`;
    spotifyReplacements.push({
      match: match[0],
      replacement: `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`,
      index: match.index
    });
  }
  spotifyReplacements.reverse().forEach(({ match, replacement, index }) => {
    processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
  });

  // Process bare image/media URLs that aren't already in tags
  // First, convert bare links (class="bare") that contain image/video/audio URLs to actual media elements
  // This handles cases where AsciiDoc has already converted URLs to links
  // IMPORTANT: Check YouTube FIRST, then Spotify, BEFORE checking file extensions to avoid conflicts
  const bareLinkRegex = /<a[^>]+href="(https?:\/\/[^"]+)"[^>]*class="[^"]*bare[^"]*"[^>]*>([^<]*)<\/a>/gi;
  processed = processed.replace(bareLinkRegex, (match, url, linkText) => {
    if (isInCodeBlock(processed.indexOf(match))) return match;

    // Check YouTube URLs FIRST (be very specific - must be youtube.com or youtu.be)
    // This prevents accidentally matching Spotify URLs
    const youtubeMatch = url.match(/https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)/);
    if (youtubeMatch) {
      const videoId = youtubeMatch[1];
      const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
      return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
    }

    // Check Spotify URLs (be very specific - must be open.spotify.com)
    const spotifyMatch = url.match(/https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)/);
    if (spotifyMatch) {
      const type = spotifyMatch[1];
      const id = spotifyMatch[2];
      const embedUrl = `https://open.spotify.com/embed/${type}/${id}`;
      return `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`;
    }

    // Check if it's an image URL
    if (/\.(jpg|jpeg|png|gif|webp|svg|bmp)(\?|$)/i.test(url)) {
      return `<img src="${escapeHtml(url)}" alt="${escapeHtml(linkText)}" class="bare-image" />`;
    }
    // Check if it's a video URL (but not YouTube)
    if (/\.(mp4|webm|ogg|mov|avi)(\?|$)/i.test(url)) {
      return `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`;
    }
    // Check if it's an audio URL (but not Spotify)
    if (/\.(mp3|wav|ogg|flac|aac|m4a)(\?|$)/i.test(url)) {
      return `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`;
    }

    // Not a media URL, return as-is
    return match;
  });

  // Now process bare URLs that aren't in any tags at all
  // IMPORTANT: Skip YouTube and Spotify URLs - they're already processed above
  const imageUrlRegex = /(https?:\/\/[^\s<>"']+\.(jpg|jpeg|png|gif|webp|svg|bmp))(?![^<]*>)/gi;
  const videoUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp4|webm|ogg|mov|avi))(?![^<]*>)/gi;
  const audioUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp3|wav|ogg|flac|aac|m4a))(?![^<]*>)/gi;

  // Check if URL is already in a tag
  function isUrlInTag(url: string, index: number): boolean {
    const before = processed.substring(0, index);
    // NOTE(review): `after` is unused here as well — TODO confirm and clean up.
    const after = processed.substring(index);

    // Check if it's inside an existing tag
    const lastOpenTag = before.lastIndexOf('<');
    const lastCloseTag = before.lastIndexOf('>');
    if (lastOpenTag > lastCloseTag) {
      const tagContent = processed.substring(lastOpenTag, index + url.length);
      if (/<(img|video|audio|a|source|iframe)[^>]*>/i.test(tagContent)) {
        return true;
      }
    }

    return false;
  }

  const mediaReplacements: Array<{ match: string; replacement: string; index: number }> = [];

  // Process images
  while ((match = imageUrlRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    if (isUrlInTag(match[0], match.index)) continue;

    const url = match[0];
    mediaReplacements.push({
      match: url,
      replacement: `<img src="${escapeHtml(url)}" alt="" class="bare-image" />`,
      index: match.index
    });
  }

  // Process videos (but skip YouTube URLs - they're handled above)
  while ((match = videoUrlRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    if (isUrlInTag(match[0], match.index)) continue;
    // Skip YouTube URLs - they should be embeds, not video tags
    if (/youtube\.com|youtu\.be/i.test(match[0])) continue;

    const url = match[0];
    mediaReplacements.push({
      match: url,
      replacement: `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`,
      index: match.index
    });
  }

  // Process audio
  while ((match = audioUrlRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    if (isUrlInTag(match[0], match.index)) continue;

    const url = match[0];
    mediaReplacements.push({
      match: url,
      replacement: `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`,
      index: match.index
    });
  }

  // Apply media replacements in reverse order
  mediaReplacements.reverse().forEach(({ match, replacement, index }) => {
    processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
  });

  return {
    html: processed,
    nostrLinks,
    wikilinks,
    hashtags
  };
}
||||||
|
|
||||||
|
/** |
||||||
|
* Get Nostr identifier type from bech32 string |
||||||
|
*/ |
||||||
|
function getNostrType(bech32: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null { |
||||||
|
if (bech32.startsWith('npub')) return 'npub'; |
||||||
|
if (bech32.startsWith('nprofile')) return 'nprofile'; |
||||||
|
if (bech32.startsWith('nevent')) return 'nevent'; |
||||||
|
if (bech32.startsWith('naddr')) return 'naddr'; |
||||||
|
if (bech32.startsWith('note')) return 'note'; |
||||||
|
return null; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Escape HTML special characters |
||||||
|
*/ |
||||||
|
function escapeHtml(text: string): string { |
||||||
|
const map: Record<string, string> = { |
||||||
|
'&': '&', |
||||||
|
'<': '<', |
||||||
|
'>': '>', |
||||||
|
'"': '"', |
||||||
|
"'": ''' |
||||||
|
}; |
||||||
|
return text.replace(/[&<>"']/g, (m) => map[m]); |
||||||
|
} |
||||||
@ -0,0 +1,175 @@ |
|||||||
|
import { ParserOptions, Wikilink } from './types'; |
||||||
|
import * as emoji from 'node-emoji'; |
||||||
|
|
||||||
|
/** |
||||||
|
* Pre-process raw content to handle wikilinks and hashtags before AsciiDoc conversion |
||||||
|
* This prevents AsciiDoc from converting them to anchors or other formats |
||||||
|
*/ |
||||||
|
export interface PreProcessResult { |
||||||
|
content: string; |
||||||
|
wikilinks: Wikilink[]; |
||||||
|
hashtags: string[]; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Pre-process content to convert wikilinks and hashtags to placeholders |
||||||
|
* that will be processed after HTML conversion |
||||||
|
*/ |
||||||
|
export function preProcessAsciiDoc(content: string, options: ParserOptions): PreProcessResult { |
||||||
|
let processed = content; |
||||||
|
const wikilinks: Wikilink[] = []; |
||||||
|
const hashtags: string[] = []; |
||||||
|
|
||||||
|
// Process emojis first
|
||||||
|
processed = emoji.emojify(processed); |
||||||
|
|
||||||
|
// Process wikilinks: [[dtag]] or [[dtag|display]]
|
||||||
|
// Replace with a placeholder that AsciiDoc won't touch
|
||||||
|
const wikilinkRegex = /\[\[([^\]]+)\]\]/g; |
||||||
|
const wikilinkPlaceholders: Map<string, Wikilink> = new Map(); |
||||||
|
let placeholderCounter = 0; |
||||||
|
|
||||||
|
processed = processed.replace(wikilinkRegex, (match, content) => { |
||||||
|
const parts = content.split('|'); |
||||||
|
const dtag = parts[0].trim(); |
||||||
|
const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag; |
||||||
|
|
||||||
|
const wikilink: Wikilink = { |
||||||
|
dtag, |
||||||
|
display, |
||||||
|
original: match |
||||||
|
}; |
||||||
|
wikilinks.push(wikilink); |
||||||
|
|
||||||
|
// Use a unique placeholder that won't be processed by AsciiDoc
|
||||||
|
// Use angle brackets to avoid AsciiDoc formatting interpretation
|
||||||
|
const placeholder = `<WIKILINK_PLACEHOLDER_${placeholderCounter}>`; |
||||||
|
wikilinkPlaceholders.set(placeholder, wikilink); |
||||||
|
placeholderCounter++; |
||||||
|
|
||||||
|
return placeholder; |
||||||
|
}); |
||||||
|
|
||||||
|
// Process hashtags: #hashtag (but not in code blocks)
|
||||||
|
// Mark code blocks first
|
||||||
|
const codeBlockMarkers: Array<{ start: number; end: number }> = []; |
||||||
|
const codeBlockRegex = /\[source,[^\]]+\]|\[abc\]|\[plantuml\]|```|`[^`]+`/g; |
||||||
|
let match; |
||||||
|
while ((match = codeBlockRegex.exec(processed)) !== null) { |
||||||
|
// Find the end of the code block
|
||||||
|
const start = match.index; |
||||||
|
let end = start + match[0].length; |
||||||
|
|
||||||
|
// For source blocks, find the closing ----
|
||||||
|
if (match[0].startsWith('[source')) { |
||||||
|
const afterStart = processed.substring(end); |
||||||
|
const closeMatch = afterStart.match(/^[\s\S]*?----/); |
||||||
|
if (closeMatch) { |
||||||
|
end = start + match[0].length + closeMatch[0].length; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
codeBlockMarkers.push({ start, end }); |
||||||
|
} |
||||||
|
|
||||||
|
function isInCodeBlock(index: number): boolean { |
||||||
|
return codeBlockMarkers.some(marker => index >= marker.start && index < marker.end); |
||||||
|
} |
||||||
|
|
||||||
|
// Process hashtags
|
||||||
|
const hashtagPlaceholders: Map<string, string> = new Map(); |
||||||
|
let hashtagCounter = 0; |
||||||
|
|
||||||
|
// Match hashtags at start of line, after whitespace, or after > (for blockquotes)
|
||||||
|
const hashtagRegex = /(^|\s|>)(#[\w-]+)/gm; |
||||||
|
|
||||||
|
processed = processed.replace(hashtagRegex, (match, prefix, hashtag, offset) => { |
||||||
|
if (isInCodeBlock(offset)) return match; |
||||||
|
|
||||||
|
const topic = hashtag.substring(1); |
||||||
|
if (!hashtags.includes(topic)) { |
||||||
|
hashtags.push(topic); |
||||||
|
} |
||||||
|
|
||||||
|
// Use angle brackets to avoid AsciiDoc formatting interpretation
|
||||||
|
const placeholder = `<HASHTAG_PLACEHOLDER_${hashtagCounter}>`; |
||||||
|
hashtagPlaceholders.set(placeholder, topic); |
||||||
|
hashtagCounter++; |
||||||
|
|
||||||
|
return `${prefix}${placeholder}`; |
||||||
|
}); |
||||||
|
|
||||||
|
return { |
||||||
|
content: processed, |
||||||
|
wikilinks, |
||||||
|
hashtags |
||||||
|
}; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Restore wikilinks and hashtags from placeholders in HTML |
||||||
|
*/ |
||||||
|
export function restorePlaceholders( |
||||||
|
html: string, |
||||||
|
wikilinks: Wikilink[], |
||||||
|
hashtags: string[], |
||||||
|
options: ParserOptions |
||||||
|
): string { |
||||||
|
let processed = html; |
||||||
|
|
||||||
|
// Restore wikilinks (handle both escaped and unescaped placeholders)
|
||||||
|
const wikilinkPlaceholderRegex = /<WIKILINK_PLACEHOLDER_(\d+)>|<WIKILINK_PLACEHOLDER_(\d+)>/g; |
||||||
|
processed = processed.replace(wikilinkPlaceholderRegex, (match, escapedIndex, unescapedIndex) => { |
||||||
|
const index = escapedIndex !== undefined ? parseInt(escapedIndex) : parseInt(unescapedIndex); |
||||||
|
const wikilink = wikilinks[index]; |
||||||
|
if (!wikilink) return match; |
||||||
|
|
||||||
|
let url: string; |
||||||
|
if (typeof options.wikilinkUrl === 'function') { |
||||||
|
url = options.wikilinkUrl(wikilink.dtag); |
||||||
|
} else if (typeof options.wikilinkUrl === 'string') { |
||||||
|
url = options.wikilinkUrl.replace('{dtag}', encodeURIComponent(wikilink.dtag)); |
||||||
|
} else { |
||||||
|
url = options.linkBaseURL
|
||||||
|
? `${options.linkBaseURL}/events?d=${encodeURIComponent(wikilink.dtag)}` |
||||||
|
: `#${encodeURIComponent(wikilink.dtag)}`; |
||||||
|
} |
||||||
|
|
||||||
|
return `<a href="${escapeHtml(url)}" class="wikilink" data-dtag="${escapeHtml(wikilink.dtag)}">${escapeHtml(wikilink.display)}</a>`; |
||||||
|
}); |
||||||
|
|
||||||
|
// Restore hashtags (handle both escaped and unescaped placeholders)
|
||||||
|
const hashtagPlaceholderRegex = /<HASHTAG_PLACEHOLDER_(\d+)>|<HASHTAG_PLACEHOLDER_(\d+)>/g; |
||||||
|
processed = processed.replace(hashtagPlaceholderRegex, (match, escapedIndex, unescapedIndex) => { |
||||||
|
const index = escapedIndex !== undefined ? parseInt(escapedIndex) : parseInt(unescapedIndex); |
||||||
|
const topic = hashtags[index]; |
||||||
|
if (!topic) return match; |
||||||
|
|
||||||
|
let url: string | undefined; |
||||||
|
if (typeof options.hashtagUrl === 'function') { |
||||||
|
url = options.hashtagUrl(topic); |
||||||
|
} else if (typeof options.hashtagUrl === 'string') { |
||||||
|
url = options.hashtagUrl.replace('{topic}', encodeURIComponent(topic)); |
||||||
|
} |
||||||
|
|
||||||
|
const hashtag = `#${topic}`; |
||||||
|
if (url) { |
||||||
|
return `<a href="${escapeHtml(url)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</a>`; |
||||||
|
} else { |
||||||
|
return `<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</span>`; |
||||||
|
} |
||||||
|
}); |
||||||
|
|
||||||
|
return processed; |
||||||
|
} |
||||||
|
|
||||||
|
function escapeHtml(text: string): string { |
||||||
|
const map: Record<string, string> = { |
||||||
|
'&': '&', |
||||||
|
'<': '<', |
||||||
|
'>': '>', |
||||||
|
'"': '"', |
||||||
|
"'": ''' |
||||||
|
}; |
||||||
|
return text.replace(/[&<>"']/g, (m) => map[m]); |
||||||
|
} |
||||||
@ -1,148 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
var __importDefault = (this && this.__importDefault) || function (mod) { |
|
||||||
return (mod && mod.__esModule) ? mod : { "default": mod }; |
|
||||||
}; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.processAsciidoc = processAsciidoc; |
|
||||||
const core_1 = __importDefault(require("@asciidoctor/core")); |
|
||||||
const html_utils_1 = require("./html-utils"); |
|
||||||
const html_postprocess_1 = require("./html-postprocess"); |
|
||||||
const asciidoctorInstance = (0, core_1.default)(); |
|
||||||
/** |
|
||||||
* Processes AsciiDoc content to HTML using AsciiDoctor |
|
||||||
* Uses AsciiDoctor's built-in highlight.js and LaTeX support |
|
||||||
*/ |
|
||||||
async function processAsciidoc(content, options = {}) { |
|
||||||
const { enableCodeHighlighting = true, enableLaTeX = true, enableMusicalNotation = true, } = options; |
|
||||||
// Check if content starts with level 3+ headers
|
|
||||||
// Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
|
|
||||||
// If content starts with level 3+, use book doctype
|
|
||||||
const firstHeaderMatch = content.match(/^(={1,6})\s+/m); |
|
||||||
let doctype = 'article'; |
|
||||||
if (firstHeaderMatch) { |
|
||||||
const firstHeaderLevel = firstHeaderMatch[1].length; |
|
||||||
if (firstHeaderLevel >= 3) { |
|
||||||
doctype = 'book'; |
|
||||||
} |
|
||||||
} |
|
||||||
try { |
|
||||||
const result = asciidoctorInstance.convert(content, { |
|
||||||
safe: 'safe', |
|
||||||
backend: 'html5', |
|
||||||
doctype: doctype, |
|
||||||
attributes: { |
|
||||||
'showtitle': true, |
|
||||||
'sectanchors': true, |
|
||||||
'sectlinks': true, |
|
||||||
'toc': 'left', |
|
||||||
'toclevels': 6, |
|
||||||
'toc-title': 'Table of Contents', |
|
||||||
'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none', |
|
||||||
'stem': enableLaTeX ? 'latexmath' : 'none', |
|
||||||
'plantuml': 'plantuml', // Enable PlantUML diagram support
|
|
||||||
'data-uri': true, |
|
||||||
'imagesdir': '', |
|
||||||
'linkcss': false, |
|
||||||
'stylesheet': '', |
|
||||||
'stylesdir': '', |
|
||||||
'prewrap': true, |
|
||||||
'sectnums': false, |
|
||||||
'sectnumlevels': 6, |
|
||||||
'experimental': true, |
|
||||||
'compat-mode': false, |
|
||||||
'attribute-missing': 'warn', |
|
||||||
'attribute-undefined': 'warn', |
|
||||||
'skip-front-matter': true, |
|
||||||
'source-indent': 0, |
|
||||||
'indent': 0, |
|
||||||
'tabsize': 2, |
|
||||||
'tabwidth': 2, |
|
||||||
'hardbreaks': false, |
|
||||||
'paragraph-rewrite': 'normal', |
|
||||||
'sectids': true, |
|
||||||
'idprefix': '', |
|
||||||
'idseparator': '-', |
|
||||||
'sectidprefix': '', |
|
||||||
'sectidseparator': '-' |
|
||||||
} |
|
||||||
}); |
|
||||||
const htmlString = typeof result === 'string' ? result : result.toString(); |
|
||||||
// Extract table of contents from HTML
|
|
||||||
const { toc, contentWithoutTOC } = (0, html_utils_1.extractTOC)(htmlString); |
|
||||||
// Sanitize HTML to prevent XSS
|
|
||||||
const sanitized = (0, html_utils_1.sanitizeHTML)(contentWithoutTOC); |
|
||||||
// Post-process HTML: convert macros to HTML, add styling, etc.
|
|
||||||
const processed = (0, html_postprocess_1.postProcessHtml)(sanitized, { |
|
||||||
enableMusicalNotation, |
|
||||||
linkBaseURL: options.linkBaseURL, |
|
||||||
wikilinkUrl: options.wikilinkUrl, |
|
||||||
hashtagUrl: options.hashtagUrl, |
|
||||||
}); |
|
||||||
// Process links: add target="_blank" to external links
|
|
||||||
const processedWithLinks = options.linkBaseURL |
|
||||||
? (0, html_utils_1.processLinks)(processed, options.linkBaseURL) |
|
||||||
: processed; |
|
||||||
// Also process TOC
|
|
||||||
const tocSanitized = (0, html_utils_1.sanitizeHTML)(toc); |
|
||||||
const tocProcessed = (0, html_postprocess_1.postProcessHtml)(tocSanitized, { |
|
||||||
enableMusicalNotation: false, // Don't process music in TOC
|
|
||||||
linkBaseURL: options.linkBaseURL, |
|
||||||
wikilinkUrl: options.wikilinkUrl, |
|
||||||
hashtagUrl: options.hashtagUrl, |
|
||||||
}); |
|
||||||
// Process links in TOC as well
|
|
||||||
const tocProcessedWithLinks = options.linkBaseURL |
|
||||||
? (0, html_utils_1.processLinks)(tocProcessed, options.linkBaseURL) |
|
||||||
: tocProcessed; |
|
||||||
// Check for LaTeX in original content (more reliable than checking HTML)
|
|
||||||
const contentToCheck = options.originalContent || content; |
|
||||||
const hasLaTeX = enableLaTeX && hasMathContent(contentToCheck); |
|
||||||
// Check for musical notation in processed HTML
|
|
||||||
const hasMusicalNotation = enableMusicalNotation && (/class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/.test(processed)); |
|
||||||
return { |
|
||||||
content: processedWithLinks, |
|
||||||
tableOfContents: tocProcessedWithLinks, |
|
||||||
hasLaTeX, |
|
||||||
hasMusicalNotation, |
|
||||||
nostrLinks: [], // Will be populated by metadata extraction
|
|
||||||
wikilinks: [], |
|
||||||
hashtags: [], |
|
||||||
links: [], |
|
||||||
media: [], |
|
||||||
}; |
|
||||||
} |
|
||||||
catch (error) { |
|
||||||
// Fallback to plain text with error logging
|
|
||||||
const errorMessage = error instanceof Error ? error.message : String(error); |
|
||||||
// Use process.stderr.write for Node.js compatibility instead of console.error
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const nodeProcess = globalThis.process; |
|
||||||
if (nodeProcess?.stderr) { |
|
||||||
nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`); |
|
||||||
} |
|
||||||
// Escape HTML in content for safe display
|
|
||||||
const escapedContent = (0, html_utils_1.sanitizeHTML)(content); |
|
||||||
return { |
|
||||||
content: `<p>${escapedContent}</p>`, |
|
||||||
tableOfContents: '', |
|
||||||
hasLaTeX: false, |
|
||||||
hasMusicalNotation: false, |
|
||||||
nostrLinks: [], |
|
||||||
wikilinks: [], |
|
||||||
hashtags: [], |
|
||||||
links: [], |
|
||||||
media: [], |
|
||||||
}; |
|
||||||
} |
|
||||||
} |
|
||||||
/** |
|
||||||
* Check if content has LaTeX math |
|
||||||
* Based on jumble's detection pattern |
|
||||||
*/ |
|
||||||
function hasMathContent(content) { |
|
||||||
// Check for inline math: $...$ or \(...\)
|
|
||||||
const inlineMath = /\$[^$]+\$|\\\([^)]+\\\)/.test(content); |
|
||||||
// Check for block math: $$...$$ or \[...\]
|
|
||||||
const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content); |
|
||||||
return inlineMath || blockMath; |
|
||||||
} |
|
||||||
@ -1,193 +1,56 @@ |
|||||||
import { ProcessResult } from '../types'; |
import asciidoctor from '@asciidoctor/core'; |
||||||
import { extractTOC, sanitizeHTML, processLinks } from './html-utils'; |
import { ParserOptions } from '../types'; |
||||||
import { postProcessHtml } from './html-postprocess'; |
import * as emoji from 'node-emoji'; |
||||||
|
|
||||||
// Lazy-load AsciiDoctor instance to avoid issues with Jest module transformation
|
export interface AsciiDocResult { |
||||||
// Use require() for CommonJS modules to avoid Jest transformation issues
|
html: string; |
||||||
let asciidoctorInstance: any = null; |
tableOfContents: string; |
||||||
|
hasLaTeX: boolean; |
||||||
function getAsciidoctorInstance() { |
hasMusicalNotation: boolean; |
||||||
if (!asciidoctorInstance) { |
|
||||||
// Use require() instead of import() to avoid Jest transformation issues with Opal runtime
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
||||||
const asciidoctor = require('@asciidoctor/core'); |
|
||||||
asciidoctorInstance = asciidoctor.default(); |
|
||||||
} |
|
||||||
return asciidoctorInstance; |
|
||||||
} |
|
||||||
|
|
||||||
export interface ProcessOptions { |
|
||||||
enableCodeHighlighting?: boolean; |
|
||||||
enableLaTeX?: boolean; |
|
||||||
enableMusicalNotation?: boolean; |
|
||||||
originalContent?: string; // Original content for LaTeX detection
|
|
||||||
linkBaseURL?: string; // Base URL for link processing
|
|
||||||
wikilinkUrl?: string | ((dtag: string) => string); // Custom URL format for wikilinks
|
|
||||||
hashtagUrl?: string | ((topic: string) => string); // Custom URL format for hashtags
|
|
||||||
} |
} |
||||||
|
|
||||||
/** |
/** |
||||||
* Processes AsciiDoc content to HTML using AsciiDoctor |
* Process AsciiDoc content to HTML |
||||||
* Uses AsciiDoctor's built-in highlight.js and LaTeX support |
|
||||||
*/ |
*/ |
||||||
export async function processAsciidoc( |
export function processAsciiDoc(content: string, options: ParserOptions): AsciiDocResult { |
||||||
content: string, |
const hasLaTeX = /\[source,latex\]|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content); |
||||||
options: ProcessOptions = {} |
const hasMusicalNotation = /\[abc\]|\[source,abc\]/i.test(content); |
||||||
): Promise<ProcessResult> { |
|
||||||
const { |
|
||||||
enableCodeHighlighting = true, |
|
||||||
enableLaTeX = true, |
|
||||||
enableMusicalNotation = true, |
|
||||||
} = options; |
|
||||||
|
|
||||||
// Check if content starts with level 3+ headers
|
|
||||||
// Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
|
|
||||||
// If content starts with level 3+, use book doctype
|
|
||||||
const firstHeaderMatch = content.match(/^(={1,6})\s+/m); |
|
||||||
let doctype: 'article' | 'book' = 'article'; |
|
||||||
|
|
||||||
if (firstHeaderMatch) { |
// Process emojis before AsciiDoc conversion
|
||||||
const firstHeaderLevel = firstHeaderMatch[1].length; |
const processedContent = emoji.emojify(content); |
||||||
if (firstHeaderLevel >= 3) { |
|
||||||
doctype = 'book'; |
const asciidoctorOptions: any = { |
||||||
} |
safe: 'unsafe', |
||||||
} |
attributes: { |
||||||
|
'showtitle': true, |
||||||
try { |
'icons': 'font', |
||||||
const instance = getAsciidoctorInstance(); |
'source-highlighter': options.enableCodeHighlighting !== false ? 'highlight.js' : undefined, |
||||||
const result = instance.convert(content, { |
'highlightjs-theme': 'github', |
||||||
safe: 'safe', |
'toc': 'left', |
||||||
backend: 'html5', |
'toclevels': 6, |
||||||
doctype: doctype, |
'sectanchors': true, |
||||||
attributes: { |
'sectlinks': true, |
||||||
'showtitle': true, |
'idprefix': '_', |
||||||
'sectanchors': true, |
'idseparator': '_' |
||||||
'sectlinks': true, |
|
||||||
'toc': 'left', |
|
||||||
'toclevels': 6, |
|
||||||
'toc-title': 'Table of Contents', |
|
||||||
'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none', |
|
||||||
'stem': enableLaTeX ? 'latexmath' : 'none', |
|
||||||
'plantuml': 'plantuml', // Enable PlantUML diagram support
|
|
||||||
'data-uri': true, |
|
||||||
'imagesdir': '', |
|
||||||
'linkcss': false, |
|
||||||
'stylesheet': '', |
|
||||||
'stylesdir': '', |
|
||||||
'prewrap': true, |
|
||||||
'sectnums': false, |
|
||||||
'sectnumlevels': 6, |
|
||||||
'experimental': true, |
|
||||||
'compat-mode': false, |
|
||||||
'attribute-missing': 'warn', |
|
||||||
'attribute-undefined': 'warn', |
|
||||||
'skip-front-matter': true, |
|
||||||
'source-indent': 0, |
|
||||||
'indent': 0, |
|
||||||
'tabsize': 2, |
|
||||||
'tabwidth': 2, |
|
||||||
'hardbreaks': false, |
|
||||||
'paragraph-rewrite': 'normal', |
|
||||||
'sectids': true, |
|
||||||
'idprefix': '', |
|
||||||
'idseparator': '-', |
|
||||||
'sectidprefix': '', |
|
||||||
'sectidseparator': '-' |
|
||||||
} |
|
||||||
}); |
|
||||||
|
|
||||||
const htmlString = typeof result === 'string' ? result : result.toString(); |
|
||||||
|
|
||||||
// Extract table of contents from HTML
|
|
||||||
const { toc, contentWithoutTOC } = extractTOC(htmlString); |
|
||||||
|
|
||||||
// Sanitize HTML to prevent XSS
|
|
||||||
const sanitized = sanitizeHTML(contentWithoutTOC); |
|
||||||
|
|
||||||
// Post-process HTML: convert macros to HTML, add styling, etc.
|
|
||||||
const processed = postProcessHtml(sanitized, { |
|
||||||
enableMusicalNotation, |
|
||||||
linkBaseURL: options.linkBaseURL, |
|
||||||
wikilinkUrl: options.wikilinkUrl, |
|
||||||
hashtagUrl: options.hashtagUrl, |
|
||||||
}); |
|
||||||
|
|
||||||
// Process links: add target="_blank" to external links
|
|
||||||
const processedWithLinks = options.linkBaseURL
|
|
||||||
? processLinks(processed, options.linkBaseURL) |
|
||||||
: processed; |
|
||||||
|
|
||||||
// Also process TOC
|
|
||||||
const tocSanitized = sanitizeHTML(toc); |
|
||||||
const tocProcessed = postProcessHtml(tocSanitized, { |
|
||||||
enableMusicalNotation: false, // Don't process music in TOC
|
|
||||||
linkBaseURL: options.linkBaseURL, |
|
||||||
wikilinkUrl: options.wikilinkUrl, |
|
||||||
hashtagUrl: options.hashtagUrl, |
|
||||||
}); |
|
||||||
|
|
||||||
// Process links in TOC as well
|
|
||||||
const tocProcessedWithLinks = options.linkBaseURL |
|
||||||
? processLinks(tocProcessed, options.linkBaseURL) |
|
||||||
: tocProcessed; |
|
||||||
|
|
||||||
// Check for LaTeX in original content (more reliable than checking HTML)
|
|
||||||
const contentToCheck = options.originalContent || content; |
|
||||||
const hasLaTeX = enableLaTeX && hasMathContent(contentToCheck); |
|
||||||
|
|
||||||
// Check for musical notation in processed HTML
|
|
||||||
const hasMusicalNotation = enableMusicalNotation && ( |
|
||||||
/class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/.test(processed) |
|
||||||
); |
|
||||||
|
|
||||||
return { |
|
||||||
content: processedWithLinks, |
|
||||||
tableOfContents: tocProcessedWithLinks, |
|
||||||
hasLaTeX, |
|
||||||
hasMusicalNotation, |
|
||||||
nostrLinks: [], // Will be populated by metadata extraction
|
|
||||||
wikilinks: [], |
|
||||||
hashtags: [], |
|
||||||
links: [], |
|
||||||
media: [], |
|
||||||
}; |
|
||||||
} catch (error) { |
|
||||||
// Fallback to plain text with error logging
|
|
||||||
const errorMessage = error instanceof Error ? error.message : String(error); |
|
||||||
// Use process.stderr.write for Node.js compatibility instead of console.error
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const nodeProcess = (globalThis as any).process; |
|
||||||
if (nodeProcess?.stderr) { |
|
||||||
nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`); |
|
||||||
} |
} |
||||||
|
}; |
||||||
// Escape HTML in content for safe display
|
|
||||||
const escapedContent = sanitizeHTML(content); |
// Convert to HTML
|
||||||
|
const Asciidoctor = asciidoctor(); |
||||||
return { |
const htmlResult = Asciidoctor.convert(processedContent, asciidoctorOptions); |
||||||
content: `<p>${escapedContent}</p>`, |
const html = typeof htmlResult === 'string' ? htmlResult : htmlResult.toString(); |
||||||
tableOfContents: '', |
|
||||||
hasLaTeX: false, |
// Extract table of contents if present
|
||||||
hasMusicalNotation: false, |
const tocMatch = html.match(/<div id="toc"[^>]*>([\s\S]*?)<\/div>/); |
||||||
nostrLinks: [], |
const tableOfContents = tocMatch ? tocMatch[1] : ''; |
||||||
wikilinks: [], |
|
||||||
hashtags: [], |
// Remove TOC from main content if present
|
||||||
links: [], |
const contentWithoutToc = html.replace(/<div id="toc"[^>]*>[\s\S]*?<\/div>/, ''); |
||||||
media: [], |
|
||||||
}; |
return { |
||||||
} |
html: contentWithoutToc, |
||||||
} |
tableOfContents, |
||||||
|
hasLaTeX, |
||||||
/** |
hasMusicalNotation |
||||||
* Check if content has LaTeX math |
}; |
||||||
* Based on jumble's detection pattern |
|
||||||
*/ |
|
||||||
function hasMathContent(content: string): boolean { |
|
||||||
// Check for inline math: $...$ or \(...\)
|
|
||||||
const inlineMath = /\$[^$]+\$|\\\([^)]+\\\)/.test(content); |
|
||||||
|
|
||||||
// Check for block math: $$...$$ or \[...\]
|
|
||||||
const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content); |
|
||||||
|
|
||||||
return inlineMath || blockMath; |
|
||||||
} |
} |
||||||
|
|||||||
@ -1,693 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.postProcessHtml = postProcessHtml; |
|
||||||
const music_1 = require("./music"); |
|
||||||
/** |
|
||||||
* Post-processes HTML output from AsciiDoctor |
|
||||||
* Converts AsciiDoc macros to HTML with data attributes and CSS classes |
|
||||||
*/ |
|
||||||
function postProcessHtml(html, options = {}) { |
|
||||||
let processed = html; |
|
||||||
// Convert bookstr markers to HTML placeholders
|
|
||||||
processed = processed.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => { |
|
||||||
const escaped = bookContent.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`; |
|
||||||
}); |
|
||||||
// Convert hashtag links to HTML
|
|
||||||
processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => { |
|
||||||
// HTML escape the display text
|
|
||||||
const escapedDisplay = displayText |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
// If hashtagUrl is configured, make it a clickable link
|
|
||||||
if (options.hashtagUrl) { |
|
||||||
let url; |
|
||||||
if (typeof options.hashtagUrl === 'function') { |
|
||||||
url = options.hashtagUrl(normalizedHashtag); |
|
||||||
} |
|
||||||
else { |
|
||||||
// String template with {topic} placeholder
|
|
||||||
url = options.hashtagUrl.replace(/{topic}/g, normalizedHashtag); |
|
||||||
} |
|
||||||
// Escape URL for HTML attribute
|
|
||||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${normalizedHashtag.replace(/"/g, '"')}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
|
||||||
} |
|
||||||
else { |
|
||||||
// Default: Use span instead of <a> tag - same color as links but no underline and not clickable
|
|
||||||
return `<span class="hashtag-link">${escapedDisplay}</span>`; |
|
||||||
} |
|
||||||
}); |
|
||||||
// Convert WIKILINK:dtag|display placeholder format to HTML
|
|
||||||
// Match WIKILINK:dtag|display, ensuring we don't match across HTML tags
|
|
||||||
processed = processed.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => { |
|
||||||
const escapedDtag = dTag.trim().replace(/"/g, '"'); |
|
||||||
const escapedDisplay = displayText.trim() |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
// Generate URL using custom format or default
|
|
||||||
let url; |
|
||||||
if (options.wikilinkUrl) { |
|
||||||
if (typeof options.wikilinkUrl === 'function') { |
|
||||||
url = options.wikilinkUrl(dTag.trim()); |
|
||||||
} |
|
||||||
else { |
|
||||||
// String template with {dtag} placeholder
|
|
||||||
url = options.wikilinkUrl.replace(/{dtag}/g, dTag.trim()); |
|
||||||
} |
|
||||||
} |
|
||||||
else { |
|
||||||
// Default format
|
|
||||||
url = `/events?d=${escapedDtag}`; |
|
||||||
} |
|
||||||
// Escape URL for HTML attribute
|
|
||||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
|
||||||
}); |
|
||||||
// Convert any leftover link: macros that AsciiDoctor didn't convert
|
|
||||||
// This MUST run before processOpenGraphLinks which removes "link:" prefixes
|
|
||||||
// This handles cases where AsciiDoctor couldn't parse the link (e.g., link text with special chars)
|
|
||||||
// Pattern: link:url[text] where url is http/https and text can contain any characters
|
|
||||||
// Match link: macros that are still in the HTML as plain text (not converted by AsciiDoctor)
|
|
||||||
// Also handle HTML-escaped versions that might appear
|
|
||||||
processed = processed.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => { |
|
||||||
// Unescape if already HTML-escaped (but be careful not to unescape actual content)
|
|
||||||
let unescapedUrl = url; |
|
||||||
// Only unescape if it looks like it was escaped (contains & or ")
|
|
||||||
if (url.includes('&') || url.includes('"') || url.includes(''')) { |
|
||||||
unescapedUrl = url |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
} |
|
||||||
let unescapedText = text; |
|
||||||
// Only unescape if it looks like it was escaped
|
|
||||||
if (text.includes('&') || text.includes('<') || text.includes('>') || text.includes('"') || text.includes(''')) { |
|
||||||
unescapedText = text |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
} |
|
||||||
// Escape URL for HTML attribute (fresh escape, no double-escaping)
|
|
||||||
const escapedUrl = unescapedUrl |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
// Escape text content for HTML (fresh escape, no double-escaping)
|
|
||||||
const escapedText = unescapedText |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
// Check if link text contains wss:// or ws:// - these are relay URLs, don't add OpenGraph
|
|
||||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
|
||||||
if (isRelayUrl) { |
|
||||||
// Simple link without OpenGraph wrapper
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
} |
|
||||||
else { |
|
||||||
// Regular link - will be processed by OpenGraph handler if external
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
} |
|
||||||
}); |
|
||||||
// Convert nostr: links to HTML
|
|
||||||
processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => { |
|
||||||
const nostrType = getNostrType(bech32Id); |
|
||||||
if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') { |
|
||||||
// Render as embedded event placeholder
|
|
||||||
const escaped = bech32Id.replace(/"/g, '"'); |
|
||||||
return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`; |
|
||||||
} |
|
||||||
else if (nostrType === 'npub' || nostrType === 'nprofile') { |
|
||||||
// Render as user handle
|
|
||||||
const escaped = bech32Id.replace(/"/g, '"'); |
|
||||||
return `<span class="user-handle" data-pubkey="${escaped}">@${displayText}</span>`; |
|
||||||
} |
|
||||||
else { |
|
||||||
// Fallback to regular link
|
|
||||||
const escaped = bech32Id.replace(/"/g, '"'); |
|
||||||
return `<a href="nostr:${bech32Id}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${displayText}</a>`; |
|
||||||
} |
|
||||||
}); |
|
||||||
// Process media URLs (YouTube, Spotify, video, audio)
|
|
||||||
processed = processMedia(processed); |
|
||||||
// Fix double-escaped quotes in href attributes FIRST (before any other processing)
|
|
||||||
// This fixes href=""url"" -> href="url"
|
|
||||||
processed = processed.replace(/href\s*=\s*["']"(https?:\/\/[^"']+)"["']/gi, (_match, url) => { |
|
||||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `href="${escapedUrl}"`; |
|
||||||
}); |
|
||||||
// Process OpenGraph links (external links that should have rich previews)
|
|
||||||
processed = processOpenGraphLinks(processed, options.linkBaseURL); |
|
||||||
// Process images: add max-width styling and data attributes
|
|
||||||
processed = processImages(processed); |
|
||||||
// Process musical notation if enabled
|
|
||||||
if (options.enableMusicalNotation) { |
|
||||||
processed = (0, music_1.processMusicalNotation)(processed); |
|
||||||
} |
|
||||||
// Clean up any escaped HTML that appears as text (e.g., <a href=...>)
|
|
||||||
// This can happen when AsciiDoctor escapes link macros that it couldn't parse
|
|
||||||
// Pattern: <a href="url">text</a> should be converted to actual HTML
|
|
||||||
// Use a more flexible pattern that handles text with special characters like ://
|
|
||||||
// Fix regular escaped HTML links
|
|
||||||
processed = processed.replace(/<a\s+href=["'](https?:\/\/[^"']+)["']\s*>([^<]+)<\/a>/gi, (_match, url, text) => { |
|
||||||
// Unescape the URL and text
|
|
||||||
const unescapedUrl = url |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
const unescapedText = text |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>'); |
|
||||||
// Re-escape properly for HTML
|
|
||||||
const escapedUrl = unescapedUrl |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
const escapedText = unescapedText |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>'); |
|
||||||
// Check if link text contains wss:// or ws:// - these are relay URLs
|
|
||||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
|
||||||
if (isRelayUrl) { |
|
||||||
// Simple link without OpenGraph wrapper
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
} |
|
||||||
else { |
|
||||||
// Regular link
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
} |
|
||||||
}); |
|
||||||
// Clean up any leftover markdown syntax
|
|
||||||
processed = cleanupMarkdown(processed); |
|
||||||
// Add styling classes
|
|
||||||
processed = addStylingClasses(processed); |
|
||||||
// Hide raw ToC text
|
|
||||||
processed = hideRawTocText(processed); |
|
||||||
return processed; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Get Nostr identifier type |
|
||||||
*/ |
|
||||||
function getNostrType(id) { |
|
||||||
if (id.startsWith('npub')) |
|
||||||
return 'npub'; |
|
||||||
if (id.startsWith('nprofile')) |
|
||||||
return 'nprofile'; |
|
||||||
if (id.startsWith('nevent')) |
|
||||||
return 'nevent'; |
|
||||||
if (id.startsWith('naddr')) |
|
||||||
return 'naddr'; |
|
||||||
if (id.startsWith('note')) |
|
||||||
return 'note'; |
|
||||||
return null; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Process media URLs (YouTube, Spotify, video, audio) |
|
||||||
* Converts MEDIA: placeholders to HTML embeds/players |
|
||||||
*/ |
|
||||||
function processMedia(html) { |
|
||||||
let processed = html; |
|
||||||
// Process YouTube embeds
|
|
||||||
processed = processed.replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, (_match, videoId) => { |
|
||||||
const escapedId = videoId.replace(/"/g, '"'); |
|
||||||
return `<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">
|
|
||||||
<iframe
|
|
||||||
style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
|
|
||||||
src="https://www.youtube.com/embed/${escapedId}"
|
|
||||||
frameborder="0"
|
|
||||||
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
|
|
||||||
allowfullscreen |
|
||||||
loading="lazy"> |
|
||||||
</iframe> |
|
||||||
</div>`; |
|
||||||
}); |
|
||||||
// Process Spotify embeds
|
|
||||||
processed = processed.replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, (_match, type, id) => { |
|
||||||
const escapedType = type.replace(/"/g, '"'); |
|
||||||
const escapedId = id.replace(/"/g, '"'); |
|
||||||
return `<div class="media-embed spotify-embed" style="margin: 1rem 0;">
|
|
||||||
<iframe
|
|
||||||
style="border-radius: 12px; width: 100%; max-width: 100%;"
|
|
||||||
src="https://open.spotify.com/embed/${escapedType}/${escapedId}?utm_source=generator"
|
|
||||||
width="100%"
|
|
||||||
height="352"
|
|
||||||
frameborder="0"
|
|
||||||
allowfullscreen=""
|
|
||||||
allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"
|
|
||||||
loading="lazy"> |
|
||||||
</iframe> |
|
||||||
</div>`; |
|
||||||
}); |
|
||||||
// Process video files
|
|
||||||
processed = processed.replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => { |
|
||||||
const escapedUrl = url |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
return `<div class="media-embed video-embed" style="margin: 1rem 0;">
|
|
||||||
<video
|
|
||||||
controls
|
|
||||||
preload="metadata"
|
|
||||||
style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;" |
|
||||||
class="media-player"> |
|
||||||
<source src="${escapedUrl}" type="video/mp4"> |
|
||||||
Your browser does not support the video tag. |
|
||||||
</video> |
|
||||||
</div>`; |
|
||||||
}); |
|
||||||
// Process audio files
|
|
||||||
processed = processed.replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => { |
|
||||||
const escapedUrl = url |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
return `<div class="media-embed audio-embed" style="margin: 1rem 0;">
|
|
||||||
<audio
|
|
||||||
controls
|
|
||||||
preload="metadata"
|
|
||||||
style="width: 100%; max-width: 100%;" |
|
||||||
class="media-player"> |
|
||||||
<source src="${escapedUrl}"> |
|
||||||
Your browser does not support the audio tag. |
|
||||||
</audio> |
|
||||||
</div>`; |
|
||||||
}); |
|
||||||
return processed; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Process OpenGraph links - mark external links for OpenGraph preview fetching |
|
||||||
*/ |
|
||||||
function processOpenGraphLinks(html, linkBaseURL) { |
|
||||||
// First, clean up any corrupted HTML fragments that might interfere
|
|
||||||
// Remove "link:" prefixes that appear before links (AsciiDoc syntax that shouldn't be in HTML)
|
|
||||||
// This happens when AsciiDoctor doesn't fully convert link:url[text] syntax or when
|
|
||||||
// there's literal text like "should render like link:" before an anchor tag
|
|
||||||
let processed = html; |
|
||||||
// Remove "link:" that appears immediately before anchor tags (most common case)
|
|
||||||
// Match "link:" followed by optional whitespace and then <a
|
|
||||||
processed = processed.replace(/link:\s*<a/gi, '<a'); |
|
||||||
// Remove "link:" that appears as plain text in HTML (shouldn't be there)
|
|
||||||
// Be careful not to match "link:" inside HTML attributes or tags
|
|
||||||
// Match "link:" that's not inside quotes or tags
|
|
||||||
processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2'); |
|
||||||
// Also handle cases where "link:" appears with whitespace before anchor tags
|
|
||||||
processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' '); |
|
||||||
// Clean up any corrupted href attributes that contain HTML fragments or double-escaped quotes
|
|
||||||
// Fix href attributes with escaped quotes: href=""url"" -> href="url"
|
|
||||||
processed = processed.replace(/href\s*=\s*["']"(https?:\/\/[^"']+)"["']/gi, (match, url) => { |
|
||||||
// Extract the clean URL and properly escape it
|
|
||||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `href="${escapedUrl}"`; |
|
||||||
}); |
|
||||||
// Clean up href attributes that contain HTML fragments
|
|
||||||
processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => { |
|
||||||
// If href contains HTML tags, extract just the URL part
|
|
||||||
const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i); |
|
||||||
if (urlMatch) { |
|
||||||
const escapedUrl = urlMatch[1].replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `href="${escapedUrl}"`; |
|
||||||
} |
|
||||||
return match; // If we can't fix it, leave it (will be skipped by validation)
|
|
||||||
}); |
|
||||||
// Clean up any malformed anchor tag fragments that might cause issues
|
|
||||||
processed = processed.replace(/<a\s+href=["']([^"'>]*<[^"'>]*)["']/gi, (match, corruptedHref) => { |
|
||||||
// Skip corrupted anchor tags - they'll be handled by the main regex with validation
|
|
||||||
return match; |
|
||||||
}); |
|
||||||
// Clean up links inside code blocks - AsciiDoctor creates them but they should be plain text
|
|
||||||
// Remove <a> tags inside <code> blocks, keeping only the link text
|
|
||||||
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match, content) => { |
|
||||||
// Remove any <a> tags inside code blocks, keeping only the text content
|
|
||||||
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1'); |
|
||||||
return `<code>${cleaned}</code>`; |
|
||||||
}); |
|
||||||
// Also clean up links inside pre blocks
|
|
||||||
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match, content) => { |
|
||||||
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1'); |
|
||||||
return `<pre>${cleaned}</pre>`; |
|
||||||
}); |
|
||||||
// Now protect code blocks and pre blocks by replacing them with placeholders
|
|
||||||
const codeBlockPlaceholders = []; |
|
||||||
const preBlockPlaceholders = []; |
|
||||||
// Replace pre blocks first (they can contain code blocks)
|
|
||||||
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => { |
|
||||||
const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`; |
|
||||||
preBlockPlaceholders.push(match); |
|
||||||
return placeholder; |
|
||||||
}); |
|
||||||
// Replace code blocks
|
|
||||||
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => { |
|
||||||
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`; |
|
||||||
codeBlockPlaceholders.push(match); |
|
||||||
return placeholder; |
|
||||||
}); |
|
||||||
// Extract base domain from linkBaseURL if provided
|
|
||||||
let baseDomain = null; |
|
||||||
if (linkBaseURL) { |
|
||||||
try { |
|
||||||
const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/); |
|
||||||
if (urlMatch) { |
|
||||||
baseDomain = urlMatch[1]; |
|
||||||
} |
|
||||||
} |
|
||||||
catch { |
|
||||||
// Ignore parsing errors
|
|
||||||
} |
|
||||||
} |
|
||||||
// Before processing, remove any corrupted opengraph containers that might have been created
|
|
||||||
// These have malformed data-og-url attributes containing HTML fragments
|
|
||||||
// Match all spans with data-og-url and check if they're corrupted
|
|
||||||
// Use a pattern that matches spans with data-og-url, then check the attribute value
|
|
||||||
processed = processed.replace(/<span[^>]*data-og-url=["']([^"']+)["'][^>]*>[\s\S]*?<\/span>/gi, (match) => { |
|
||||||
// This span has a corrupted data-og-url (contains <)
|
|
||||||
// Extract the clean URL from the beginning of the attribute value
|
|
||||||
const dataOgUrlMatch = match.match(/data-og-url=["']([^"']+)["']/i); |
|
||||||
if (dataOgUrlMatch && dataOgUrlMatch[1]) { |
|
||||||
// Extract just the URL part (everything before the first <)
|
|
||||||
const urlMatch = dataOgUrlMatch[1].match(/(https?:\/\/[^\s<>"']+)/i); |
|
||||||
if (urlMatch) { |
|
||||||
const cleanUrl = urlMatch[1]; |
|
||||||
// Extract the link text from inside the span
|
|
||||||
const linkMatch = match.match(/<a[^>]*>(.*?)<\/a>/i); |
|
||||||
const linkText = linkMatch ? linkMatch[1] : cleanUrl; |
|
||||||
// Return a clean opengraph container with the fixed URL
|
|
||||||
const escapedUrl = cleanUrl.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
|
|
||||||
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a> |
|
||||||
<div class="opengraph-preview" data-og-loading="true" style="display: none;"> |
|
||||||
<div class="opengraph-card"> |
|
||||||
<div class="opengraph-image-container"> |
|
||||||
<img class="opengraph-image" src="" alt="" style="display: none;" /> |
|
||||||
</div> |
|
||||||
<div class="opengraph-content"> |
|
||||||
<div class="opengraph-site"></div> |
|
||||||
<div class="opengraph-title"></div> |
|
||||||
<div class="opengraph-description"></div> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
</span>`; |
|
||||||
} |
|
||||||
// If we can't extract a clean URL, just remove the corrupted span and keep any text
|
|
||||||
const textMatch = match.match(/>([^<]+)</); |
|
||||||
return textMatch ? textMatch[1] : ''; |
|
||||||
} |
|
||||||
return match; // Keep valid spans
|
|
||||||
}); |
|
||||||
// Match external links (http/https) that aren't media, nostr, or wikilinks
|
|
||||||
// Skip links that are already in media embeds or special containers
|
|
||||||
// Use a stricter regex that only matches valid, complete anchor tags
|
|
||||||
// The regex must match a complete <a> tag with proper structure
|
|
||||||
processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => { |
|
||||||
// CRITICAL: Validate href FIRST - if it contains ANY HTML tags or fragments, skip immediately
|
|
||||||
// This prevents corrupted HTML from being created
|
|
||||||
if (!href) { |
|
||||||
return match; // Skip if no href
|
|
||||||
} |
|
||||||
// Skip if href contains HTML tags or looks corrupted - be very strict
|
|
||||||
// Check for common HTML fragments that indicate corruption
|
|
||||||
if (href.includes('<') || href.includes('>') || href.includes('href=') || href.includes('</a>') || href.includes('<a') || href.includes('"') || href.includes("'")) { |
|
||||||
return match; // Skip if href looks corrupted
|
|
||||||
} |
|
||||||
// Additional validation: href should only contain URL-safe characters
|
|
||||||
// URLs shouldn't contain unescaped quotes or HTML tags
|
|
||||||
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) { |
|
||||||
return match; // Skip if href doesn't match clean URL pattern
|
|
||||||
} |
|
||||||
// Validate href is a proper URL (starts with http:// or https:// and doesn't contain invalid chars)
|
|
||||||
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) { |
|
||||||
return match; // Skip if href doesn't match URL pattern
|
|
||||||
} |
|
||||||
// Skip if the match contains unclosed tags or corrupted HTML
|
|
||||||
const openATags = (match.match(/<a\s/g) || []).length; |
|
||||||
const closeATags = (match.match(/<\/a>/g) || []).length; |
|
||||||
if (openATags !== closeATags || openATags !== 1) { |
|
||||||
return match; // Multiple or mismatched <a> tags = corrupted
|
|
||||||
} |
|
||||||
// Skip if match contains nested HTML that looks corrupted
|
|
||||||
if (match.includes('href="') && match.split('href="').length > 2) { |
|
||||||
return match; // Multiple href attributes = corrupted
|
|
||||||
} |
|
||||||
// Skip if it's already a media embed, nostr link, wikilink, or opengraph link
|
|
||||||
if (match.includes('class="wikilink"') || |
|
||||||
match.includes('class="nostr-link"') || |
|
||||||
match.includes('class="opengraph-link"') || |
|
||||||
match.includes('data-embedded-note') || |
|
||||||
match.includes('youtube-embed') || |
|
||||||
match.includes('spotify-embed') || |
|
||||||
match.includes('media-embed') || |
|
||||||
match.includes('opengraph-link-container')) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
// Skip if it's a media file URL
|
|
||||||
if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
// Skip if it's YouTube or Spotify (already handled as media)
|
|
||||||
if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
// Skip if link text contains wss:// or ws:// - these are relay URLs, not web pages
|
|
||||||
// They don't need OpenGraph previews
|
|
||||||
if (/wss?:\/\//i.test(linkText)) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
// Check if it's an external link (not same domain)
|
|
||||||
let isExternal = true; |
|
||||||
if (baseDomain) { |
|
||||||
try { |
|
||||||
const hrefMatch = href.match(/^https?:\/\/([^\/]+)/); |
|
||||||
if (hrefMatch && hrefMatch[1] === baseDomain) { |
|
||||||
isExternal = false; |
|
||||||
} |
|
||||||
} |
|
||||||
catch { |
|
||||||
// If parsing fails, assume external
|
|
||||||
} |
|
||||||
} |
|
||||||
// Only process external links
|
|
||||||
if (!isExternal) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
// Escape the URL for data attribute
|
|
||||||
const escapedUrl = href |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
// Add data attribute for OpenGraph fetching and wrap in container
|
|
||||||
// The actual OpenGraph fetching will be done client-side via JavaScript
|
|
||||||
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
|
|
||||||
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a> |
|
||||||
<div class="opengraph-preview" data-og-loading="true" style="display: none;"> |
|
||||||
<div class="opengraph-card"> |
|
||||||
<div class="opengraph-image-container"> |
|
||||||
<img class="opengraph-image" src="" alt="" style="display: none;" /> |
|
||||||
</div> |
|
||||||
<div class="opengraph-content"> |
|
||||||
<div class="opengraph-site"></div> |
|
||||||
<div class="opengraph-title"></div> |
|
||||||
<div class="opengraph-description"></div> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
</span>`; |
|
||||||
}); |
|
||||||
// Restore code blocks
|
|
||||||
codeBlockPlaceholders.forEach((codeBlock, index) => { |
|
||||||
processed = processed.replace(`__CODEBLOCK_${index}__`, codeBlock); |
|
||||||
}); |
|
||||||
// Restore pre blocks
|
|
||||||
preBlockPlaceholders.forEach((preBlock, index) => { |
|
||||||
processed = processed.replace(`__PREBLOCK_${index}__`, preBlock); |
|
||||||
}); |
|
||||||
return processed; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Process images: add max-width styling and data attributes |
|
||||||
*/ |
|
||||||
function processImages(html) { |
|
||||||
const imageUrls = []; |
|
||||||
const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi; |
|
||||||
let match; |
|
||||||
while ((match = imageUrlRegex.exec(html)) !== null) { |
|
||||||
const url = match[1]; |
|
||||||
if (url && !imageUrls.includes(url)) { |
|
||||||
imageUrls.push(url); |
|
||||||
} |
|
||||||
} |
|
||||||
return html.replace(/<img([^>]+)>/gi, (imgTag, attributes) => { |
|
||||||
const srcMatch = attributes.match(/src=["']([^"']+)["']/i); |
|
||||||
if (!srcMatch) |
|
||||||
return imgTag; |
|
||||||
const src = srcMatch[1]; |
|
||||||
const currentIndex = imageUrls.indexOf(src); |
|
||||||
let updatedAttributes = attributes; |
|
||||||
if (updatedAttributes.match(/class=["']/i)) { |
|
||||||
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => { |
|
||||||
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim(); |
|
||||||
const newClasses = cleanedClasses |
|
||||||
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in` |
|
||||||
: 'max-w-[400px] object-contain cursor-zoom-in'; |
|
||||||
return `class="${newClasses}"`; |
|
||||||
}); |
|
||||||
} |
|
||||||
else { |
|
||||||
updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`; |
|
||||||
} |
|
||||||
updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '"')}"`; |
|
||||||
return `<img${updatedAttributes}>`; |
|
||||||
}); |
|
||||||
} |
|
||||||
/** |
|
||||||
* Clean URL by removing tracking parameters |
|
||||||
* Based on jumble's cleanUrl function |
|
||||||
*/ |
|
||||||
function cleanUrl(url) { |
|
||||||
try { |
|
||||||
const parsedUrl = new URL(url); |
|
||||||
// List of tracking parameter prefixes and exact names to remove
|
|
||||||
const trackingParams = [ |
|
||||||
// Google Analytics & Ads
|
|
||||||
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', |
|
||||||
'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic', |
|
||||||
'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid', |
|
||||||
// Facebook
|
|
||||||
'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref', |
|
||||||
// Twitter/X
|
|
||||||
'twclid', 'twsrc', |
|
||||||
// Microsoft/Bing
|
|
||||||
'msclkid', 'mc_cid', 'mc_eid', |
|
||||||
// Adobe
|
|
||||||
'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid', |
|
||||||
// Mailchimp
|
|
||||||
'mc_cid', 'mc_eid', |
|
||||||
// HubSpot
|
|
||||||
'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver', |
|
||||||
// Marketo
|
|
||||||
'mkt_tok', |
|
||||||
// YouTube
|
|
||||||
'si', 'feature', 'kw', 'pp', |
|
||||||
// Other common tracking
|
|
||||||
'ref', 'referrer', 'source', 'campaign', 'medium', 'content', |
|
||||||
'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd', |
|
||||||
// Mobile app tracking
|
|
||||||
'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative', |
|
||||||
// Amazon
|
|
||||||
'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag', |
|
||||||
// Affiliate tracking
|
|
||||||
'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer', |
|
||||||
// Social media share tracking
|
|
||||||
'share', 'shared', 'sharesource' |
|
||||||
]; |
|
||||||
// Remove all tracking parameters
|
|
||||||
trackingParams.forEach(param => { |
|
||||||
parsedUrl.searchParams.delete(param); |
|
||||||
}); |
|
||||||
// Remove any parameter that starts with utm_ or _
|
|
||||||
Array.from(parsedUrl.searchParams.keys()).forEach(key => { |
|
||||||
if (key.startsWith('utm_') || key.startsWith('_')) { |
|
||||||
parsedUrl.searchParams.delete(key); |
|
||||||
} |
|
||||||
}); |
|
||||||
return parsedUrl.toString(); |
|
||||||
} |
|
||||||
catch { |
|
||||||
// If URL parsing fails, return original URL
|
|
||||||
return url; |
|
||||||
} |
|
||||||
} |
|
||||||
/** |
|
||||||
* Clean up leftover markdown syntax |
|
||||||
*/ |
|
||||||
function cleanupMarkdown(html) { |
|
||||||
let cleaned = html; |
|
||||||
// Clean up markdown image syntax
|
|
||||||
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => { |
|
||||||
const altText = alt || ''; |
|
||||||
// Clean URL (remove tracking parameters)
|
|
||||||
const cleanedUrl = cleanUrl(url); |
|
||||||
// Escape for HTML attribute
|
|
||||||
const escapedUrl = cleanedUrl.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
return `<img src="${escapedUrl}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`; |
|
||||||
}); |
|
||||||
// Clean up markdown link syntax
|
|
||||||
// Skip if the link is already inside an HTML tag or is part of escaped HTML
|
|
||||||
cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => { |
|
||||||
// Skip if this markdown link is already inside an HTML tag
|
|
||||||
// Check if there's an <a> tag nearby that might have been created from this
|
|
||||||
if (cleaned.includes(`href="${url}"`) || cleaned.includes(`href='${url}'`)) { |
|
||||||
return _match; |
|
||||||
} |
|
||||||
// Skip if the text contains HTML entities or looks like it's already processed
|
|
||||||
if (text.includes('<') || text.includes('>') || text.includes('&')) { |
|
||||||
return _match; |
|
||||||
} |
|
||||||
// Skip if the URL is already in an href attribute (check for escaped versions too)
|
|
||||||
const escapedUrl = url.replace(/"/g, '"').replace(/'/g, '''); |
|
||||||
if (cleaned.includes(`href="${escapedUrl}"`) || cleaned.includes(`href='${escapedUrl}'`)) { |
|
||||||
return _match; |
|
||||||
} |
|
||||||
// Clean URL (remove tracking parameters)
|
|
||||||
const cleanedUrl = cleanUrl(url); |
|
||||||
// Escape for HTML attribute (but don't double-escape)
|
|
||||||
const finalEscapedUrl = cleanedUrl |
|
||||||
.replace(/&/g, '&') // Unescape if already escaped
|
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
// Escape text for HTML (but don't double-escape)
|
|
||||||
const escapedText = text |
|
||||||
.replace(/&/g, '&') // Unescape if already escaped
|
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>'); |
|
||||||
return `<a href="${finalEscapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
}); |
|
||||||
return cleaned; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Add proper CSS classes for styling |
|
||||||
*/ |
|
||||||
function addStylingClasses(html) { |
|
||||||
let styled = html; |
|
||||||
// Add strikethrough styling
|
|
||||||
styled = styled.replace(/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>'); |
|
||||||
// Add subscript styling
|
|
||||||
styled = styled.replace(/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>'); |
|
||||||
// Add superscript styling
|
|
||||||
styled = styled.replace(/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>'); |
|
||||||
// Add code highlighting classes
|
|
||||||
styled = styled.replace(/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">'); |
|
||||||
styled = styled.replace(/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">'); |
|
||||||
return styled; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Hide raw AsciiDoc ToC text |
|
||||||
*/ |
|
||||||
function hideRawTocText(html) { |
|
||||||
let cleaned = html; |
|
||||||
cleaned = cleaned.replace(/<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi, ''); |
|
||||||
return cleaned; |
|
||||||
} |
|
||||||
@ -1,599 +0,0 @@ |
|||||||
import { processMusicalNotation } from './music'; |
|
||||||
|
|
||||||
export interface PostProcessOptions { |
|
||||||
enableMusicalNotation?: boolean; |
|
||||||
linkBaseURL?: string; |
|
||||||
/** Custom URL format for wikilinks */ |
|
||||||
wikilinkUrl?: string | ((dtag: string) => string); |
|
||||||
/** Custom URL format for hashtags */ |
|
||||||
hashtagUrl?: string | ((topic: string) => string); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Post-processes HTML output from AsciiDoctor |
|
||||||
*
|
|
||||||
* Processing order (critical for correct rendering): |
|
||||||
* 1. Convert placeholders to HTML (BOOKSTR, hashtags, wikilinks, nostr links, media, link macros) |
|
||||||
* 2. Fix corrupted HTML (double-escaped quotes, escaped HTML as text, broken links) |
|
||||||
* 3. Process OpenGraph links (external links with previews) |
|
||||||
* 4. Process images (add styling) |
|
||||||
* 5. Process musical notation |
|
||||||
* 6. Clean up leftover markdown syntax |
|
||||||
* 7. Add styling classes |
|
||||||
* 8. Hide raw ToC text |
|
||||||
*/ |
|
||||||
export function postProcessHtml(html: string, options: PostProcessOptions = {}): string { |
|
||||||
let processed = html; |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 1: Convert placeholders to HTML
|
|
||||||
// ============================================
|
|
||||||
processed = convertBookstrMarkers(processed); |
|
||||||
processed = convertHashtags(processed, options); |
|
||||||
processed = convertWikilinks(processed, options); |
|
||||||
processed = convertNostrLinks(processed); |
|
||||||
processed = convertMediaPlaceholders(processed); |
|
||||||
processed = convertLinkMacros(processed); |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 2: Fix corrupted HTML
|
|
||||||
// ============================================
|
|
||||||
processed = fixDoubleEscapedQuotes(processed); |
|
||||||
processed = fixEscapedHtmlLinks(processed); |
|
||||||
processed = fixBrokenLinkPatterns(processed); |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 3: Process OpenGraph links
|
|
||||||
// ============================================
|
|
||||||
processed = processOpenGraphLinks(processed, options.linkBaseURL); |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 4: Process images
|
|
||||||
// ============================================
|
|
||||||
processed = processImages(processed); |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 5: Process musical notation
|
|
||||||
// ============================================
|
|
||||||
if (options.enableMusicalNotation) { |
|
||||||
processed = processMusicalNotation(processed); |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 6: Clean up leftover markdown
|
|
||||||
// ============================================
|
|
||||||
processed = cleanupMarkdown(processed); |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 7: Add styling classes
|
|
||||||
// ============================================
|
|
||||||
processed = addStylingClasses(processed); |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 8: Hide raw ToC text
|
|
||||||
// ============================================
|
|
||||||
processed = hideRawTocText(processed); |
|
||||||
|
|
||||||
return processed; |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 1: Convert placeholders to HTML
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/** |
|
||||||
* Convert BOOKSTR markers to HTML placeholders |
|
||||||
*/ |
|
||||||
function convertBookstrMarkers(html: string): string { |
|
||||||
return html.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => { |
|
||||||
const escaped = escapeHtmlAttr(bookContent); |
|
||||||
return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Convert hashtag placeholders to HTML |
|
||||||
*/ |
|
||||||
function convertHashtags(html: string, options: PostProcessOptions): string { |
|
||||||
return html.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => { |
|
||||||
const escapedDisplay = escapeHtml(displayText); |
|
||||||
|
|
||||||
if (options.hashtagUrl) { |
|
||||||
let url: string; |
|
||||||
if (typeof options.hashtagUrl === 'function') { |
|
||||||
url = options.hashtagUrl(normalizedHashtag); |
|
||||||
} else { |
|
||||||
url = options.hashtagUrl.replace(/{topic}/g, normalizedHashtag); |
|
||||||
} |
|
||||||
|
|
||||||
const escapedUrl = escapeHtmlAttr(url); |
|
||||||
const escapedTopic = escapeHtmlAttr(normalizedHashtag); |
|
||||||
|
|
||||||
return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${escapedTopic}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
|
||||||
} else { |
|
||||||
return `<span class="hashtag-link">${escapedDisplay}</span>`; |
|
||||||
} |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Convert wikilink placeholders to HTML |
|
||||||
*/ |
|
||||||
function convertWikilinks(html: string, options: PostProcessOptions): string { |
|
||||||
return html.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => { |
|
||||||
const escapedDtag = escapeHtmlAttr(dTag.trim()); |
|
||||||
const escapedDisplay = escapeHtml(displayText.trim()); |
|
||||||
|
|
||||||
let url: string; |
|
||||||
if (options.wikilinkUrl) { |
|
||||||
if (typeof options.wikilinkUrl === 'function') { |
|
||||||
url = options.wikilinkUrl(dTag.trim()); |
|
||||||
} else { |
|
||||||
url = options.wikilinkUrl.replace(/{dtag}/g, dTag.trim()); |
|
||||||
} |
|
||||||
} else { |
|
||||||
url = `/events?d=${escapedDtag}`; |
|
||||||
} |
|
||||||
|
|
||||||
const escapedUrl = escapeHtmlAttr(url); |
|
||||||
|
|
||||||
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Convert nostr: links to HTML |
|
||||||
*/ |
|
||||||
function convertNostrLinks(html: string): string { |
|
||||||
return html.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => { |
|
||||||
const nostrType = getNostrType(bech32Id); |
|
||||||
const escaped = escapeHtmlAttr(bech32Id); |
|
||||||
const escapedDisplay = escapeHtml(displayText); |
|
||||||
|
|
||||||
if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') { |
|
||||||
return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`; |
|
||||||
} else if (nostrType === 'npub' || nostrType === 'nprofile') { |
|
||||||
return `<span class="user-handle" data-pubkey="${escaped}">@${escapedDisplay}</span>`; |
|
||||||
} else { |
|
||||||
return `<a href="nostr:${bech32Id}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${escapedDisplay}</a>`; |
|
||||||
} |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Get Nostr identifier type |
|
||||||
*/ |
|
||||||
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null { |
|
||||||
if (id.startsWith('npub')) return 'npub'; |
|
||||||
if (id.startsWith('nprofile')) return 'nprofile'; |
|
||||||
if (id.startsWith('nevent')) return 'nevent'; |
|
||||||
if (id.startsWith('naddr')) return 'naddr'; |
|
||||||
if (id.startsWith('note')) return 'note'; |
|
||||||
return null; |
|
||||||
} |
|
||||||
|
|
||||||
/**
 * Convert media placeholders to HTML embeds.
 *
 * Recognized placeholder forms (inserted by an earlier conversion pass):
 *  - MEDIA:youtube:<videoId>    -> responsive YouTube iframe
 *  - MEDIA:spotify:<type>:<id>  -> Spotify embed iframe
 *  - MEDIA:video:<http(s)-url>  -> native <video> player (mp4 source)
 *  - MEDIA:audio:<http(s)-url>  -> native <audio> player
 *
 * All interpolated values are attribute-escaped before insertion.
 *
 * NOTE(review): the internal indentation of the multi-line template literals
 * below is part of the emitted HTML; it was reconstructed from a mangled
 * source and should be confirmed against the original if exact whitespace
 * of the output matters.
 */
function convertMediaPlaceholders(html: string): string {
  let processed = html;

  // YouTube embeds (16:9 responsive wrapper via the padding-bottom trick)
  processed = processed.replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, (_match, videoId) => {
    const escapedId = escapeHtmlAttr(videoId);
    return `<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">
  <iframe
    style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
    src="https://www.youtube.com/embed/${escapedId}"
    frameborder="0"
    allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
    allowfullscreen
    loading="lazy">
  </iframe>
</div>`;
  });

  // Spotify embeds (track/album/playlist/artist/episode/show)
  processed = processed.replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, (_match, type, id) => {
    const escapedType = escapeHtmlAttr(type);
    const escapedId = escapeHtmlAttr(id);
    return `<div class="media-embed spotify-embed" style="margin: 1rem 0;">
  <iframe
    style="border-radius: 12px; width: 100%; max-width: 100%;"
    src="https://open.spotify.com/embed/${escapedType}/${escapedId}?utm_source=generator"
    width="100%"
    height="352"
    frameborder="0"
    allowfullscreen=""
    allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"
    loading="lazy">
  </iframe>
</div>`;
  });

  // Video files (direct URL; always declared as video/mp4)
  processed = processed.replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
    const escapedUrl = escapeHtmlAttr(url);
    return `<div class="media-embed video-embed" style="margin: 1rem 0;">
  <video
    controls
    preload="metadata"
    style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;"
    class="media-player">
    <source src="${escapedUrl}" type="video/mp4">
    Your browser does not support the video tag.
  </video>
</div>`;
  });

  // Audio files (direct URL; browser sniffs the type from the source)
  processed = processed.replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
    const escapedUrl = escapeHtmlAttr(url);
    return `<div class="media-embed audio-embed" style="margin: 1rem 0;">
  <audio
    controls
    preload="metadata"
    style="width: 100%; max-width: 100%;"
    class="media-player">
    <source src="${escapedUrl}">
    Your browser does not support the audio tag.
  </audio>
</div>`;
  });

  return processed;
}
|
||||||
|
|
||||||
/** |
|
||||||
* Convert link: macros that AsciiDoctor didn't convert |
|
||||||
* This handles cases where AsciiDoctor couldn't parse the link (e.g., link text with special chars) |
|
||||||
*/ |
|
||||||
function convertLinkMacros(html: string): string { |
|
||||||
return html.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => { |
|
||||||
// Unescape if already HTML-escaped
|
|
||||||
const unescapedUrl = unescapeHtml(url); |
|
||||||
const unescapedText = unescapeHtml(text); |
|
||||||
|
|
||||||
// Re-escape properly for HTML
|
|
||||||
const escapedUrl = escapeHtmlAttr(unescapedUrl); |
|
||||||
const escapedText = escapeHtml(unescapedText); |
|
||||||
|
|
||||||
// Check if link text contains wss:// or ws:// - these are relay URLs, don't add OpenGraph
|
|
||||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
|
||||||
|
|
||||||
// Create link (OpenGraph processing will handle it later if needed)
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 2: Fix corrupted HTML
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/** |
|
||||||
* Fix double-escaped quotes in href attributes: href=""url"" -> href="url" |
|
||||||
*/ |
|
||||||
function fixDoubleEscapedQuotes(html: string): string { |
|
||||||
return html.replace(/href\s*=\s*["']"(https?:\/\/[^"']+)"["']/gi, (_match, url) => { |
|
||||||
const escapedUrl = escapeHtmlAttr(url); |
|
||||||
return `href="${escapedUrl}"`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Fix escaped HTML links: <a href="...">text</a> -> <a href="...">text</a> |
|
||||||
*/ |
|
||||||
function fixEscapedHtmlLinks(html: string): string { |
|
||||||
return html.replace(/<a\s+href=["'](https?:\/\/[^"']+)["']\s*>([^<]+)<\/a>/gi, (_match, url, text) => { |
|
||||||
const unescapedUrl = unescapeHtml(url); |
|
||||||
const unescapedText = unescapeHtml(text); |
|
||||||
|
|
||||||
const escapedUrl = escapeHtmlAttr(unescapedUrl); |
|
||||||
const escapedText = escapeHtml(unescapedText); |
|
||||||
|
|
||||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
|
||||||
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Fix broken link patterns where attributes appear as text before escaped HTML |
|
||||||
* Pattern: " target=...><a href=...>text</a> |
|
||||||
*/ |
|
||||||
function fixBrokenLinkPatterns(html: string): string { |
|
||||||
return html.replace(/"\s+target=["'][^"']*["']\s+rel=["'][^"']*["']\s+class=["'][^"']*["']\s*><a\s+href=["'](https?:\/\/[^"']+)["']\s*>([^<]+)<\/a>/gi, (_match, url, text) => { |
|
||||||
const unescapedUrl = unescapeHtml(url); |
|
||||||
const unescapedText = unescapeHtml(text); |
|
||||||
|
|
||||||
const escapedUrl = escapeHtmlAttr(unescapedUrl); |
|
||||||
const escapedText = escapeHtml(unescapedText); |
|
||||||
|
|
||||||
const isRelayUrl = /wss?:\/\//i.test(unescapedText); |
|
||||||
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 3: Process OpenGraph links
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/**
 * Process OpenGraph links - mark external links for OpenGraph preview fetching.
 *
 * Stages within this step, in order:
 *  1. strip stray "link:" prefixes left in front of anchors
 *  2. repair href attributes that contain leaked markup
 *  3. shield <pre>/<code> contents from link rewriting
 *  4. wrap qualifying external links in an OpenGraph preview container
 *  5. restore the shielded <pre>/<code> blocks
 *
 * @param html - HTML fragment to process
 * @param linkBaseURL - base URL of this site; links whose host matches it are
 *   treated as internal and left untouched
 *
 * NOTE(review): the internal indentation of the multi-line template literal
 * below is part of the emitted HTML and was reconstructed from a mangled
 * source — confirm if exact output whitespace matters.
 */
function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
  let processed = html;

  // Remove "link:" prefixes that might appear before anchor tags
  processed = processed.replace(/link:\s*<a/gi, '<a');
  processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2');
  processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' ');

  // Clean up corrupted href attributes: an href containing '<' means markup
  // leaked into the attribute; salvage the first plain URL inside it.
  processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => {
    const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i);
    if (urlMatch) {
      const escapedUrl = escapeHtmlAttr(urlMatch[1]);
      return `href="${escapedUrl}"`;
    }
    return match;
  });

  // Protect code blocks and pre blocks: swap them for placeholders so their
  // contents are not rewritten as links. <pre> first, since it may contain <code>.
  const codeBlockPlaceholders: string[] = [];
  const preBlockPlaceholders: string[] = [];

  processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => {
    const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`;
    preBlockPlaceholders.push(match);
    return placeholder;
  });

  processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => {
    const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
    codeBlockPlaceholders.push(match);
    return placeholder;
  });

  // Extract base domain (host part of linkBaseURL) for internal-link detection.
  let baseDomain: string | null = null;
  if (linkBaseURL) {
    const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
    if (urlMatch) {
      baseDomain = urlMatch[1];
    }
  }

  // Process external links. `before`/`after` capture surrounding attribute
  // text; they are matched but intentionally unused in the replacement.
  processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => {
    // Validate href: reject anything with leaked markup or invalid characters.
    if (!href || href.includes('<') || href.includes('>') || !/^https?:\/\/[^\s<>"']+$/i.test(href)) {
      return match;
    }

    // Skip if already processed by an earlier stage.
    if (match.includes('class="wikilink"') ||
        match.includes('class="nostr-link"') ||
        match.includes('class="opengraph-link"') ||
        match.includes('data-embedded-note') ||
        match.includes('media-embed') ||
        match.includes('opengraph-link-container')) {
      return match;
    }

    // Skip media files (handled by the media-embed stage, or plain downloads).
    if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) {
      return match;
    }

    // Skip YouTube/Spotify (already handled as media)
    if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) {
      return match;
    }

    // Skip if link text contains wss:// or ws:// - these are relay URLs, not web pages
    if (/wss?:\/\//i.test(linkText)) {
      return match;
    }

    // Check if external: same host as linkBaseURL counts as internal.
    let isExternal = true;
    if (baseDomain) {
      const hrefMatch = href.match(/^https?:\/\/([^\/]+)/);
      if (hrefMatch && hrefMatch[1] === baseDomain) {
        isExternal = false;
      }
    }

    if (!isExternal) {
      return match;
    }

    // Wrap in an OpenGraph container; the hidden preview card is populated
    // client-side via the data-og-url attribute.
    const escapedUrl = escapeHtmlAttr(href);
    return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
  <a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
  <div class="opengraph-preview" data-og-loading="true" style="display: none;">
    <div class="opengraph-card">
      <div class="opengraph-image-container">
        <img class="opengraph-image" src="" alt="" style="display: none;" />
      </div>
      <div class="opengraph-content">
        <div class="opengraph-site"></div>
        <div class="opengraph-title"></div>
        <div class="opengraph-description"></div>
      </div>
    </div>
  </div>
</span>`;
  });

  // Restore code blocks. String.replace with a string pattern replaces only
  // the first occurrence — exactly one placeholder exists per index.
  codeBlockPlaceholders.forEach((codeBlock, index) => {
    processed = processed.replace(`__CODEBLOCK_${index}__`, codeBlock);
  });

  preBlockPlaceholders.forEach((preBlock, index) => {
    processed = processed.replace(`__PREBLOCK_${index}__`, preBlock);
  });

  return processed;
}
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 4: Process images
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/** |
|
||||||
* Process images: add max-width styling and data attributes |
|
||||||
*/ |
|
||||||
function processImages(html: string): string { |
|
||||||
const imageUrls: string[] = []; |
|
||||||
const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi; |
|
||||||
let match; |
|
||||||
|
|
||||||
while ((match = imageUrlRegex.exec(html)) !== null) { |
|
||||||
const url = match[1]; |
|
||||||
if (url && !imageUrls.includes(url)) { |
|
||||||
imageUrls.push(url); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
return html.replace(/<img([^>]+)>/gi, (imgTag, attributes) => { |
|
||||||
const srcMatch = attributes.match(/src=["']([^"']+)["']/i); |
|
||||||
if (!srcMatch) return imgTag; |
|
||||||
|
|
||||||
const src = srcMatch[1]; |
|
||||||
const currentIndex = imageUrls.indexOf(src); |
|
||||||
|
|
||||||
let updatedAttributes = attributes; |
|
||||||
|
|
||||||
if (updatedAttributes.match(/class=["']/i)) { |
|
||||||
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => { |
|
||||||
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim(); |
|
||||||
const newClasses = cleanedClasses
|
|
||||||
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in` |
|
||||||
: 'max-w-[400px] object-contain cursor-zoom-in'; |
|
||||||
return `class="${newClasses}"`; |
|
||||||
}); |
|
||||||
} else { |
|
||||||
updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`; |
|
||||||
} |
|
||||||
|
|
||||||
updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${escapeHtmlAttr(src)}"`; |
|
||||||
|
|
||||||
return `<img${updatedAttributes}>`; |
|
||||||
}); |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 6: Clean up leftover markdown
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/** |
|
||||||
* Clean up leftover markdown syntax |
|
||||||
*/ |
|
||||||
function cleanupMarkdown(html: string): string { |
|
||||||
let cleaned = html; |
|
||||||
|
|
||||||
// Clean up markdown image syntax
|
|
||||||
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => { |
|
||||||
const altText = alt || ''; |
|
||||||
const escapedUrl = escapeHtmlAttr(url); |
|
||||||
return `<img src="${escapedUrl}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`; |
|
||||||
}); |
|
||||||
|
|
||||||
// Clean up markdown link syntax (skip if already HTML)
|
|
||||||
cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => { |
|
||||||
// Skip if already processed
|
|
||||||
if (cleaned.includes(`href="${url}"`) || cleaned.includes(`href='${url}'`)) { |
|
||||||
return _match; |
|
||||||
} |
|
||||||
|
|
||||||
if (text.includes('<') || text.includes('>') || text.includes('&')) { |
|
||||||
return _match; |
|
||||||
} |
|
||||||
|
|
||||||
const escapedUrl = escapeHtmlAttr(url); |
|
||||||
const escapedText = escapeHtml(text); |
|
||||||
|
|
||||||
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`; |
|
||||||
}); |
|
||||||
|
|
||||||
return cleaned; |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 7: Add styling classes
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/** |
|
||||||
* Add proper CSS classes for styling |
|
||||||
*/ |
|
||||||
function addStylingClasses(html: string): string { |
|
||||||
let styled = html; |
|
||||||
|
|
||||||
styled = styled.replace(/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>'); |
|
||||||
styled = styled.replace(/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>'); |
|
||||||
styled = styled.replace(/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>'); |
|
||||||
styled = styled.replace(/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">'); |
|
||||||
styled = styled.replace(/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">'); |
|
||||||
|
|
||||||
return styled; |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// STEP 8: Hide raw ToC text
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/** |
|
||||||
* Hide raw AsciiDoc ToC text |
|
||||||
*/ |
|
||||||
function hideRawTocText(html: string): string { |
|
||||||
let cleaned = html; |
|
||||||
|
|
||||||
cleaned = cleaned.replace(/<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi, ''); |
|
||||||
|
|
||||||
return cleaned; |
|
||||||
} |
|
||||||
|
|
||||||
// ============================================
|
|
||||||
// Utility functions
|
|
||||||
// ============================================
|
|
||||||
|
|
||||||
/** |
|
||||||
* Escape HTML content |
|
||||||
*/ |
|
||||||
function escapeHtml(text: string): string { |
|
||||||
return text |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Escape HTML attribute value |
|
||||||
*/ |
|
||||||
function escapeHtmlAttr(text: string): string { |
|
||||||
return text |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, '''); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Unescape HTML entities |
|
||||||
*/ |
|
||||||
function unescapeHtml(text: string): string { |
|
||||||
return text |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
} |
|
||||||
@ -1,239 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.extractTOC = extractTOC; |
|
||||||
exports.sanitizeHTML = sanitizeHTML; |
|
||||||
exports.processLinks = processLinks; |
|
||||||
/**
 * Extracts the table of contents from AsciiDoc HTML output.
 * Returns the TOC HTML and the content HTML without the TOC.
 *
 * @param {string} html - AsciiDoctor HTML output; may be a full document
 *   (<html>/<head>/<body>) or just a fragment
 * @returns {{toc: string, contentWithoutTOC: string}} the TOC container's
 *   inner HTML ('' when no TOC is found) and the remaining content
 */
function extractTOC(html) {
    // AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
    let tocContent = '';
    let contentWithoutTOC = html;
    // Find the start of the TOC div - try multiple patterns, most specific first
    const tocStartPatterns = [
        /<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>/i,
        /<div\s+id=["']toc["'][^>]*>/i,
        /<div\s+class=["']toc["'][^>]*>/i,
        /<nav\s+id=["']toc["'][^>]*>/i,
    ];
    let tocStartIdx = -1;
    let tocStartTag = '';
    for (const pattern of tocStartPatterns) {
        const match = html.match(pattern);
        if (match && match.index !== undefined) {
            tocStartIdx = match.index;
            tocStartTag = match[0];
            break;
        }
    }
    if (tocStartIdx === -1) {
        // No TOC found
        return { toc: '', contentWithoutTOC: html };
    }
    // Find the matching closing tag by counting nested div/nav tags.
    // NOTE(review): the '<div'/'<nav' prefix tests also match longer tag names
    // (e.g. <divider>); harmless for AsciiDoctor output, but worth confirming
    // if arbitrary HTML is ever fed in.
    const searchStart = tocStartIdx + tocStartTag.length;
    let depth = 1;
    let i = searchStart;
    while (i < html.length && depth > 0) {
        if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<div') {
            // Check if it's a closing-style tag ('<div/...')
            if (i + 5 < html.length && html[i + 4] === '/') {
                depth--;
                const closeIdx = html.indexOf('>', i);
                if (closeIdx === -1)
                    break;
                i = closeIdx + 1;
            }
            else {
                // Opening tag - find the end (handle attributes and self-closing)
                const closeIdx = html.indexOf('>', i);
                if (closeIdx === -1)
                    break;
                // Self-closing tags ('.../>') do not increase nesting depth
                const tagContent = html.substring(i, closeIdx);
                if (!tagContent.endsWith('/')) {
                    depth++;
                }
                i = closeIdx + 1;
            }
        }
        else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</div') {
            depth--;
            const closeIdx = html.indexOf('>', i);
            if (closeIdx === -1)
                break;
            i = closeIdx + 1;
        }
        else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</nav') {
            depth--;
            const closeIdx = html.indexOf('>', i);
            if (closeIdx === -1)
                break;
            i = closeIdx + 1;
        }
        else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<nav') {
            // Handle opening nav tags (self-closing ones do not nest)
            const closeIdx = html.indexOf('>', i);
            if (closeIdx === -1)
                break;
            const tagContent = html.substring(i, closeIdx);
            if (!tagContent.endsWith('/')) {
                depth++;
            }
            i = closeIdx + 1;
        }
        else {
            i++;
        }
    }
    if (depth === 0) {
        // Found the matching closing tag
        const tocEndIdx = i;
        // Extract the TOC content (inner HTML)
        const tocFullHTML = html.substring(tocStartIdx, tocEndIdx);
        // Extract just the inner content (without the outer container tags)
        let innerStart = tocStartTag.length;
        let innerEnd = tocFullHTML.length;
        if (tocFullHTML.endsWith('</div>')) {
            innerEnd -= 6;
        }
        else if (tocFullHTML.endsWith('</nav>')) {
            // FIX: '</nav>' is 6 characters, not 7 — the previous '-= 7'
            // chopped the final '>' off the TOC inner HTML.
            innerEnd -= 6;
        }
        tocContent = tocFullHTML.substring(innerStart, innerEnd).trim();
        // Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
        tocContent = tocContent.replace(/<div\s+id=["']toctitle["'][^>]*>.*?<\/div>\s*/gis, '');
        tocContent = tocContent.trim();
        // Remove the TOC from the content
        contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx);
    }
    // If AsciiDoctor returned a full document, keep only the <body> contents.
    const isFullDocument = /^\s*<!DOCTYPE|^\s*<html/i.test(contentWithoutTOC);
    if (isFullDocument) {
        // Find the opening <body> tag
        const bodyStartMatch = contentWithoutTOC.match(/<body[^>]*>/i);
        if (bodyStartMatch && bodyStartMatch.index !== undefined) {
            const bodyStart = bodyStartMatch.index + bodyStartMatch[0].length;
            // lastIndexOf is more reliable than a greedy regex for nested content
            const bodyEndMatch = contentWithoutTOC.lastIndexOf('</body>');
            if (bodyEndMatch !== -1 && bodyEndMatch > bodyStart) {
                contentWithoutTOC = contentWithoutTOC.substring(bodyStart, bodyEndMatch).trim();
            }
        }
    }
    // Remove any remaining document structure tags that might have slipped through
    contentWithoutTOC = contentWithoutTOC
        .replace(/<html[^>]*>/gi, '')
        .replace(/<\/html>/gi, '')
        .replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '')
        .replace(/<body[^>]*>/gi, '')
        .replace(/<\/body>/gi, '');
    // Clean up any extra whitespace
    contentWithoutTOC = contentWithoutTOC.trim();
    return { toc: tocContent, contentWithoutTOC };
}
|
||||||
/**
 * Performs basic HTML sanitization to prevent XSS.
 *
 * Strips <script> elements (with contents), inline on* event handlers,
 * 'javascript:' protocol text and 'data:text/html' payloads.
 *
 * NOTE: regex-based sanitization is best-effort only, not a complete
 * XSS defense.
 */
function sanitizeHTML(html) {
    return html
        .replace(/<script[^>]*>.*?<\/script>/gis, '')
        .replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, '')
        .replace(/javascript:/gi, '')
        .replace(/data:\s*text\/html/gi, '');
}
|
||||||
/** |
|
||||||
* Processes HTML links to add target="_blank" to external links |
|
||||||
* This function is available for use but not currently called automatically. |
|
||||||
* It can be used in post-processing if needed. |
|
||||||
*/ |
|
||||||
function processLinks(html, linkBaseURL) { |
|
||||||
// Extract domain from linkBaseURL for comparison
|
|
||||||
let linkBaseDomain = ''; |
|
||||||
if (linkBaseURL) { |
|
||||||
try { |
|
||||||
// Use URL constructor if available (Node.js 10+)
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const URLConstructor = globalThis.URL; |
|
||||||
if (URLConstructor) { |
|
||||||
const url = new URLConstructor(linkBaseURL); |
|
||||||
linkBaseDomain = url.hostname; |
|
||||||
} |
|
||||||
else { |
|
||||||
throw new Error('URL not available'); |
|
||||||
} |
|
||||||
} |
|
||||||
catch { |
|
||||||
// Fallback to simple string parsing if URL constructor fails
|
|
||||||
const url = linkBaseURL.replace(/^https?:\/\//, ''); |
|
||||||
const parts = url.split('/'); |
|
||||||
if (parts.length > 0) { |
|
||||||
linkBaseDomain = parts[0]; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
// Regex to match <a> tags with href attributes
|
|
||||||
const linkRegex = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g; |
|
||||||
return html.replace(linkRegex, (match, before, href, after) => { |
|
||||||
// Check if it's an external link (starts with http:// or https://)
|
|
||||||
const isExternal = href.startsWith('http://') || href.startsWith('https://'); |
|
||||||
if (isExternal) { |
|
||||||
// Check if it's pointing to our own domain
|
|
||||||
if (linkBaseDomain) { |
|
||||||
try { |
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
||||||
const URLConstructor = globalThis.URL; |
|
||||||
if (URLConstructor) { |
|
||||||
const hrefUrl = new URLConstructor(href); |
|
||||||
if (hrefUrl.hostname === linkBaseDomain) { |
|
||||||
// Same domain - open in same tab (remove any existing target attribute)
|
|
||||||
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); |
|
||||||
} |
|
||||||
} |
|
||||||
else { |
|
||||||
throw new Error('URL not available'); |
|
||||||
} |
|
||||||
} |
|
||||||
catch { |
|
||||||
// If URL parsing fails, use simple string check
|
|
||||||
if (href.includes(linkBaseDomain)) { |
|
||||||
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
// External link - add target="_blank" and rel="noopener noreferrer" if not already present
|
|
||||||
if (!match.includes('target=')) { |
|
||||||
if (!match.includes('rel=')) { |
|
||||||
return match.replace('>', ' target="_blank" rel="noopener noreferrer">'); |
|
||||||
} |
|
||||||
else { |
|
||||||
// Update existing rel attribute to include noopener if not present
|
|
||||||
const updatedMatch = match.replace(/rel\s*=\s*["']([^"']*)["']/gi, (relMatch, relValue) => { |
|
||||||
if (!relValue.includes('noopener')) { |
|
||||||
return `rel="${relValue} noopener noreferrer"`; |
|
||||||
} |
|
||||||
return relMatch; |
|
||||||
}); |
|
||||||
return updatedMatch.replace('>', ' target="_blank">'); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
else { |
|
||||||
// Local/relative link - ensure it opens in same tab (remove target if present)
|
|
||||||
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, ''); |
|
||||||
} |
|
||||||
return match; |
|
||||||
}); |
|
||||||
} |
|
||||||
@ -1,164 +0,0 @@ |
|||||||
/** |
|
||||||
* HTML utility functions for processing AsciiDoctor output |
|
||||||
*
|
|
||||||
* Functions: |
|
||||||
* - extractTOC: Extract table of contents from HTML |
|
||||||
* - sanitizeHTML: Sanitize HTML to prevent XSS attacks |
|
||||||
* - processLinks: Add target="_blank" to external links |
|
||||||
*/ |
|
||||||
|
|
||||||
export interface TOCResult { |
|
||||||
toc: string; |
|
||||||
contentWithoutTOC: string; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Extract table of contents from AsciiDoctor HTML output |
|
||||||
* AsciiDoctor generates a <div id="toc"> with class="toc" containing the TOC |
|
||||||
*/ |
|
||||||
export function extractTOC(html: string): TOCResult { |
|
||||||
// Match the TOC div - AsciiDoctor generates it with id="toc" and class="toc"
|
|
||||||
const tocMatch = html.match(/<div[^>]*id=["']toc["'][^>]*>([\s\S]*?)<\/div>/i); |
|
||||||
|
|
||||||
if (tocMatch) { |
|
||||||
const toc = tocMatch[0]; // Full TOC div
|
|
||||||
const contentWithoutTOC = html.replace(toc, '').trim(); |
|
||||||
return { toc, contentWithoutTOC }; |
|
||||||
} |
|
||||||
|
|
||||||
// Fallback: try to match by class="toc"
|
|
||||||
const tocClassMatch = html.match(/<div[^>]*class=["'][^"']*toc[^"']*["'][^>]*>([\s\S]*?)<\/div>/i); |
|
||||||
|
|
||||||
if (tocClassMatch) { |
|
||||||
const toc = tocClassMatch[0]; |
|
||||||
const contentWithoutTOC = html.replace(toc, '').trim(); |
|
||||||
return { toc, contentWithoutTOC }; |
|
||||||
} |
|
||||||
|
|
||||||
// No TOC found
|
|
||||||
return { |
|
||||||
toc: '', |
|
||||||
contentWithoutTOC: html, |
|
||||||
}; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Sanitize HTML to prevent XSS attacks |
|
||||||
* Removes dangerous scripts and event handlers while preserving safe HTML |
|
||||||
*
|
|
||||||
* This is a basic sanitizer. For production use, consider using a library like DOMPurify |
|
||||||
*/ |
|
||||||
export function sanitizeHTML(html: string): string { |
|
||||||
let sanitized = html; |
|
||||||
|
|
||||||
// Remove script tags and their content
|
|
||||||
sanitized = sanitized.replace(/<script[\s\S]*?<\/script>/gi, ''); |
|
||||||
|
|
||||||
// Remove event handlers from attributes (onclick, onerror, etc.)
|
|
||||||
sanitized = sanitized.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, ''); |
|
||||||
sanitized = sanitized.replace(/\s*on\w+\s*=\s*[^\s>]*/gi, ''); |
|
||||||
|
|
||||||
// Remove javascript: protocol in href and src attributes
|
|
||||||
sanitized = sanitized.replace(/href\s*=\s*["']javascript:[^"']*["']/gi, 'href="#"'); |
|
||||||
sanitized = sanitized.replace(/src\s*=\s*["']javascript:[^"']*["']/gi, 'src=""'); |
|
||||||
|
|
||||||
// Remove data: URLs that might contain scripts (allow images)
|
|
||||||
// This is more permissive - you might want to be stricter
|
|
||||||
sanitized = sanitized.replace(/src\s*=\s*["']data:text\/html[^"']*["']/gi, 'src=""'); |
|
||||||
|
|
||||||
// Remove iframe with dangerous sources
|
|
||||||
sanitized = sanitized.replace(/<iframe[^>]*src\s*=\s*["']javascript:[^"']*["'][^>]*>[\s\S]*?<\/iframe>/gi, ''); |
|
||||||
|
|
||||||
// Remove object and embed tags (often used for XSS)
|
|
||||||
sanitized = sanitized.replace(/<object[\s\S]*?<\/object>/gi, ''); |
|
||||||
sanitized = sanitized.replace(/<embed[\s\S]*?>/gi, ''); |
|
||||||
|
|
||||||
// Remove style tags with potentially dangerous content
|
|
||||||
// We keep style attributes but remove <style> tags
|
|
||||||
sanitized = sanitized.replace(/<style[\s\S]*?<\/style>/gi, ''); |
|
||||||
|
|
||||||
// Remove link tags with javascript: or data: URLs
|
|
||||||
sanitized = sanitized.replace(/<link[^>]*href\s*=\s*["'](javascript|data):[^"']*["'][^>]*>/gi, ''); |
|
||||||
|
|
||||||
// Remove meta tags with http-equiv="refresh" (can be used for redirects)
|
|
||||||
sanitized = sanitized.replace(/<meta[^>]*http-equiv\s*=\s*["']refresh["'][^>]*>/gi, ''); |
|
||||||
|
|
||||||
return sanitized; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Process links to add target="_blank" and rel="noreferrer noopener" to external links |
|
||||||
*
|
|
||||||
* External links are links that don't match the base domain. |
|
||||||
* Internal links (same domain) are left unchanged. |
|
||||||
*/ |
|
||||||
export function processLinks(html: string, linkBaseURL: string): string { |
|
||||||
if (!linkBaseURL) { |
|
||||||
return html; |
|
||||||
} |
|
||||||
|
|
||||||
// Extract base domain from linkBaseURL
|
|
||||||
let baseDomain: string | null = null; |
|
||||||
try { |
|
||||||
const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/); |
|
||||||
if (urlMatch) { |
|
||||||
baseDomain = urlMatch[1]; |
|
||||||
} |
|
||||||
} catch { |
|
||||||
// If parsing fails, don't process links
|
|
||||||
return html; |
|
||||||
} |
|
||||||
|
|
||||||
if (!baseDomain) { |
|
||||||
return html; |
|
||||||
} |
|
||||||
|
|
||||||
// Process anchor tags with href attributes
|
|
||||||
return html.replace(/<a\s+([^>]*\s+)?href\s*=\s*["']([^"']+)["']([^>]*?)>/gi, (match, before, href, after) => { |
|
||||||
// Skip if already has target attribute
|
|
||||||
if (match.includes('target=')) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
|
|
||||||
// Skip if it's not an http/https link
|
|
||||||
if (!/^https?:\/\//i.test(href)) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
|
|
||||||
// Skip if it's already a special link type (nostr, wikilink, etc.)
|
|
||||||
if (match.includes('class="nostr-link"') || |
|
||||||
match.includes('class="wikilink"') || |
|
||||||
match.includes('class="hashtag-link"')) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
|
|
||||||
// Check if it's an external link
|
|
||||||
let isExternal = true; |
|
||||||
try { |
|
||||||
const hrefMatch = href.match(/^https?:\/\/([^\/]+)/); |
|
||||||
if (hrefMatch && hrefMatch[1] === baseDomain) { |
|
||||||
isExternal = false; |
|
||||||
} |
|
||||||
} catch { |
|
||||||
// If parsing fails, assume external
|
|
||||||
} |
|
||||||
|
|
||||||
// Only add target="_blank" to external links
|
|
||||||
if (isExternal) { |
|
||||||
// Check if there's already a rel attribute
|
|
||||||
if (match.includes('rel=')) { |
|
||||||
// Add to existing rel attribute if it doesn't already have noreferrer noopener
|
|
||||||
if (!match.includes('noreferrer') && !match.includes('noopener')) { |
|
||||||
return match.replace(/rel\s*=\s*["']([^"']+)["']/i, 'rel="$1 noreferrer noopener"'); |
|
||||||
} |
|
||||||
// Add target="_blank" before the closing >
|
|
||||||
return match.replace(/>$/, ' target="_blank">'); |
|
||||||
} else { |
|
||||||
// Add both target and rel
|
|
||||||
return match.replace(/>$/, ' target="_blank" rel="noreferrer noopener">'); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
return match; |
|
||||||
}); |
|
||||||
} |
|
||||||
@ -0,0 +1,93 @@ |
|||||||
|
import { marked } from 'marked'; |
||||||
|
// @ts-ignore - marked is ESM but we need it to work in Jest
|
||||||
|
import { ParserOptions } from '../types'; |
||||||
|
import * as emoji from 'node-emoji'; |
||||||
|
|
||||||
|
export interface MarkdownResult { |
||||||
|
html: string; |
||||||
|
frontmatter?: Record<string, any>; |
||||||
|
hasLaTeX: boolean; |
||||||
|
hasMusicalNotation: boolean; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Extract YAML frontmatter from markdown content |
||||||
|
*/ |
||||||
|
function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } { |
||||||
|
const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n/; |
||||||
|
const match = content.match(frontmatterRegex); |
||||||
|
|
||||||
|
if (!match) { |
||||||
|
return { content }; |
||||||
|
} |
||||||
|
|
||||||
|
try { |
||||||
|
// Simple YAML parser for basic key-value pairs
|
||||||
|
const yamlContent = match[1]; |
||||||
|
const frontmatter: Record<string, any> = {}; |
||||||
|
const lines = yamlContent.split('\n'); |
||||||
|
|
||||||
|
for (const line of lines) { |
||||||
|
const trimmed = line.trim(); |
||||||
|
if (!trimmed || trimmed.startsWith('#')) continue; |
||||||
|
|
||||||
|
const colonIndex = trimmed.indexOf(':'); |
||||||
|
if (colonIndex === -1) continue; |
||||||
|
|
||||||
|
const key = trimmed.substring(0, colonIndex).trim(); |
||||||
|
let value = trimmed.substring(colonIndex + 1).trim(); |
||||||
|
|
||||||
|
// Remove quotes if present
|
||||||
|
if ((value.startsWith('"') && value.endsWith('"')) ||
|
||||||
|
(value.startsWith("'") && value.endsWith("'"))) { |
||||||
|
value = value.slice(1, -1); |
||||||
|
} |
||||||
|
|
||||||
|
// Handle arrays (simple case)
|
||||||
|
if (value.startsWith('[') && value.endsWith(']')) { |
||||||
|
const arrayContent = value.slice(1, -1); |
||||||
|
frontmatter[key] = arrayContent.split(',').map(v => v.trim().replace(/^["']|["']$/g, '')); |
||||||
|
} else { |
||||||
|
frontmatter[key] = value; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return { |
||||||
|
frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, |
||||||
|
content: content.substring(match[0].length) |
||||||
|
}; |
||||||
|
} catch (e) { |
||||||
|
return { content }; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Process Markdown content to HTML (minimal markdown support) |
||||||
|
*/ |
||||||
|
export function processMarkdown(content: string, options: ParserOptions): MarkdownResult { |
||||||
|
// Extract frontmatter
|
||||||
|
const { frontmatter, content: contentWithoutFrontmatter } = extractFrontmatter(content); |
||||||
|
|
||||||
|
// Detect LaTeX and musical notation
|
||||||
|
const hasLaTeX = /```latex|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content); |
||||||
|
const hasMusicalNotation = /```abc|```music/i.test(content); |
||||||
|
|
||||||
|
// Configure marked for minimal markdown
|
||||||
|
marked.setOptions({ |
||||||
|
gfm: true, |
||||||
|
breaks: false |
||||||
|
}); |
||||||
|
|
||||||
|
// Process emoji shortcodes before markdown processing
|
||||||
|
let processedContent = emoji.emojify(contentWithoutFrontmatter); |
||||||
|
|
||||||
|
// Convert markdown to HTML
|
||||||
|
const html = marked.parse(processedContent) as string; |
||||||
|
|
||||||
|
return { |
||||||
|
html, |
||||||
|
frontmatter, |
||||||
|
hasLaTeX, |
||||||
|
hasMusicalNotation |
||||||
|
}; |
||||||
|
} |
||||||
@ -1,143 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.processMusicalNotation = processMusicalNotation; |
|
||||||
/** |
|
||||||
* Processes musical notation in HTML content |
|
||||||
* Wraps musical notation in appropriate HTML for rendering |
|
||||||
*/ |
|
||||||
function processMusicalNotation(html) { |
|
||||||
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
|
|
||||||
// These were created by a buggy regex that matched the entire HTML document
|
|
||||||
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => { |
|
||||||
// This is corrupted - extract just the ABC notation from the beginning
|
|
||||||
let decoded = dataAbc |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
// Find the actual ABC notation (starts with X:)
|
|
||||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|<|<\/|sect|div|pre|code)/); |
|
||||||
if (abcMatch) { |
|
||||||
const cleanAbc = abcMatch[1].trim(); |
|
||||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`; |
|
||||||
} |
|
||||||
// If we can't extract clean ABC, remove the div entirely
|
|
||||||
return content; |
|
||||||
}); |
|
||||||
// Clean up code blocks that contain corrupted abc-notation divs inside them
|
|
||||||
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
|
|
||||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
|
||||||
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
|
|
||||||
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i); |
|
||||||
if (longDataAbcMatch) { |
|
||||||
// Extract just the ABC notation from the beginning of the corrupted data-abc value
|
|
||||||
let decoded = longDataAbcMatch[1] |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
// The ABC notation ends where the HTML document starts (</code> or </pre>)
|
|
||||||
// Extract everything from X: up to (but not including) </code> or </pre>
|
|
||||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=<\/code>|<\/pre>)/); |
|
||||||
if (abcMatch) { |
|
||||||
let cleanAbc = abcMatch[1].trim(); |
|
||||||
// Remove any trailing HTML entities
|
|
||||||
cleanAbc = cleanAbc.replace(/<.*$/, '').trim(); |
|
||||||
// Validate it's reasonable ABC notation
|
|
||||||
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) { |
|
||||||
// Return clean code block - the processing step will wrap it in abc-notation div
|
|
||||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`; |
|
||||||
} |
|
||||||
} |
|
||||||
// If extraction fails, just remove the corrupted div and return empty code block
|
|
||||||
// This prevents the corrupted data from being rendered
|
|
||||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`; |
|
||||||
} |
|
||||||
return match; |
|
||||||
}); |
|
||||||
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
|
|
||||||
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
|
|
||||||
// We do NOT auto-detect ABC notation - it must be explicitly marked
|
|
||||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
|
||||||
// Skip if already processed or corrupted
|
|
||||||
if (codeContent.includes('abc-notation') || |
|
||||||
codeContent.includes('class="abc-notation"') || |
|
||||||
codeContent.includes('<div') || |
|
||||||
codeContent.includes('</div>') || |
|
||||||
codeContent.length > 5000) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
// Extract ABC content from the code block
|
|
||||||
let abcContent = codeContent |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'") |
|
||||||
.replace(/'/g, "'") |
|
||||||
.replace(///g, '/'); |
|
||||||
// Remove any HTML tags
|
|
||||||
abcContent = abcContent.replace(/<[^>]+>/g, '').trim(); |
|
||||||
// Only process if it looks like valid ABC notation (starts with X:)
|
|
||||||
// Since this is explicitly marked as ABC, we trust it's ABC notation
|
|
||||||
if (abcContent.match(/^X:\s*\d+/m) && |
|
||||||
abcContent.length < 3000 && |
|
||||||
!abcContent.includes('</') && |
|
||||||
!abcContent.includes('<div') && |
|
||||||
!abcContent.includes('sect') && |
|
||||||
!abcContent.includes('class=')) { |
|
||||||
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
|
|
||||||
const lines = abcContent.split('\n'); |
|
||||||
const abcLines = []; |
|
||||||
for (const line of lines) { |
|
||||||
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) { |
|
||||||
break; |
|
||||||
} |
|
||||||
if (line.length > 200) { |
|
||||||
break; |
|
||||||
} |
|
||||||
abcLines.push(line); |
|
||||||
if (abcLines.join('\n').length > 2000) { |
|
||||||
break; |
|
||||||
} |
|
||||||
} |
|
||||||
const cleanAbc = abcLines.join('\n').trim(); |
|
||||||
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) { |
|
||||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`; |
|
||||||
} |
|
||||||
} |
|
||||||
return match; |
|
||||||
}); |
|
||||||
// Process LilyPond notation blocks
|
|
||||||
const lilypondPattern = /(\\relative[^}]+})/gs; |
|
||||||
html = html.replace(lilypondPattern, (match) => { |
|
||||||
const lilypondContent = match.trim(); |
|
||||||
return `<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`; |
|
||||||
}); |
|
||||||
// Process inline chord notation: [C], [Am], [F#m7], etc.
|
|
||||||
const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g; |
|
||||||
html = html.replace(chordPattern, (match, chord) => { |
|
||||||
return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`; |
|
||||||
}); |
|
||||||
// Process MusicXML-like notation
|
|
||||||
const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs; |
|
||||||
html = html.replace(musicxmlPattern, (match) => { |
|
||||||
const musicxmlContent = match.trim(); |
|
||||||
return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`; |
|
||||||
}); |
|
||||||
return html; |
|
||||||
} |
|
||||||
/** |
|
||||||
* Escapes a string for use in HTML attributes |
|
||||||
*/ |
|
||||||
function escapeForAttr(text) { |
|
||||||
return text |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, ''') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/\n/g, ' ') |
|
||||||
.replace(/\r/g, ''); |
|
||||||
} |
|
||||||
@ -1,152 +0,0 @@ |
|||||||
/** |
|
||||||
* Processes musical notation in HTML content |
|
||||||
* Wraps musical notation in appropriate HTML for rendering |
|
||||||
*/ |
|
||||||
export function processMusicalNotation(html: string): string { |
|
||||||
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
|
|
||||||
// These were created by a buggy regex that matched the entire HTML document
|
|
||||||
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => { |
|
||||||
// This is corrupted - extract just the ABC notation from the beginning
|
|
||||||
let decoded = dataAbc |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
|
|
||||||
// Find the actual ABC notation (starts with X:)
|
|
||||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|<|<\/|sect|div|pre|code)/); |
|
||||||
if (abcMatch) { |
|
||||||
const cleanAbc = abcMatch[1].trim(); |
|
||||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`; |
|
||||||
} |
|
||||||
// If we can't extract clean ABC, remove the div entirely
|
|
||||||
return content; |
|
||||||
}); |
|
||||||
|
|
||||||
// Clean up code blocks that contain corrupted abc-notation divs inside them
|
|
||||||
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
|
|
||||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
|
||||||
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
|
|
||||||
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i); |
|
||||||
if (longDataAbcMatch) { |
|
||||||
// Extract just the ABC notation from the beginning of the corrupted data-abc value
|
|
||||||
let decoded = longDataAbcMatch[1] |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'"); |
|
||||||
|
|
||||||
// The ABC notation ends where the HTML document starts (</code> or </pre>)
|
|
||||||
// Extract everything from X: up to (but not including) </code> or </pre>
|
|
||||||
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=<\/code>|<\/pre>)/); |
|
||||||
if (abcMatch) { |
|
||||||
let cleanAbc = abcMatch[1].trim(); |
|
||||||
// Remove any trailing HTML entities
|
|
||||||
cleanAbc = cleanAbc.replace(/<.*$/, '').trim(); |
|
||||||
// Validate it's reasonable ABC notation
|
|
||||||
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) { |
|
||||||
// Return clean code block - the processing step will wrap it in abc-notation div
|
|
||||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`; |
|
||||||
} |
|
||||||
} |
|
||||||
// If extraction fails, just remove the corrupted div and return empty code block
|
|
||||||
// This prevents the corrupted data from being rendered
|
|
||||||
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`; |
|
||||||
} |
|
||||||
return match; |
|
||||||
}); |
|
||||||
|
|
||||||
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
|
|
||||||
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
|
|
||||||
// We do NOT auto-detect ABC notation - it must be explicitly marked
|
|
||||||
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => { |
|
||||||
// Skip if already processed or corrupted
|
|
||||||
if (codeContent.includes('abc-notation') ||
|
|
||||||
codeContent.includes('class="abc-notation"') || |
|
||||||
codeContent.includes('<div') || |
|
||||||
codeContent.includes('</div>') || |
|
||||||
codeContent.length > 5000) { |
|
||||||
return match; |
|
||||||
} |
|
||||||
|
|
||||||
// Extract ABC content from the code block
|
|
||||||
let abcContent = codeContent |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/&/g, '&') |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, "'") |
|
||||||
.replace(/'/g, "'") |
|
||||||
.replace(///g, '/'); |
|
||||||
|
|
||||||
// Remove any HTML tags
|
|
||||||
abcContent = abcContent.replace(/<[^>]+>/g, '').trim(); |
|
||||||
|
|
||||||
// Only process if it looks like valid ABC notation (starts with X:)
|
|
||||||
// Since this is explicitly marked as ABC, we trust it's ABC notation
|
|
||||||
if (abcContent.match(/^X:\s*\d+/m) &&
|
|
||||||
abcContent.length < 3000 && |
|
||||||
!abcContent.includes('</') && |
|
||||||
!abcContent.includes('<div') && |
|
||||||
!abcContent.includes('sect') && |
|
||||||
!abcContent.includes('class=')) { |
|
||||||
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
|
|
||||||
const lines = abcContent.split('\n'); |
|
||||||
const abcLines: string[] = []; |
|
||||||
for (const line of lines) { |
|
||||||
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) { |
|
||||||
break; |
|
||||||
} |
|
||||||
if (line.length > 200) { |
|
||||||
break; |
|
||||||
} |
|
||||||
abcLines.push(line); |
|
||||||
if (abcLines.join('\n').length > 2000) { |
|
||||||
break; |
|
||||||
} |
|
||||||
} |
|
||||||
const cleanAbc = abcLines.join('\n').trim(); |
|
||||||
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) { |
|
||||||
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`; |
|
||||||
} |
|
||||||
} |
|
||||||
return match; |
|
||||||
}); |
|
||||||
|
|
||||||
// Process LilyPond notation blocks
|
|
||||||
const lilypondPattern = /(\\relative[^}]+})/gs; |
|
||||||
html = html.replace(lilypondPattern, (match) => { |
|
||||||
const lilypondContent = match.trim(); |
|
||||||
return `<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`; |
|
||||||
}); |
|
||||||
|
|
||||||
// Process inline chord notation: [C], [Am], [F#m7], etc.
|
|
||||||
const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g; |
|
||||||
html = html.replace(chordPattern, (match, chord) => { |
|
||||||
return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`; |
|
||||||
}); |
|
||||||
|
|
||||||
// Process MusicXML-like notation
|
|
||||||
const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs; |
|
||||||
html = html.replace(musicxmlPattern, (match) => { |
|
||||||
const musicxmlContent = match.trim(); |
|
||||||
return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`; |
|
||||||
}); |
|
||||||
|
|
||||||
return html; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Escapes a string for use in HTML attributes |
|
||||||
*/ |
|
||||||
function escapeForAttr(text: string): string { |
|
||||||
return text |
|
||||||
.replace(/"/g, '"') |
|
||||||
.replace(/'/g, ''') |
|
||||||
.replace(/</g, '<') |
|
||||||
.replace(/>/g, '>') |
|
||||||
.replace(/\n/g, ' ') |
|
||||||
.replace(/\r/g, ''); |
|
||||||
} |
|
||||||
@ -1,14 +0,0 @@ |
|||||||
"use strict"; |
|
||||||
Object.defineProperty(exports, "__esModule", { value: true }); |
|
||||||
exports.ContentFormat = void 0; |
|
||||||
/** |
|
||||||
* Detected content format |
|
||||||
*/ |
|
||||||
var ContentFormat; |
|
||||||
(function (ContentFormat) { |
|
||||||
ContentFormat["Unknown"] = "unknown"; |
|
||||||
ContentFormat["AsciiDoc"] = "asciidoc"; |
|
||||||
ContentFormat["Markdown"] = "markdown"; |
|
||||||
ContentFormat["Wikipedia"] = "wikipedia"; |
|
||||||
ContentFormat["Plain"] = "plain"; |
|
||||||
})(ContentFormat || (exports.ContentFormat = ContentFormat = {})); |
|
||||||
@ -1,20 +0,0 @@ |
|||||||
/** |
|
||||||
* Type declarations for @asciidoctor/core |
|
||||||
* These are minimal types - the actual types should come from the package |
|
||||||
*/ |
|
||||||
declare module '@asciidoctor/core' { |
|
||||||
interface ConvertOptions { |
|
||||||
safe?: string; |
|
||||||
backend?: string; |
|
||||||
doctype?: string; |
|
||||||
attributes?: Record<string, any>; |
|
||||||
extension_registry?: any; |
|
||||||
} |
|
||||||
|
|
||||||
interface Asciidoctor { |
|
||||||
convert(content: string, options?: ConvertOptions): string | any; |
|
||||||
} |
|
||||||
|
|
||||||
function asciidoctor(): Asciidoctor; |
|
||||||
export default asciidoctor; |
|
||||||
} |
|
||||||
@ -1,732 +0,0 @@ |
|||||||
import { Parser } from '../parser'; |
|
||||||
import * as fs from 'fs'; |
|
||||||
import * as path from 'path'; |
|
||||||
import { ProcessResult } from '../types'; |
|
||||||
|
|
||||||
/** |
|
||||||
* Shared utilities for generating test reports |
|
||||||
*/ |
|
||||||
|
|
||||||
export interface TestData { |
|
||||||
original: string; |
|
||||||
result: ProcessResult; |
|
||||||
} |
|
||||||
|
|
||||||
export interface ReportData { |
|
||||||
markdown: TestData; |
|
||||||
asciidoc: TestData; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Generate HTML test report from parsed documents |
|
||||||
*/ |
|
||||||
export function generateHTMLReport(data: ReportData): string { |
|
||||||
const { markdown, asciidoc } = data; |
|
||||||
|
|
||||||
return `<!DOCTYPE html>
|
|
||||||
<html lang="en"> |
|
||||||
<head> |
|
||||||
<meta charset="UTF-8"> |
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
||||||
<title>GC Parser Test Report</title> |
|
||||||
<style> |
|
||||||
* { |
|
||||||
margin: 0; |
|
||||||
padding: 0; |
|
||||||
box-sizing: border-box; |
|
||||||
} |
|
||||||
|
|
||||||
body { |
|
||||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif; |
|
||||||
line-height: 1.6; |
|
||||||
color: #333; |
|
||||||
background: #f5f5f5; |
|
||||||
padding: 20px; |
|
||||||
} |
|
||||||
|
|
||||||
.container { |
|
||||||
max-width: 1400px; |
|
||||||
margin: 0 auto; |
|
||||||
} |
|
||||||
|
|
||||||
h1 { |
|
||||||
color: #2c3e50; |
|
||||||
margin-bottom: 10px; |
|
||||||
font-size: 2.5em; |
|
||||||
} |
|
||||||
|
|
||||||
.subtitle { |
|
||||||
color: #7f8c8d; |
|
||||||
margin-bottom: 30px; |
|
||||||
font-size: 1.1em; |
|
||||||
} |
|
||||||
|
|
||||||
.section { |
|
||||||
background: white; |
|
||||||
border-radius: 8px; |
|
||||||
padding: 30px; |
|
||||||
margin-bottom: 30px; |
|
||||||
box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
|
||||||
} |
|
||||||
|
|
||||||
.section h2 { |
|
||||||
color: #34495e; |
|
||||||
margin-bottom: 20px; |
|
||||||
padding-bottom: 10px; |
|
||||||
border-bottom: 2px solid #3498db; |
|
||||||
font-size: 1.8em; |
|
||||||
} |
|
||||||
|
|
||||||
.section h3 { |
|
||||||
color: #2c3e50; |
|
||||||
margin-top: 25px; |
|
||||||
margin-bottom: 15px; |
|
||||||
font-size: 1.3em; |
|
||||||
} |
|
||||||
|
|
||||||
.tabs { |
|
||||||
display: flex; |
|
||||||
gap: 10px; |
|
||||||
margin-bottom: 20px; |
|
||||||
border-bottom: 2px solid #e0e0e0; |
|
||||||
} |
|
||||||
|
|
||||||
.tab { |
|
||||||
padding: 12px 24px; |
|
||||||
background: #f8f9fa; |
|
||||||
border: none; |
|
||||||
border-top-left-radius: 6px; |
|
||||||
border-top-right-radius: 6px; |
|
||||||
cursor: pointer; |
|
||||||
font-size: 1em; |
|
||||||
font-weight: 500; |
|
||||||
color: #555; |
|
||||||
transition: all 0.2s; |
|
||||||
} |
|
||||||
|
|
||||||
.tab:hover { |
|
||||||
background: #e9ecef; |
|
||||||
} |
|
||||||
|
|
||||||
.tab.active { |
|
||||||
background: #3498db; |
|
||||||
color: white; |
|
||||||
} |
|
||||||
|
|
||||||
.tab-content { |
|
||||||
display: none; |
|
||||||
} |
|
||||||
|
|
||||||
.tab-content.active { |
|
||||||
display: block; |
|
||||||
} |
|
||||||
|
|
||||||
.metadata-grid { |
|
||||||
display: grid; |
|
||||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); |
|
||||||
gap: 15px; |
|
||||||
margin-top: 15px; |
|
||||||
} |
|
||||||
|
|
||||||
.metadata-item { |
|
||||||
background: #f8f9fa; |
|
||||||
padding: 12px; |
|
||||||
border-radius: 4px; |
|
||||||
border-left: 3px solid #3498db; |
|
||||||
} |
|
||||||
|
|
||||||
.metadata-item strong { |
|
||||||
color: #2c3e50; |
|
||||||
display: block; |
|
||||||
margin-bottom: 5px; |
|
||||||
} |
|
||||||
|
|
||||||
.metadata-item code { |
|
||||||
background: #e9ecef; |
|
||||||
padding: 2px 6px; |
|
||||||
border-radius: 3px; |
|
||||||
font-size: 0.9em; |
|
||||||
} |
|
||||||
|
|
||||||
.code-block { |
|
||||||
background: #2d2d2d; |
|
||||||
color: #f8f8f2; |
|
||||||
padding: 15px; |
|
||||||
border-radius: 6px; |
|
||||||
overflow-x: auto; |
|
||||||
font-family: 'Courier New', monospace; |
|
||||||
font-size: 0.9em; |
|
||||||
line-height: 1.5; |
|
||||||
margin: 15px 0; |
|
||||||
max-height: 400px; |
|
||||||
overflow-y: auto; |
|
||||||
} |
|
||||||
|
|
||||||
.code-block pre { |
|
||||||
margin: 0; |
|
||||||
white-space: pre-wrap; |
|
||||||
word-wrap: break-word; |
|
||||||
} |
|
||||||
|
|
||||||
.rendered-output { |
|
||||||
background: white; |
|
||||||
border: 1px solid #ddd; |
|
||||||
padding: 20px; |
|
||||||
border-radius: 6px; |
|
||||||
margin: 15px 0; |
|
||||||
min-height: 200px; |
|
||||||
} |
|
||||||
|
|
||||||
.rendered-output * { |
|
||||||
max-width: 100%; |
|
||||||
} |
|
||||||
|
|
||||||
.stats { |
|
||||||
display: grid; |
|
||||||
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); |
|
||||||
gap: 15px; |
|
||||||
margin-top: 20px; |
|
||||||
} |
|
||||||
|
|
||||||
.stat-card { |
|
||||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
||||||
color: white; |
|
||||||
padding: 20px; |
|
||||||
border-radius: 8px; |
|
||||||
text-align: center; |
|
||||||
} |
|
||||||
|
|
||||||
.stat-card .number { |
|
||||||
font-size: 2.5em; |
|
||||||
font-weight: bold; |
|
||||||
margin-bottom: 5px; |
|
||||||
} |
|
||||||
|
|
||||||
.stat-card .label { |
|
||||||
font-size: 0.9em; |
|
||||||
opacity: 0.9; |
|
||||||
} |
|
||||||
|
|
||||||
.list-item { |
|
||||||
background: #f8f9fa; |
|
||||||
padding: 8px 12px; |
|
||||||
margin: 5px 0; |
|
||||||
border-radius: 4px; |
|
||||||
border-left: 3px solid #95a5a6; |
|
||||||
} |
|
||||||
|
|
||||||
.list-item code { |
|
||||||
background: #e9ecef; |
|
||||||
padding: 2px 6px; |
|
||||||
border-radius: 3px; |
|
||||||
font-size: 0.85em; |
|
||||||
} |
|
||||||
|
|
||||||
.success-badge { |
|
||||||
display: inline-block; |
|
||||||
background: #27ae60; |
|
||||||
color: white; |
|
||||||
padding: 4px 12px; |
|
||||||
border-radius: 12px; |
|
||||||
font-size: 0.85em; |
|
||||||
font-weight: 500; |
|
||||||
margin-left: 10px; |
|
||||||
} |
|
||||||
|
|
||||||
.warning-badge { |
|
||||||
display: inline-block; |
|
||||||
background: #f39c12; |
|
||||||
color: white; |
|
||||||
padding: 4px 12px; |
|
||||||
border-radius: 12px; |
|
||||||
font-size: 0.85em; |
|
||||||
font-weight: 500; |
|
||||||
margin-left: 10px; |
|
||||||
} |
|
||||||
|
|
||||||
.comparison { |
|
||||||
display: grid; |
|
||||||
grid-template-columns: 1fr 1fr; |
|
||||||
gap: 20px; |
|
||||||
margin-top: 20px; |
|
||||||
} |
|
||||||
|
|
||||||
@media (max-width: 768px) { |
|
||||||
.comparison { |
|
||||||
grid-template-columns: 1fr; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
.json-view { |
|
||||||
background: #f8f9fa; |
|
||||||
padding: 15px; |
|
||||||
border-radius: 6px; |
|
||||||
overflow-x: auto; |
|
||||||
font-family: 'Courier New', monospace; |
|
||||||
font-size: 0.85em; |
|
||||||
max-height: 300px; |
|
||||||
overflow-y: auto; |
|
||||||
} |
|
||||||
</style> |
|
||||||
</head> |
|
||||||
<body> |
|
||||||
<div class="container"> |
|
||||||
<h1>GC Parser Test Report</h1> |
|
||||||
<p class="subtitle">Generated: ${new Date().toLocaleString()}</p> |
|
||||||
|
|
||||||
<!-- Markdown Section --> |
|
||||||
<div class="section"> |
|
||||||
<h2>Markdown Document Test <span class="success-badge">✓ Parsed</span></h2> |
|
||||||
|
|
||||||
<div class="tabs"> |
|
||||||
<button class="tab active" onclick="showTab('md-overview')">Overview</button> |
|
||||||
<button class="tab" onclick="showTab('md-original')">Original Content</button> |
|
||||||
<button class="tab" onclick="showTab('md-rendered')">Rendered Output</button> |
|
||||||
<button class="tab" onclick="showTab('md-metadata')">Metadata</button> |
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="md-overview" class="tab-content active"> |
|
||||||
<div class="stats"> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${markdown.result.nostrLinks.length}</div> |
|
||||||
<div class="label">Nostr Links</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${markdown.result.wikilinks.length}</div> |
|
||||||
<div class="label">Wikilinks</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${markdown.result.hashtags.length}</div> |
|
||||||
<div class="label">Hashtags</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${markdown.result.links.length}</div> |
|
||||||
<div class="label">Links</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${markdown.result.media.length}</div> |
|
||||||
<div class="label">Media URLs</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${markdown.result.hasLaTeX ? 'Yes' : 'No'}</div> |
|
||||||
<div class="label">Has LaTeX</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${markdown.result.hasMusicalNotation ? 'Yes' : 'No'}</div> |
|
||||||
<div class="label">Has Music</div> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
|
|
||||||
<h3>Frontmatter</h3> |
|
||||||
${markdown.result.frontmatter ? ` |
|
||||||
<div class="metadata-grid"> |
|
||||||
${Object.entries(markdown.result.frontmatter).map(([key, value]) => ` |
|
||||||
<div class="metadata-item"> |
|
||||||
<strong>${escapeHtml(key)}</strong> |
|
||||||
<code>${escapeHtml(JSON.stringify(value))}</code> |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
</div> |
|
||||||
` : '<p><em>No frontmatter found</em></p>'}
|
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="md-original" class="tab-content"> |
|
||||||
<h3>Original Markdown Content</h3> |
|
||||||
<div class="code-block"> |
|
||||||
<pre>${escapeHtml(markdown.original)}</pre> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="md-rendered" class="tab-content"> |
|
||||||
<h3>Rendered HTML Output</h3> |
|
||||||
<div class="rendered-output"> |
|
||||||
${cleanHtmlContent(markdown.result.content)} |
|
||||||
</div> |
|
||||||
<details style="margin-top: 15px;"> |
|
||||||
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML (Final Processed State)</summary> |
|
||||||
<div class="code-block" style="margin-top: 10px;"> |
|
||||||
<pre>${escapeHtml(markdown.result.content)}</pre> |
|
||||||
</div> |
|
||||||
</details> |
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="md-metadata" class="tab-content"> |
|
||||||
<h3>Extracted Metadata</h3> |
|
||||||
|
|
||||||
${markdown.result.nostrLinks.length > 0 ? ` |
|
||||||
<h4>Nostr Links (${markdown.result.nostrLinks.length})</h4> |
|
||||||
${markdown.result.nostrLinks.map((link: any) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code> |
|
||||||
${link.text ? ` - ${escapeHtml(link.text)}` : ''} |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${markdown.result.wikilinks.length > 0 ? ` |
|
||||||
<h4>Wikilinks (${markdown.result.wikilinks.length})</h4> |
|
||||||
${markdown.result.wikilinks.map((wl: any) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<code>${escapeHtml(wl.original)}</code> → dtag: <code>${escapeHtml(wl.dtag)}</code> |
|
||||||
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''} |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${markdown.result.hashtags.length > 0 ? ` |
|
||||||
<h4>Hashtags (${markdown.result.hashtags.length})</h4> |
|
||||||
${markdown.result.hashtags.map((tag: string) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<code>#${escapeHtml(tag)}</code> |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${markdown.result.links.length > 0 ? ` |
|
||||||
<h4>Links (${markdown.result.links.length})</h4> |
|
||||||
${markdown.result.links.map((link: any) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a> |
|
||||||
${link.isExternal ? '<span class="warning-badge">External</span>' : ''} |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${markdown.result.media.length > 0 ? ` |
|
||||||
<h4>Media URLs (${markdown.result.media.length})</h4> |
|
||||||
${markdown.result.media.map((url: string) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a> |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${markdown.result.tableOfContents ? ` |
|
||||||
<h4>Table of Contents</h4> |
|
||||||
<div class="rendered-output"> |
|
||||||
${markdown.result.tableOfContents} |
|
||||||
</div> |
|
||||||
` : ''}
|
|
||||||
</div> |
|
||||||
</div> |
|
||||||
|
|
||||||
<!-- AsciiDoc Section --> |
|
||||||
<div class="section"> |
|
||||||
<h2>AsciiDoc Document Test <span class="success-badge">✓ Parsed</span></h2> |
|
||||||
|
|
||||||
<div class="tabs"> |
|
||||||
<button class="tab active" onclick="showTab('ad-overview')">Overview</button> |
|
||||||
<button class="tab" onclick="showTab('ad-original')">Original Content</button> |
|
||||||
<button class="tab" onclick="showTab('ad-rendered')">Rendered Output</button> |
|
||||||
<button class="tab" onclick="showTab('ad-metadata')">Metadata</button> |
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="ad-overview" class="tab-content active"> |
|
||||||
<div class="stats"> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${asciidoc.result.nostrLinks.length}</div> |
|
||||||
<div class="label">Nostr Links</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${asciidoc.result.wikilinks.length}</div> |
|
||||||
<div class="label">Wikilinks</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${asciidoc.result.hashtags.length}</div> |
|
||||||
<div class="label">Hashtags</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${asciidoc.result.links.length}</div> |
|
||||||
<div class="label">Links</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${asciidoc.result.media.length}</div> |
|
||||||
<div class="label">Media URLs</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${asciidoc.result.hasLaTeX ? 'Yes' : 'No'}</div> |
|
||||||
<div class="label">Has LaTeX</div> |
|
||||||
</div> |
|
||||||
<div class="stat-card"> |
|
||||||
<div class="number">${asciidoc.result.hasMusicalNotation ? 'Yes' : 'No'}</div> |
|
||||||
<div class="label">Has Music</div> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
|
|
||||||
<h3>Frontmatter</h3> |
|
||||||
${asciidoc.result.frontmatter ? ` |
|
||||||
<div class="metadata-grid"> |
|
||||||
${Object.entries(asciidoc.result.frontmatter).map(([key, value]) => ` |
|
||||||
<div class="metadata-item"> |
|
||||||
<strong>${escapeHtml(key)}</strong> |
|
||||||
<code>${escapeHtml(JSON.stringify(value))}</code> |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
</div> |
|
||||||
` : '<p><em>No frontmatter found</em></p>'}
|
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="ad-original" class="tab-content"> |
|
||||||
<h3>Original AsciiDoc Content</h3> |
|
||||||
<div class="code-block"> |
|
||||||
<pre>${escapeHtml(asciidoc.original)}</pre> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="ad-rendered" class="tab-content"> |
|
||||||
<h3>Rendered HTML Output</h3> |
|
||||||
<div class="rendered-output"> |
|
||||||
${cleanHtmlContent(asciidoc.result.content)} |
|
||||||
</div> |
|
||||||
<details style="margin-top: 15px;"> |
|
||||||
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML (Final Processed State)</summary> |
|
||||||
<div class="code-block" style="margin-top: 10px;"> |
|
||||||
<pre>${escapeHtml(asciidoc.result.content)}</pre> |
|
||||||
</div> |
|
||||||
</details> |
|
||||||
</div> |
|
||||||
|
|
||||||
<div id="ad-metadata" class="tab-content"> |
|
||||||
<h3>Extracted Metadata</h3> |
|
||||||
|
|
||||||
${asciidoc.result.nostrLinks.length > 0 ? ` |
|
||||||
<h4>Nostr Links (${asciidoc.result.nostrLinks.length})</h4> |
|
||||||
${asciidoc.result.nostrLinks.map((link: any) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code> |
|
||||||
${link.text ? ` - ${escapeHtml(link.text)}` : ''} |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${asciidoc.result.wikilinks.length > 0 ? ` |
|
||||||
<h4>Wikilinks (${asciidoc.result.wikilinks.length})</h4> |
|
||||||
${asciidoc.result.wikilinks.map((wl: any) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<code>${escapeHtml(wl.original)}</code> → dtag: <code>${escapeHtml(wl.dtag)}</code> |
|
||||||
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''} |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${asciidoc.result.hashtags.length > 0 ? ` |
|
||||||
<h4>Hashtags (${asciidoc.result.hashtags.length})</h4> |
|
||||||
${asciidoc.result.hashtags.map((tag: string) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<code>#${escapeHtml(tag)}</code> |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${asciidoc.result.links.length > 0 ? ` |
|
||||||
<h4>Links (${asciidoc.result.links.length})</h4> |
|
||||||
${asciidoc.result.links.map((link: any) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a> |
|
||||||
${link.isExternal ? '<span class="warning-badge">External</span>' : ''} |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${asciidoc.result.media.length > 0 ? ` |
|
||||||
<h4>Media URLs (${asciidoc.result.media.length})</h4> |
|
||||||
${asciidoc.result.media.map((url: string) => ` |
|
||||||
<div class="list-item"> |
|
||||||
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a> |
|
||||||
</div> |
|
||||||
`).join('')}
|
|
||||||
` : ''}
|
|
||||||
|
|
||||||
${asciidoc.result.tableOfContents ? ` |
|
||||||
<h4>Table of Contents</h4> |
|
||||||
<div class="rendered-output"> |
|
||||||
${asciidoc.result.tableOfContents} |
|
||||||
</div> |
|
||||||
` : ''}
|
|
||||||
</div> |
|
||||||
</div> |
|
||||||
</div> |
|
||||||
|
|
||||||
<script> |
|
||||||
function showTab(tabId) { |
|
||||||
// Hide all tab contents
|
|
||||||
const allContents = document.querySelectorAll('.tab-content'); |
|
||||||
allContents.forEach(content => content.classList.remove('active')); |
|
||||||
|
|
||||||
// Remove active class from all tabs
|
|
||||||
const allTabs = document.querySelectorAll('.tab'); |
|
||||||
allTabs.forEach(tab => tab.classList.remove('active')); |
|
||||||
|
|
||||||
// Show selected tab content
|
|
||||||
const selectedContent = document.getElementById(tabId); |
|
||||||
if (selectedContent) { |
|
||||||
selectedContent.classList.add('active'); |
|
||||||
} |
|
||||||
|
|
||||||
// Add active class to clicked tab
|
|
||||||
event.target.classList.add('active'); |
|
||||||
} |
|
||||||
</script> |
|
||||||
</body> |
|
||||||
</html>`;
|
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Clean HTML content to extract only the body content |
|
||||||
* Removes full HTML document structure if present |
|
||||||
* Prevents infinite loops by ensuring we only extract once and handle nested structures |
|
||||||
* Also detects and prevents content duplication (doom loops) |
|
||||||
*/ |
|
||||||
function cleanHtmlContent(html: string): string { |
|
||||||
if (!html || typeof html !== 'string') { |
|
||||||
return ''; |
|
||||||
} |
|
||||||
|
|
||||||
let cleaned = html.trim(); |
|
||||||
|
|
||||||
// Count occurrences to detect nested structures
|
|
||||||
const htmlTagCount = (cleaned.match(/<html[^>]*>/gi) || []).length; |
|
||||||
const bodyTagCount = (cleaned.match(/<body[^>]*>/gi) || []).length; |
|
||||||
const bodyCloseCount = (cleaned.match(/<\/body>/gi) || []).length; |
|
||||||
|
|
||||||
// If we have multiple body tags, there might be nested structures
|
|
||||||
// Extract only the outermost body content
|
|
||||||
if (bodyTagCount > 0 && bodyCloseCount > 0) { |
|
||||||
// Find the first <body> tag
|
|
||||||
const firstBodyIndex = cleaned.indexOf('<body'); |
|
||||||
if (firstBodyIndex !== -1) { |
|
||||||
// Find the opening > of the first body tag
|
|
||||||
const bodyTagEnd = cleaned.indexOf('>', firstBodyIndex); |
|
||||||
if (bodyTagEnd !== -1) { |
|
||||||
const bodyStart = bodyTagEnd + 1; |
|
||||||
// Find the last </body> tag (to handle nested structures)
|
|
||||||
const bodyEnd = cleaned.lastIndexOf('</body>'); |
|
||||||
|
|
||||||
if (bodyEnd > bodyStart) { |
|
||||||
cleaned = cleaned.substring(bodyStart, bodyEnd).trim(); |
|
||||||
|
|
||||||
// Recursively clean if there are still nested structures
|
|
||||||
// But limit recursion to prevent infinite loops
|
|
||||||
const remainingBodyTags = (cleaned.match(/<body[^>]*>/gi) || []).length; |
|
||||||
if (remainingBodyTags > 0 && remainingBodyTags < bodyTagCount) { |
|
||||||
// There are still nested body tags, clean again but only once more
|
|
||||||
cleaned = cleaned.replace(/<body[^>]*>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<\/body>/gi, ''); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Remove any remaining DOCTYPE, html, head, or body tags that might be left
|
|
||||||
// Do this in a way that doesn't create nested matches
|
|
||||||
let previousLength = 0; |
|
||||||
let iterations = 0; |
|
||||||
while (iterations < 10 && cleaned.length !== previousLength) { |
|
||||||
previousLength = cleaned.length; |
|
||||||
cleaned = cleaned.replace(/<!DOCTYPE[^>]*>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<html[^>]*>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<\/html>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<body[^>]*>/gi, ''); |
|
||||||
cleaned = cleaned.replace(/<\/body>/gi, ''); |
|
||||||
cleaned = cleaned.trim(); |
|
||||||
iterations++; |
|
||||||
} |
|
||||||
|
|
||||||
// Detect and prevent content duplication (doom loops)
|
|
||||||
// Strategy: Use a fingerprint of the first part of the content to detect repetition
|
|
||||||
|
|
||||||
// Create a fingerprint from the first meaningful chunk (skip leading whitespace/tags)
|
|
||||||
const contentStart = cleaned.search(/[^\s<]/); |
|
||||||
if (contentStart !== -1) { |
|
||||||
// Use first 2000 characters as fingerprint, or 1/4 of content, whichever is smaller
|
|
||||||
const fingerprintLength = Math.min(2000, Math.max(500, Math.floor(cleaned.length / 4))); |
|
||||||
const fingerprint = cleaned.substring(contentStart, contentStart + fingerprintLength); |
|
||||||
|
|
||||||
// Find where this fingerprint repeats
|
|
||||||
const secondOccurrence = cleaned.indexOf(fingerprint, contentStart + fingerprintLength); |
|
||||||
|
|
||||||
if (secondOccurrence !== -1 && secondOccurrence < cleaned.length * 0.85) { |
|
||||||
// Content is clearly duplicated - return only the first occurrence
|
|
||||||
cleaned = cleaned.substring(0, secondOccurrence).trim(); |
|
||||||
return cleaned; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Additional check: detect repeated patterns using common document markers
|
|
||||||
const documentMarkers = [ |
|
||||||
/#\s+Markdown\s+Test\s+Document/gi, |
|
||||||
/==\s+Bullet\s+list/gi, |
|
||||||
/##\s+Bullet\s+list/gi, |
|
||||||
]; |
|
||||||
|
|
||||||
for (const marker of documentMarkers) { |
|
||||||
const matches = cleaned.match(marker); |
|
||||||
if (matches && matches.length > 1) { |
|
||||||
const firstMatch = cleaned.search(marker); |
|
||||||
if (firstMatch !== -1) { |
|
||||||
// Get a chunk starting from this marker
|
|
||||||
const chunkStart = firstMatch; |
|
||||||
const chunkLength = Math.min(1500, Math.floor(cleaned.length / 3)); |
|
||||||
const chunk = cleaned.substring(chunkStart, chunkStart + chunkLength); |
|
||||||
|
|
||||||
// Find where this chunk repeats
|
|
||||||
const secondChunk = cleaned.indexOf(chunk, chunkStart + chunkLength); |
|
||||||
|
|
||||||
if (secondChunk !== -1 && secondChunk < cleaned.length * 0.9) { |
|
||||||
// Content repeats here - truncate
|
|
||||||
cleaned = cleaned.substring(0, secondChunk).trim(); |
|
||||||
return cleaned; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Final check: detect repeated section headers
|
|
||||||
const sectionHeaderPattern = /(?:^|\n)(?:##?|==)\s+[^\n<]+/gm; |
|
||||||
const sectionHeaders: string[] = []; |
|
||||||
let match; |
|
||||||
|
|
||||||
while ((match = sectionHeaderPattern.exec(cleaned)) !== null) { |
|
||||||
sectionHeaders.push(match[0].trim()); |
|
||||||
} |
|
||||||
|
|
||||||
// If we have many headers, check for repetition
|
|
||||||
if (sectionHeaders.length > 8) { |
|
||||||
const uniqueHeaders = new Set(sectionHeaders); |
|
||||||
// If we have way more headers than unique ones, content is repeating
|
|
||||||
if (sectionHeaders.length > uniqueHeaders.size * 2.5) { |
|
||||||
// Find the first occurrence of each unique header
|
|
||||||
const uniqueHeaderArray = Array.from(uniqueHeaders); |
|
||||||
const firstUniqueHeader = uniqueHeaderArray[0]; |
|
||||||
const firstHeaderIndex = cleaned.indexOf(firstUniqueHeader); |
|
||||||
|
|
||||||
if (firstHeaderIndex !== -1) { |
|
||||||
// Find the second occurrence of the first header
|
|
||||||
const secondHeaderIndex = cleaned.indexOf(firstUniqueHeader, firstHeaderIndex + 200); |
|
||||||
|
|
||||||
if (secondHeaderIndex !== -1 && secondHeaderIndex < cleaned.length * 0.85) { |
|
||||||
// Content repeats here - truncate
|
|
||||||
cleaned = cleaned.substring(0, secondHeaderIndex).trim(); |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
return cleaned; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Escape HTML special characters |
|
||||||
*/ |
|
||||||
export function escapeHtml(text: string): string { |
|
||||||
const map: Record<string, string> = { |
|
||||||
'&': '&', |
|
||||||
'<': '<', |
|
||||||
'>': '>', |
|
||||||
'"': '"', |
|
||||||
"'": ''', |
|
||||||
}; |
|
||||||
return text.replace(/[&<>"']/g, (m) => map[m]); |
|
||||||
} |
|
||||||
Loading…
Reference in new issue