Browse Source

parse asciidoc

master
Silberengel 2 weeks ago
parent
commit
e0abef84ab
  1. 1
      .gitignore
  2. 62
      asciidoc_testdoc.adoc
  3. 72
      debug-asciidoc-output.adoc
  4. 27
      debug-lists.ts
  5. 55
      example.js
  6. 2
      generate-test-report.d.ts
  7. 1
      generate-test-report.d.ts.map
  8. 91
      generate-test-report.js
  9. 1
      generate-test-report.js.map
  10. 71
      generate-test-report.ts
  11. 29
      jest.config.js
  12. 63
      markdown_testdoc.md
  13. 4
      package.json
  14. 353
      src/__tests__/asciidoc.test.ts
  15. 238
      src/__tests__/parser.test.ts
  16. 692
      src/converters/to-asciidoc.js
  17. 330
      src/converters/to-asciidoc.ts
  18. 70
      src/detector.js
  19. 101
      src/detector.ts
  20. 160
      src/extractors/frontmatter.js
  21. 177
      src/extractors/frontmatter.ts
  22. 243
      src/extractors/metadata.js
  23. 396
      src/extractors/metadata.ts
  24. 92
      src/parser.js
  25. 233
      src/parser.ts
  26. 481
      src/post-processor.ts
  27. 175
      src/pre-processor.ts
  28. 148
      src/processors/asciidoc.js
  29. 209
      src/processors/asciidoc.ts
  30. 693
      src/processors/html-postprocess.js
  31. 599
      src/processors/html-postprocess.ts
  32. 239
      src/processors/html-utils.js
  33. 164
      src/processors/html-utils.ts
  34. 93
      src/processors/markdown.ts
  35. 143
      src/processors/music.js
  36. 152
      src/processors/music.ts
  37. 14
      src/types.js
  38. 15
      src/types.ts
  39. 20
      src/types/asciidoctor.d.ts
  40. 732
      src/utils/report-generator.ts
  41. 427
      test-parser-report.test.ts
  42. 10144
      test-report.html
  43. 4
      tsconfig.json
  44. 10
      tsconfig.test.json

1
.gitignore vendored

@ -20,6 +20,7 @@ node_modules/ @@ -20,6 +20,7 @@ node_modules/
package-lock.json
dist/
*.log
test-output/
# IDE
.idea/

62
asciidoc_testdoc.adoc

@ -112,41 +112,49 @@ link:https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-dr @@ -112,41 +112,49 @@ link:https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-dr
this should render as plaintext: `http://www.example.com`
this should be a hyperlink: www.example.com
this should be a hyperlink to the http URL with the same address, so wss://theforest.nostr1.com should render like link:wss://theforest.nostr1.com[https://theforest.nostr1.com]
this should be a hyperlink to the http URL with the same address link:https://theforest.nostr1.com[wss://theforest.nostr1.com]
=== Images
https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
image::https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png[test image, width=100%]
image::https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png[width=400]
=== Media
==== YouTube
https://youtube.com/shorts/ZWfvChb-i0w
Normal
https://www.youtube.com/watch?v=KGIAS0cslSU
https://youtu.be/KGIAS0cslSU
video::KGIAS0cslSU[youtube]
Shorts
https://www.youtube.com/shorts/s-BQhXdCs8Y
link:https://youtube.com/shorts/ZWfvChb-i0w[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Youtube link with pic]]
video::s-BQhXdCs8Y[youtube]
==== Spotify
https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ
https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx
link:https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Spotify link with pic]]
link:https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx[]
==== Audio
https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3
link:https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Audio link with pic]]
audio::https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3[]
==== Video
https://v.nostr.build/MTjaYib4upQuf8zn.mp4
link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Video link with pic]]
video::https://v.nostr.build/MTjaYib4upQuf8zn.mp4[]
== Tables
@ -155,8 +163,12 @@ link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.o @@ -155,8 +163,12 @@ link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.o
[cols="1,2"]
|===
|Syntax|Description
|Header|Title
|Paragraph|Text
|Header
|Title
|Paragraph
|Text
|===
=== Unorderly
@ -164,8 +176,12 @@ link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.o @@ -164,8 +176,12 @@ link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.o
[cols="1,2"]
|===
|Syntax|Description
|Header|Title
|Paragraph|Text
|Header
|Title
|Paragraph
|Text
|===
=== With alignment
@ -173,8 +189,14 @@ link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.o @@ -173,8 +189,14 @@ link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.o
[cols="<,^,>"]
|===
|Syntax|Description|Test Text
|Header|Title|Here's this
|Paragraph|Text|And more
|Header
|Title
|Here's this
|Paragraph
|Text
|And more
|===
== Code blocks
@ -251,7 +273,7 @@ $$ @@ -251,7 +273,7 @@ $$
=== ABC Notation
[source,abc]
[abc]
----
X:1
T:Ohne Titel
@ -270,7 +292,7 @@ dd d2 | ee e2 | fg ad | ed/c/ d2 :| @@ -270,7 +292,7 @@ dd d2 | ee e2 | fg ad | ed/c/ d2 :|
=== PlantUML
[source,plantuml]
[plantuml]
----
@startuml
Alice -> Bob: Authentication Request
@ -280,7 +302,7 @@ Bob --> Alice: Authentication Response @@ -280,7 +302,7 @@ Bob --> Alice: Authentication Response
=== BPMN
[source,plantuml]
[plantuml]
----
@startbpmn
start
@ -306,7 +328,7 @@ Here's a simple footnote,footnote:[This is the first footnote.] and here's a lon @@ -306,7 +328,7 @@ Here's a simple footnote,footnote:[This is the first footnote.] and here's a lon
== Anchor links
<<bullet-list,Link to bullet list section>>
<<_bullet_list,Link to bullet list section>>
== Formatting

72
debug-asciidoc-output.adoc

@ -1,72 +0,0 @@ @@ -1,72 +0,0 @@
This is a test unordered list with mixed bullets:
* First item with a number 2. in it
* Second item
* Third item
* Indented item
* Indented item
* Fourth item
Another unordered list:
* 1st item
* 2nd item
* third item containing _italic_ text
* indented item
* second indented item
* fourth item
This is a test ordered list with indented items:
. First item
. Second item
. Third item
. Indented item
. Indented item
. Fourth item
Ordered list where everything has the same number:
. First item
. Second item
. Third item
. Fourth item
Ordered list that is wrongly numbered:
. First item
. Second item
. Third item
. Fourth item
This is a mixed list with indented items:
. First item
. Second item
. Third item
* Indented item
* Indented item
. Fourth item
This is another mixed list with indented items:
* First item
* Second item
* Third item
. Indented item
. Indented item
* Fourth item

27
debug-lists.ts

@ -1,27 +0,0 @@ @@ -1,27 +0,0 @@
import { convertToAsciidoc } from './src/converters/to-asciidoc';
import { detectFormat } from './src/detector';
import * as fs from 'fs';
import * as path from 'path';
// Read just the list section from markdown test doc
const markdownContent = fs.readFileSync(
path.join(__dirname, 'markdown_testdoc.md'),
'utf-8'
);
// Extract just the list sections
const listSection = markdownContent.split('## Bullet list')[1]?.split('##')[0] || markdownContent;
console.log('=== ORIGINAL MARKDOWN ===');
console.log(listSection);
console.log('\n=== DETECTED FORMAT ===');
const format = detectFormat(listSection);
console.log(format);
console.log('\n=== CONVERTED ASCIIDOC ===');
const asciidoc = convertToAsciidoc(listSection, format, '', {});
console.log(asciidoc);
// Write to file for inspection
fs.writeFileSync(path.join(__dirname, 'debug-asciidoc-output.adoc'), asciidoc);
console.log('\n=== Written to debug-asciidoc-output.adoc ===');

55
example.js

@ -1,55 +0,0 @@ @@ -1,55 +0,0 @@
#!/usr/bin/env node
/**
* Example usage of gc-parser
* This can be called from Go or used directly in Node.js
*/
const { Parser, defaultOptions } = require('./dist/index.js');
async function main() {
// Create parser with default options
const opts = defaultOptions();
opts.linkBaseURL = process.env.LINK_BASE_URL || 'https://example.com';
const parser = new Parser(opts);
// Get content from command line argument or stdin
let content = '';
if (process.argv[2]) {
content = process.argv[2];
} else {
// Read from stdin
const readline = require('readline');
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
terminal: false
});
for await (const line of rl) {
content += line + '\n';
}
}
if (!content) {
console.error('No content provided');
process.exit(1);
}
try {
const result = await parser.process(content);
// Output as JSON for easy parsing
console.log(JSON.stringify(result, null, 2));
} catch (error) {
console.error('Error processing content:', error);
process.exit(1);
}
}
if (require.main === module) {
main();
}
module.exports = { main };

2
generate-test-report.d.ts vendored

@ -1,2 +0,0 @@ @@ -1,2 +0,0 @@
export {};
//# sourceMappingURL=generate-test-report.d.ts.map

1
generate-test-report.d.ts.map

@ -1 +0,0 @@ @@ -1 +0,0 @@
{"version":3,"file":"generate-test-report.d.ts","sourceRoot":"","sources":["generate-test-report.ts"],"names":[],"mappings":""}

91
generate-test-report.js

@ -1,91 +0,0 @@ @@ -1,91 +0,0 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const parser_1 = require("./src/parser");
const report_generator_1 = require("./src/utils/report-generator");
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
/**
* Standalone script to generate HTML test report
* Run with: npm run test:report
*/
async function main() {
console.log('📝 Generating test report...\n');
// Initialize parser
const parser = new parser_1.Parser({
linkBaseURL: 'https://example.com',
wikilinkUrl: '/events?d={dtag}',
hashtagUrl: '/notes?t={topic}',
});
// Read test documents
const markdownPath = path.join(__dirname, 'markdown_testdoc.md');
const asciidocPath = path.join(__dirname, 'asciidoc_testdoc.adoc');
if (!fs.existsSync(markdownPath)) {
console.error(`❌ Error: ${markdownPath} not found`);
process.exit(1);
}
if (!fs.existsSync(asciidocPath)) {
console.error(`❌ Error: ${asciidocPath} not found`);
process.exit(1);
}
const markdownContent = fs.readFileSync(markdownPath, 'utf-8');
const asciidocContent = fs.readFileSync(asciidocPath, 'utf-8');
console.log('📄 Parsing markdown document...');
const markdownResult = await parser.process(markdownContent);
console.log('📄 Parsing asciidoc document...');
const asciidocResult = await parser.process(asciidocContent);
console.log('🎨 Generating HTML report...');
const htmlReport = (0, report_generator_1.generateHTMLReport)({
markdown: {
original: markdownContent,
result: markdownResult,
},
asciidoc: {
original: asciidocContent,
result: asciidocResult,
},
});
// Write HTML report to file
const reportPath = path.join(__dirname, 'test-report.html');
fs.writeFileSync(reportPath, htmlReport, 'utf-8');
console.log(`\n✅ Test report generated: ${reportPath}`);
console.log(` Open this file in your browser to view the results.\n`);
}
// Run the script
main().catch((error) => {
console.error('❌ Error generating test report:', error);
process.exit(1);
});
//# sourceMappingURL=generate-test-report.js.map

1
generate-test-report.js.map

@ -1 +0,0 @@ @@ -1 +0,0 @@
{"version":3,"file":"generate-test-report.js","sourceRoot":"","sources":["generate-test-report.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,yCAAsC;AACtC,mEAA8E;AAC9E,uCAAyB;AACzB,2CAA6B;AAE7B;;;GAGG;AAEH,KAAK,UAAU,IAAI;IACjB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;IAE9C,oBAAoB;IACpB,MAAM,MAAM,GAAG,IAAI,eAAM,CAAC;QACxB,WAAW,EAAE,qBAAqB;QAClC,WAAW,EAAE,kBAAkB;QAC/B,UAAU,EAAE,kBAAkB;KAC/B,CAAC,CAAC;IAEH,sBAAsB;IACtB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,qBAAqB,CAAC,CAAC;IACjE,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,uBAAuB,CAAC,CAAC;IAEnE,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,YAAY,YAAY,YAAY,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,YAAY,YAAY,YAAY,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,eAAe,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAC/D,MAAM,eAAe,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAE/D,OAAO,CAAC,GAAG,CAAC,iCAAiC,CAAC,CAAC;IAC/C,MAAM,cAAc,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7D,OAAO,CAAC,GAAG,CAAC,iCAAiC,CAAC,CAAC;IAC/C,MAAM,cAAc,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7D,OAAO,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC;IAC5C,MAAM,UAAU,GAAG,IAAA,qCAAkB,EAAC;QACpC,QAAQ,EAAE;YACR,QAAQ,EAAE,eAAe;YACzB,MAAM,EAAE,cAAc;SACvB;QACD,QAAQ,EAAE;YACR,QAAQ,EAAE,eAAe;YACzB,MAAM,EAAE,cAAc;SACvB;KACF,CAAC,CAAC;IAEH,4BAA4B;IAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,kBAAkB,CAAC,CAAC;IAC5D,EAAE,CAAC,aAAa,CAAC,UAAU,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC;IAElD,OAAO,CAAC,GAAG,CAAC,8BAA8B,UAAU,EAAE,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;AAC1E,CAAC;AAED,iBAAiB;AACjB,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACrB,OAAO,CAAC,KAAK,CAAC,iCAAiC,EAAE,KAAK,CAAC,CAAC;IACxD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}

71
generate-test-report.ts

@ -1,71 +0,0 @@ @@ -1,71 +0,0 @@
// Import from source files - this script should be run with ts-node or similar
// from the project root, not from dist/
import { Parser } from './src/parser';
import { generateHTMLReport } from './src/utils/report-generator';
import * as fs from 'fs';
import * as path from 'path';
/**
* Standalone script to generate HTML test report
* Run with: npm run test:report
*/
async function main() {
console.log('📝 Generating test report...\n');
// Initialize parser
const parser = new Parser({
linkBaseURL: 'https://example.com',
wikilinkUrl: '/events?d={dtag}',
hashtagUrl: '/notes?t={topic}',
});
// Read test documents from project root
const baseDir = __dirname.includes('dist') ? path.join(__dirname, '..') : __dirname;
const markdownPath = path.join(baseDir, 'markdown_testdoc.md');
const asciidocPath = path.join(baseDir, 'asciidoc_testdoc.adoc');
if (!fs.existsSync(markdownPath)) {
console.error(`❌ Error: ${markdownPath} not found`);
process.exit(1);
}
if (!fs.existsSync(asciidocPath)) {
console.error(`❌ Error: ${asciidocPath} not found`);
process.exit(1);
}
const markdownContent = fs.readFileSync(markdownPath, 'utf-8');
const asciidocContent = fs.readFileSync(asciidocPath, 'utf-8');
console.log('📄 Parsing markdown document...');
const markdownResult = await parser.process(markdownContent);
console.log('📄 Parsing asciidoc document...');
const asciidocResult = await parser.process(asciidocContent);
console.log('🎨 Generating HTML report...');
const htmlReport = generateHTMLReport({
markdown: {
original: markdownContent,
result: markdownResult,
},
asciidoc: {
original: asciidocContent,
result: asciidocResult,
},
});
// Write HTML report to file (adjust path based on where script is run from)
const reportPath = path.join(baseDir, 'test-report.html');
fs.writeFileSync(reportPath, htmlReport, 'utf-8');
console.log(`\n✅ Test report generated: ${reportPath}`);
console.log(` Open this file in your browser to view the results.\n`);
}
// Run the script
main().catch((error) => {
console.error('❌ Error generating test report:', error);
process.exit(1);
});

29
jest.config.js

@ -1,24 +1,23 @@ @@ -1,24 +1,23 @@
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
roots: ['<rootDir>'],
testMatch: ['**/*.test.ts'],
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
collectCoverageFrom: [
'src/**/*.ts',
'!src/**/*.d.ts',
],
roots: ['<rootDir>/src'],
testMatch: ['**/__tests__/**/*.test.ts', '**/?(*.)+(spec|test).ts'],
testPathIgnorePatterns: ['/node_modules/', '/dist/', 'asciidoc.test.ts'],
transform: {
'^.+\\.ts$': ['ts-jest', {
tsconfig: 'tsconfig.test.json',
tsconfig: {
esModuleInterop: true,
},
}],
'^.+\\.js$': 'babel-jest',
},
// Don't transform AsciiDoctor packages - they use Opal runtime which breaks with Jest transformation
// AsciiDoctor uses CommonJS and Opal runtime, so we need to exclude it from transformation
// The pattern matches paths to ignore (not transform)
transformIgnorePatterns: [
'node_modules/(?!(@asciidoctor)/)',
moduleFileExtensions: ['ts', 'js', 'json'],
moduleNameMapper: {
'^marked$': '<rootDir>/node_modules/marked/lib/marked.umd.js',
},
collectCoverageFrom: [
'src/**/*.ts',
'!src/**/*.d.ts',
],
// Ensure CommonJS modules are handled correctly
moduleNameMapper: {},
};

63
markdown_testdoc.md

@ -16,6 +16,7 @@ array: @@ -16,6 +16,7 @@ array:
## Bullet list
This is a test unordered list with mixed bullets:
* First item with a number 2. in it
* Second item
* Third item
@ -24,6 +25,7 @@ This is a test unordered list with mixed bullets: @@ -24,6 +25,7 @@ This is a test unordered list with mixed bullets:
* Fourth item
Another unordered list:
- 1st item
- 2nd item
- third item containing _italic_ text
@ -32,6 +34,7 @@ Another unordered list: @@ -32,6 +34,7 @@ Another unordered list:
- fourth item
This is a test ordered list with indented items:
1. First item
2. Second item
3. Third item
@ -39,19 +42,15 @@ This is a test ordered list with indented items: @@ -39,19 +42,15 @@ This is a test ordered list with indented items:
2. Indented item
4. Fourth item
Ordered list where everything has the same number:
1. First item
1. Second item
1. Third item
1. Fourth item
Ordered list that is wrongly numbered:
1. First item
8. Second item
3. Third item
5. Fourth item
This is a mixed list with indented items:
1. First item
2. Second item
3. Third item
@ -60,6 +59,7 @@ This is a mixed list with indented items: @@ -60,6 +59,7 @@ This is a mixed list with indented items:
4. Fourth item
This is another mixed list with indented items:
- First item
- Second item
- Third item
@ -67,7 +67,6 @@ This is another mixed list with indented items: @@ -67,7 +67,6 @@ This is another mixed list with indented items:
2. Indented item
- Fourth item
## Headers
### Third-level header
@ -116,13 +115,11 @@ https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte- @@ -116,13 +115,11 @@ https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-
this should render as plaintext: `http://www.example.com`
this should be a hyperlink: www.example.com
this shouild be a hyperlink to the http URL with the same address, so wss://theforest.nostr1.com should render like [wss://theforest.nostr1.com](https://theforest.nostr1.com)
this shouild be a hyperlink to the http URL with the same address [wss://theforest.nostr1.com](https://theforest.nostr1.com)
### Images
Image: https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
![test image](https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png)
@ -132,25 +129,25 @@ Image: https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png @@ -132,25 +129,25 @@ Image: https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
https://youtube.com/shorts/ZWfvChb-i0w
[![Youtube link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://youtube.com/shorts/ZWfvChb-i0w)
![Youtube link](https://youtube.com/shorts/ZWfvChb-i0w)
#### Spotify
https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ
[![Spotify link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ)
![Spotify link](https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ)
#### Audio
https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3
[![Audio link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3)
![Audio link](https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3)
#### Video
https://v.nostr.build/MTjaYib4upQuf8zn.mp4
[![Video link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://v.nostr.build/MTjaYib4upQuf8zn.mp4)
![Video link](https://v.nostr.build/MTjaYib4upQuf8zn.mp4)
## Tables
@ -232,34 +229,6 @@ M = @@ -232,34 +229,6 @@ M =
$$
```
```latex
$$
f(x)=
\begin{cases}
1/d_{ij} & \quad \text{when $d_{ij} \leq 160$}\\
0 & \quad \text{otherwise}
\end{cases}
$$
```
### ABC Notation
```abc
X:1
T:Ohne Titel
C:Aufgezeichnet 1784
A:Seibis nahe Lichtenberg in Oberfranken
S:Handschrift, bezeichnet und datiert: "Heinrich Nicol Philipp zu Seibis den 30 Junius 1784"
M:4/4
L:1/4
K:D
dd d2 | ee e2 | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
|:\
fg ad | cB cA | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
```
## LateX
### LaTex in inline-code
@ -280,7 +249,7 @@ Here's a simple footnote,[^1] and here's a longer one.[^bignote] @@ -280,7 +249,7 @@ Here's a simple footnote,[^1] and here's a longer one.[^bignote]
## Anchor links
[Link to bullet list section](#bullet-lists)
[Link to bullet list section](#bullet-list)
## Formatting
@ -290,11 +259,7 @@ Here's a simple footnote,[^1] and here's a longer one.[^bignote] @@ -290,11 +259,7 @@ Here's a simple footnote,[^1] and here's a longer one.[^bignote]
### Bold
This is *bold* text. So is this **bold** text.
### Italic
This is _italic_ text. So is this __italic__ text.
This is *italic* text. So is this **bold** text.
### Task List

4
package.json

@ -7,6 +7,8 @@ @@ -7,6 +7,8 @@
"scripts": {
"build": "tsc",
"test": "jest",
"test:asciidoc": "ts-node src/__tests__/asciidoc.test.ts",
"test:all": "npm run test && npm run test:asciidoc",
"test:report": "ts-node generate-test-report.ts",
"prepublishOnly": "npm run build"
},
@ -23,6 +25,8 @@ @@ -23,6 +25,8 @@
"license": "MIT",
"dependencies": {
"@asciidoctor/core": "^3.0.4",
"@types/marked": "^5.0.2",
"marked": "^17.0.3",
"node-emoji": "^2.2.0"
},
"devDependencies": {

353
src/__tests__/asciidoc.test.ts

@ -0,0 +1,353 @@ @@ -0,0 +1,353 @@
import { Parser } from '../parser';
import { readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join } from 'path';
/**
* Simple test runner for AsciiDoc tests (separate from Jest due to Opal compatibility issues)
*/
async function runAsciiDocTests() {
console.log('Running AsciiDoc tests...\n');
const asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8');
const parser = new Parser({
linkBaseURL: 'https://example.com',
enableNostrAddresses: true,
wikilinkUrl: '/events?d={dtag}',
hashtagUrl: '/hashtag/{topic}'
});
let passed = 0;
let failed = 0;
const failures: string[] = [];
const testPromises: Promise<void>[] = [];
function test(name: string, fn: () => void | Promise<void>) {
const testPromise = (async () => {
try {
const result = fn();
if (result instanceof Promise) {
await result;
}
passed++;
console.log(`${name}`);
} catch (error: any) {
failed++;
failures.push(`${name}: ${error.message}`);
console.error(`${name}: ${error.message}`);
}
})();
testPromises.push(testPromise);
}
function expect(actual: any) {
return {
toBeDefined: () => {
if (actual === undefined || actual === null) {
throw new Error(`Expected value to be defined, but got ${actual}`);
}
},
toBe: (expected: any) => {
if (actual !== expected) {
throw new Error(`Expected ${expected}, but got ${actual}`);
}
},
toContain: (substring: string) => {
if (typeof actual === 'string' && !actual.includes(substring)) {
throw new Error(`Expected string to contain "${substring}"`);
}
},
toMatch: (regex: RegExp) => {
if (typeof actual === 'string' && !regex.test(actual)) {
throw new Error(`Expected string to match ${regex}`);
}
},
toHaveProperty: (prop: string) => {
if (!(prop in actual)) {
throw new Error(`Expected object to have property "${prop}"`);
}
},
toBeGreaterThan: (value: number) => {
if (typeof actual !== 'number' || actual <= value) {
throw new Error(`Expected ${actual} to be greater than ${value}`);
}
},
length: {
toBeGreaterThan: (value: number) => {
if (!Array.isArray(actual) || actual.length <= value) {
throw new Error(`Expected array length to be greater than ${value}, but got ${actual.length}`);
}
}
}
};
}
// Run tests
const result = await parser.process(asciidocContent);
// Write HTML output to file for inspection
const outputDir = join(__dirname, '../../test-output');
try {
mkdirSync(outputDir, { recursive: true });
} catch (e) {
// Directory might already exist
}
const htmlOutput = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="referrer" content="strict-origin-when-cross-origin">
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'unsafe-inline' 'unsafe-eval' https://www.youtube.com https://s.ytimg.com https://www.gstatic.com https://*.googlevideo.com; frame-src https://www.youtube.com https://youtube.com https://open.spotify.com https://*.googlevideo.com; style-src 'unsafe-inline'; img-src 'self' data: https:; media-src 'self' https:; connect-src https:; child-src https://www.youtube.com https://youtube.com;">
<title>AsciiDoc Test Output</title>
<style>
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; }
.hashtag { color: #1da1f2; font-weight: 500; }
.wikilink { color: #0066cc; text-decoration: underline; }
.nostr-link { color: #8b5cf6; text-decoration: underline; }
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; }
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; }
.line-through { text-decoration: line-through; }
.highlight { background-color: #ffeb3b; padding: 2px 4px; border-radius: 3px; }
.bare-image { max-width: 100%; width: auto; height: auto; margin: 10px 0; display: block; }
.bare-video, .bare-audio { width: 100%; max-width: 800px; margin: 10px 0; display: block; }
.youtube-embed, .spotify-embed { max-width: 100%; margin: 10px 0; border-radius: 8px; display: block; }
.youtube-embed { width: 100%; max-width: 640px; height: auto; aspect-ratio: 16/9; border: 0; display: block; }
.spotify-embed { width: 100%; max-width: 800px; }
/* Table styles */
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
table thead { background-color: #f2f2f2; }
table th { font-weight: bold; padding: 8px; border: 1px solid #ddd; background-color: #f2f2f2; }
table td { padding: 8px; border: 1px solid #ddd; }
/* Alignment classes - AsciiDoc uses halign-* and valign-* classes */
.halign-left { text-align: left !important; }
.halign-center { text-align: center !important; }
.halign-right { text-align: right !important; }
.valign-top { vertical-align: top !important; }
.valign-middle { vertical-align: middle !important; }
.valign-bottom { vertical-align: bottom !important; }
/* Also handle tableblock classes */
.tableblock.halign-left { text-align: left !important; }
.tableblock.halign-center { text-align: center !important; }
.tableblock.halign-right { text-align: right !important; }
.tableblock.valign-top { vertical-align: top !important; }
.tableblock.valign-middle { vertical-align: middle !important; }
.tableblock.valign-bottom { vertical-align: bottom !important; }
/* Task list styles */
.checklist { list-style: none; padding-left: 0; }
.checklist li { padding-left: 1.5em; position: relative; margin: 0.5em 0; }
.checklist li i.fa-check-square-o::before { content: "☑ "; font-style: normal; font-family: sans-serif; }
.checklist li i.fa-square-o::before { content: "☐ "; font-style: normal; font-family: sans-serif; }
.checklist li i { position: absolute; left: 0; font-style: normal; }
/* Fallback if Font Awesome doesn't load */
.checklist li i.fa-check-square-o { display: inline-block; width: 1em; }
.checklist li i.fa-check-square-o:before { content: "☑"; }
.checklist li i.fa-square-o { display: inline-block; width: 1em; }
.checklist li i.fa-square-o:before { content: "☐"; }
/* AsciiDoc specific styles */
.sect1, .sect2, .sect3, .sect4, .sect5 { margin-top: 1.5em; margin-bottom: 1em; }
.paragraph { margin: 1em 0; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
table th { background-color: #f2f2f2; }
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; }
</style>
</head>
<body>
<h1>AsciiDoc Test Document - Parsed Output</h1>
<hr>
${result.content}
<hr>
<h2>Metadata</h2>
<pre>${JSON.stringify({
hasLaTeX: result.hasLaTeX,
hasMusicalNotation: result.hasMusicalNotation,
nostrLinks: result.nostrLinks,
wikilinks: result.wikilinks,
hashtags: result.hashtags,
links: result.links,
media: result.media
}, null, 2)}</pre>
</body>
</html>`;
const outputPath = join(outputDir, 'asciidoc-output.html');
writeFileSync(outputPath, htmlOutput, 'utf-8');
console.log(`\n📄 HTML output written to: ${outputPath}\n`);
test('should parse AsciiDoc content', () => {
expect(result).toBeDefined();
expect(result.content).toBeDefined();
expect(typeof result.content).toBe('string');
expect(result.content.length).toBeGreaterThan(0);
});
test('should have HTML content', () => {
expect(result.content).toContain('<');
expect(result.content).toContain('>');
});
test('should extract table of contents', () => {
expect(result.tableOfContents).toBeDefined();
expect(typeof result.tableOfContents).toBe('string');
});
test('should detect LaTeX', () => {
expect(result.hasLaTeX).toBeDefined();
expect(typeof result.hasLaTeX).toBe('boolean');
expect(result.hasLaTeX).toBe(true);
});
test('should detect musical notation', () => {
expect(result.hasMusicalNotation).toBeDefined();
expect(typeof result.hasMusicalNotation).toBe('boolean');
expect(result.hasMusicalNotation).toBe(true);
});
test('should extract nostr links', () => {
expect(result.nostrLinks).toBeDefined();
expect(Array.isArray(result.nostrLinks)).toBe(true);
expect(result.nostrLinks.length).toBeGreaterThan(0);
const nostrLink = result.nostrLinks[0];
expect(nostrLink).toHaveProperty('type');
expect(nostrLink).toHaveProperty('id');
expect(nostrLink).toHaveProperty('text');
expect(nostrLink).toHaveProperty('bech32');
const validTypes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'];
if (!validTypes.includes(nostrLink.type)) {
throw new Error(`Invalid nostr type: ${nostrLink.type}`);
}
});
test('should extract wikilinks', () => {
expect(result.wikilinks).toBeDefined();
expect(Array.isArray(result.wikilinks)).toBe(true);
expect(result.wikilinks.length).toBeGreaterThan(0);
const wikilink = result.wikilinks[0];
expect(wikilink).toHaveProperty('dtag');
expect(wikilink).toHaveProperty('display');
expect(wikilink).toHaveProperty('original');
});
test('should extract hashtags', () => {
expect(result.hashtags).toBeDefined();
expect(Array.isArray(result.hashtags)).toBe(true);
expect(result.hashtags.length).toBeGreaterThan(0);
result.hashtags.forEach((tag: string) => {
if (tag.includes('#')) {
throw new Error(`Hashtag should not include #: ${tag}`);
}
});
});
test('should extract regular links', () => {
expect(result.links).toBeDefined();
expect(Array.isArray(result.links)).toBe(true);
if (result.links.length > 0) {
const link = result.links[0];
expect(link).toHaveProperty('url');
expect(link).toHaveProperty('text');
expect(link).toHaveProperty('isExternal');
expect(typeof link.isExternal).toBe('boolean');
}
});
test('should extract media URLs', () => {
expect(result.media).toBeDefined();
expect(Array.isArray(result.media)).toBe(true);
});
test('should process nostr: addresses in HTML', () => {
const nostrAddresses = result.nostrLinks;
expect(nostrAddresses.length).toBeGreaterThan(0);
nostrAddresses.forEach((link: any) => {
if (!result.content.includes(`data-nostr-type="${link.type}"`)) {
throw new Error(`Missing nostr type attribute for ${link.type}`);
}
if (!result.content.includes(`data-nostr-id="${link.bech32}"`)) {
throw new Error(`Missing nostr id attribute for ${link.bech32}`);
}
});
});
test('should process wikilinks in HTML', () => {
const wikilinks = result.wikilinks;
expect(wikilinks.length).toBeGreaterThan(0);
wikilinks.forEach((wikilink: any) => {
if (!result.content.includes(`class="wikilink"`)) {
throw new Error('Missing wikilink class');
}
if (!result.content.includes(`data-dtag="${wikilink.dtag}"`)) {
throw new Error(`Missing dtag attribute for ${wikilink.dtag}`);
}
});
});
test('should process hashtags in HTML', () => {
const hashtags = result.hashtags;
expect(hashtags.length).toBeGreaterThan(0);
hashtags.forEach((tag: string) => {
if (!result.content.includes(`data-topic="${tag}"`)) {
throw new Error(`Missing topic attribute for ${tag}`);
}
if (!result.content.includes('class="hashtag"')) {
throw new Error('Missing hashtag class');
}
});
});
test('should contain expected content sections', () => {
if (!/Bullet list|bullet/i.test(result.content)) {
throw new Error('Missing bullet list section');
}
if (!/Headers|header/i.test(result.content)) {
throw new Error('Missing headers section');
}
if (!/Media and Links|media|links/i.test(result.content)) {
throw new Error('Missing media and links section');
}
});
test('should return consistent structure', () => {
expect(result).toHaveProperty('content');
expect(result).toHaveProperty('tableOfContents');
expect(result).toHaveProperty('hasLaTeX');
expect(result).toHaveProperty('hasMusicalNotation');
expect(result).toHaveProperty('nostrLinks');
expect(result).toHaveProperty('wikilinks');
expect(result).toHaveProperty('hashtags');
expect(result).toHaveProperty('links');
expect(result).toHaveProperty('media');
});
// Wait for all tests to complete
await Promise.all(testPromises);
// Print summary
console.log(`\n${'='.repeat(50)}`);
console.log(`Tests passed: ${passed}`);
console.log(`Tests failed: ${failed}`);
if (failures.length > 0) {
console.log('\nFailures:');
failures.forEach(f => console.error(` - ${f}`));
process.exit(1);
} else {
console.log('\nAll tests passed!');
process.exit(0);
}
}
// Run tests
runAsciiDocTests().catch(error => {
console.error('Test runner error:', error);
process.exit(1);
});

238
src/__tests__/parser.test.ts

@ -0,0 +1,238 @@ @@ -0,0 +1,238 @@
import { Parser } from '../parser';
import { readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join } from 'path';
describe('Parser', () => {
let asciidocContent: string;
let markdownContent: string;
beforeAll(() => {
asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8');
markdownContent = readFileSync(join(__dirname, '../../markdown_testdoc.md'), 'utf-8');
});
// AsciiDoc tests are run separately using a Node.js script (asciidoc.test.ts)
// due to Jest/Opal runtime compatibility issues
// Run with: npm run test:asciidoc
describe('Markdown Test Document', () => {
let result: any;
beforeAll(async () => {
const parser = new Parser({
linkBaseURL: 'https://example.com',
enableNostrAddresses: true,
wikilinkUrl: '/events?d={dtag}',
hashtagUrl: '/hashtag/{topic}'
});
result = await parser.process(markdownContent);
// Write HTML output to file for inspection
const outputDir = join(__dirname, '../../test-output');
try {
mkdirSync(outputDir, { recursive: true });
} catch (e) {
// Directory might already exist
}
const htmlOutput = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Markdown Test Output</title>
<style>
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; }
.hashtag { color: #1da1f2; font-weight: 500; }
.wikilink { color: #0066cc; text-decoration: underline; }
.nostr-link { color: #8b5cf6; text-decoration: underline; }
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; }
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; }
.bare-image, .bare-video, .bare-audio { max-width: 100%; margin: 10px 0; }
.bare-video, .bare-audio { width: 100%; max-width: 600px; }
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
table th { background-color: #f2f2f2; }
</style>
</head>
<body>
<h1>Markdown Test Document - Parsed Output</h1>
<hr>
${result.content}
<hr>
<h2>Metadata</h2>
<pre>${JSON.stringify({
frontmatter: result.frontmatter,
hasLaTeX: result.hasLaTeX,
hasMusicalNotation: result.hasMusicalNotation,
nostrLinks: result.nostrLinks,
wikilinks: result.wikilinks,
hashtags: result.hashtags,
links: result.links,
media: result.media
}, null, 2)}</pre>
</body>
</html>`;
const outputPath = join(outputDir, 'markdown-output.html');
writeFileSync(outputPath, htmlOutput, 'utf-8');
// Use console.info to ensure it shows in Jest output
console.info(`\n📄 HTML output written to: ${outputPath}\n`);
});
it('should parse Markdown content', () => {
expect(result).toBeDefined();
expect(result.content).toBeDefined();
expect(typeof result.content).toBe('string');
expect(result.content.length).toBeGreaterThan(0);
});
it('should have HTML content', () => {
expect(result.content).toContain('<');
expect(result.content).toContain('>');
});
it('should extract frontmatter', () => {
expect(result.frontmatter).toBeDefined();
expect(typeof result.frontmatter).toBe('object');
expect(result.frontmatter).toHaveProperty('author');
expect(result.frontmatter.author).toBe('James Smith');
expect(result.frontmatter).toHaveProperty('summary');
expect(result.frontmatter.summary).toBe('This is a summary');
});
it('should detect LaTeX', () => {
expect(result.hasLaTeX).toBeDefined();
expect(typeof result.hasLaTeX).toBe('boolean');
// The test doc has LaTeX, so it should be true
expect(result.hasLaTeX).toBe(true);
});
it('should detect musical notation', () => {
expect(result.hasMusicalNotation).toBeDefined();
expect(typeof result.hasMusicalNotation).toBe('boolean');
});
it('should extract nostr links', () => {
expect(result.nostrLinks).toBeDefined();
expect(Array.isArray(result.nostrLinks)).toBe(true);
expect(result.nostrLinks.length).toBeGreaterThan(0);
// Check that nostr: addresses are extracted
const nostrLink = result.nostrLinks[0];
expect(nostrLink).toHaveProperty('type');
expect(nostrLink).toHaveProperty('id');
expect(nostrLink).toHaveProperty('text');
expect(nostrLink).toHaveProperty('bech32');
expect(['npub', 'nprofile', 'nevent', 'naddr', 'note']).toContain(nostrLink.type);
});
it('should extract wikilinks', () => {
expect(result.wikilinks).toBeDefined();
expect(Array.isArray(result.wikilinks)).toBe(true);
expect(result.wikilinks.length).toBeGreaterThan(0);
// Check wikilink structure
const wikilink = result.wikilinks[0];
expect(wikilink).toHaveProperty('dtag');
expect(wikilink).toHaveProperty('display');
expect(wikilink).toHaveProperty('original');
});
it('should extract hashtags', () => {
expect(result.hashtags).toBeDefined();
expect(Array.isArray(result.hashtags)).toBe(true);
expect(result.hashtags.length).toBeGreaterThan(0);
// Hashtags should not include the # symbol
result.hashtags.forEach((tag: string) => {
expect(tag).not.toContain('#');
});
});
it('should extract regular links', () => {
expect(result.links).toBeDefined();
expect(Array.isArray(result.links)).toBe(true);
if (result.links.length > 0) {
const link = result.links[0];
expect(link).toHaveProperty('url');
expect(link).toHaveProperty('text');
expect(link).toHaveProperty('isExternal');
expect(typeof link.isExternal).toBe('boolean');
}
});
it('should extract media URLs', () => {
expect(result.media).toBeDefined();
expect(Array.isArray(result.media)).toBe(true);
});
it('should process nostr: addresses in HTML', () => {
// Check that nostr: addresses are converted to links
const nostrAddresses = result.nostrLinks;
expect(nostrAddresses.length).toBeGreaterThan(0);
// Check that HTML contains links for nostr addresses
nostrAddresses.forEach((link: any) => {
expect(result.content).toContain(`data-nostr-type="${link.type}"`);
expect(result.content).toContain(`data-nostr-id="${link.bech32}"`);
});
});
it('should process wikilinks in HTML', () => {
// Check that wikilinks are converted to links
const wikilinks = result.wikilinks;
expect(wikilinks.length).toBeGreaterThan(0);
wikilinks.forEach((wikilink: any) => {
expect(result.content).toContain(`class="wikilink"`);
expect(result.content).toContain(`data-dtag="${wikilink.dtag}"`);
});
});
it('should process hashtags in HTML', () => {
// Check that hashtags are processed
const hashtags = result.hashtags;
expect(hashtags.length).toBeGreaterThan(0);
hashtags.forEach((tag: string) => {
expect(result.content).toContain(`data-topic="${tag}"`);
expect(result.content).toMatch(new RegExp(`class="hashtag"`));
});
});
it('should contain expected content sections', () => {
// Check for some expected content from the test doc
expect(result.content).toMatch(/Bullet list|bullet/i);
expect(result.content).toMatch(/Headers|header/i);
expect(result.content).toMatch(/Media and Links|media|links/i);
});
it('should have empty table of contents for markdown', () => {
// Markdown doesn't generate TOC by default
expect(result.tableOfContents).toBeDefined();
expect(typeof result.tableOfContents).toBe('string');
});
});
describe('Result structure validation', () => {
it('should return consistent structure for Markdown', async () => {
const parser = new Parser();
const result = await parser.process(markdownContent);
// Check all required fields
expect(result).toHaveProperty('content');
expect(result).toHaveProperty('tableOfContents');
expect(result).toHaveProperty('hasLaTeX');
expect(result).toHaveProperty('hasMusicalNotation');
expect(result).toHaveProperty('nostrLinks');
expect(result).toHaveProperty('wikilinks');
expect(result).toHaveProperty('hashtags');
expect(result).toHaveProperty('links');
expect(result).toHaveProperty('media');
});
});
});

692
src/converters/to-asciidoc.js

@ -1,692 +0,0 @@ @@ -1,692 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.convertToAsciidoc = convertToAsciidoc;
const types_1 = require("../types");
// Import node-emoji if available (optional dependency)
let emoji;
try {
emoji = require('node-emoji');
}
catch (e) {
// node-emoji not available, emoji conversion will be skipped
emoji = null;
}
/**
* Clean URL by removing tracking parameters
* Based on jumble's cleanUrl function
*/
function cleanUrl(url) {
try {
const parsedUrl = new URL(url);
// List of tracking parameter prefixes and exact names to remove
const trackingParams = [
// Google Analytics & Ads
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
// Facebook
'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
// Twitter/X
'twclid', 'twsrc',
// Microsoft/Bing
'msclkid', 'mc_cid', 'mc_eid',
// Adobe
'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
// Mailchimp
'mc_cid', 'mc_eid',
// HubSpot
'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
// Marketo
'mkt_tok',
// YouTube
'si', 'feature', 'kw', 'pp',
// Other common tracking
'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
// Mobile app tracking
'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
// Amazon
'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
// Affiliate tracking
'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
// Social media share tracking
'share', 'shared', 'sharesource'
];
// Remove all tracking parameters
trackingParams.forEach(param => {
parsedUrl.searchParams.delete(param);
});
// Remove any parameter that starts with utm_ or _
Array.from(parsedUrl.searchParams.keys()).forEach(key => {
if (key.startsWith('utm_') || key.startsWith('_')) {
parsedUrl.searchParams.delete(key);
}
});
return parsedUrl.toString();
}
catch {
// If URL parsing fails, return original URL
return url;
}
}
/**
* Converts content to AsciiDoc format based on detected format
* This is the unified entry point - everything becomes AsciiDoc
*/
function convertToAsciidoc(content, format, linkBaseURL, options = {}) {
let asciidoc = '';
switch (format) {
case types_1.ContentFormat.AsciiDoc:
// For AsciiDoc content, ensure proper formatting
asciidoc = content.replace(/\\n/g, '\n');
// Ensure headers are on their own lines with proper spacing
asciidoc = asciidoc.replace(/(\S[^\n]*)\n(={1,6}\s+[^\n]+)/g, (_match, before, header) => {
return `${before}\n\n${header}`;
});
break;
case types_1.ContentFormat.Wikipedia:
asciidoc = convertWikipediaToAsciidoc(content);
break;
case types_1.ContentFormat.Markdown:
asciidoc = convertMarkdownToAsciidoc(content);
break;
case types_1.ContentFormat.Plain:
default:
asciidoc = convertPlainTextToAsciidoc(content);
break;
}
// Process special elements for all content types
// Process wikilinks
asciidoc = processWikilinks(asciidoc, linkBaseURL);
// Process nostr: addresses if enabled
if (options.enableNostrAddresses !== false) {
asciidoc = processNostrAddresses(asciidoc, linkBaseURL);
}
// Process media URLs in markdown links/images first (before converting to AsciiDoc)
// This ensures media URLs in [text](url) or ![alt](url) format are detected
asciidoc = processMediaUrlsInMarkdown(asciidoc);
// Process media URLs (YouTube, Spotify, video, audio files) - for bare URLs
asciidoc = processMediaUrls(asciidoc);
// Process bare URLs (convert to AsciiDoc links)
asciidoc = processBareUrls(asciidoc);
// Process hashtags (after URLs to avoid conflicts)
asciidoc = processHashtags(asciidoc);
return asciidoc;
}
/**
* Converts Wikipedia markup to AsciiDoc format
* Handles Wikipedia-style headings, links, and formatting
*/
function convertWikipediaToAsciidoc(content) {
let asciidoc = content.replace(/\\n/g, '\n');
// Convert Wikipedia headings: == Heading == to AsciiDoc == Heading
// Wikipedia uses == for level 2, === for level 3, etc.
// AsciiDoc uses = for title, == for level 1, === for level 2, etc.
// So Wikipedia level 2 (==) maps to AsciiDoc level 1 (==)
asciidoc = asciidoc.replace(/^(=+)\s+(.+?)\s+\1$/gm, (match, equals, heading) => {
const level = equals.length - 1; // Count = signs, subtract 1 for AsciiDoc mapping
const asciidocEquals = '='.repeat(level + 1); // AsciiDoc uses one more = for same level
return `${asciidocEquals} ${heading.trim()}`;
});
// Convert Wikipedia bold: ''text'' to AsciiDoc *text*
asciidoc = asciidoc.replace(/''([^']+)''/g, '*$1*');
// Convert Wikipedia italic: 'text' to AsciiDoc _text_
// Be careful not to match apostrophes in words
asciidoc = asciidoc.replace(/(^|[^'])'([^']+)'([^']|$)/g, '$1_$2_$3');
// Convert Wikipedia links: [[Page]] or [[Page|Display]] to wikilinks
// These will be processed by processWikilinks later, but we need to ensure
// they're in the right format. Wikipedia links are already in [[...]] format
// which matches our wikilink format, so they should work as-is.
// Convert Wikipedia external links: [URL text] to AsciiDoc link:URL[text]
asciidoc = asciidoc.replace(/\[(https?:\/\/[^\s\]]+)\s+([^\]]+)\]/g, 'link:$1[$2]');
asciidoc = asciidoc.replace(/\[(https?:\/\/[^\s\]]+)\]/g, 'link:$1[$1]');
// Convert Wikipedia lists (they use * or # similar to Markdown)
// This is handled similarly to Markdown, so we can reuse that logic
// But Wikipedia also uses : for definition lists and ; for term lists
// For now, we'll handle basic lists and let AsciiDoc handle the rest
// Convert horizontal rules: ---- to AsciiDoc '''
asciidoc = asciidoc.replace(/^----+$/gm, "'''");
return asciidoc;
}
/**
* Converts Markdown to AsciiDoc format
* Based on jumble's conversion patterns
*/
function convertMarkdownToAsciidoc(content) {
let asciidoc = content.replace(/\\n/g, '\n');
// Fix spacing issues (but be careful not to break links and images)
// Process these BEFORE converting links/images to avoid conflicts
asciidoc = asciidoc.replace(/`([^`\n]+)`\s*\(([^)]+)\)/g, '`$1` ($2)');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`([a-zA-Z0-9])/g, '$1 `$2` $3');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`\s*\(/g, '$1 `$2` (');
asciidoc = asciidoc.replace(/\)`([^`\n]+)`([a-zA-Z0-9])/g, ') `$1` $2');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2');
// Add space before == but not if it's part of a markdown link pattern
// Check that == is not immediately after ]( which would be a link
asciidoc = asciidoc.replace(/([a-zA-Z0-9])(?<!\]\()==/g, '$1 ==');
// Note: nostr: addresses are processed later in processNostrAddresses
// Convert headers
asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======');
asciidoc = asciidoc.replace(/^#{5}\s+(.+)$/gm, '===== $1 =====');
asciidoc = asciidoc.replace(/^#{4}\s+(.+)$/gm, '==== $1 ====');
asciidoc = asciidoc.replace(/^#{3}\s+(.+)$/gm, '=== $1 ===');
asciidoc = asciidoc.replace(/^#{2}\s+(.+)$/gm, '== $1 ==');
asciidoc = asciidoc.replace(/^#{1}\s+(.+)$/gm, '= $1 =');
asciidoc = asciidoc.replace(/^==\s+(.+?)\s+==$/gm, '== $1 ==');
asciidoc = asciidoc.replace(/\s==\s+([^=]+?)\s+==\s/g, ' == $1 == ');
// Convert emphasis
asciidoc = asciidoc.replace(/\*\*(.+?)\*\*/g, '*$1*'); // Bold
asciidoc = asciidoc.replace(/__(.+?)__/g, '*$1*'); // Bold
asciidoc = asciidoc.replace(/\*(.+?)\*/g, '_$1_'); // Italic
asciidoc = asciidoc.replace(/_(.+?)_/g, '_$1_'); // Italic
asciidoc = asciidoc.replace(/~~(.+?)~~/g, '[line-through]#$1#'); // Strikethrough
asciidoc = asciidoc.replace(/==(.+?)==/g, '[highlight]#$1#'); // Text highlighting (GFM)
asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript
asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript
// Convert emoji shortcodes to Unicode (e.g., :tent: -> 🏕)
// Only convert if node-emoji is available
if (emoji && emoji.emojify) {
asciidoc = emoji.emojify(asciidoc);
}
// Convert code blocks (handle both \n and \r\n line endings)
// Special handling for diagram languages: latex, plantuml, puml, bpmn
asciidoc = asciidoc.replace(/```(\w+)?\r?\n([\s\S]*?)\r?\n```/g, (_match, lang, code) => {
const trimmedCode = code.trim();
if (trimmedCode.length === 0)
return '';
const langLower = lang ? lang.toLowerCase() : '';
// If it's a latex code block, always treat as code (not math)
if (langLower === 'latex') {
return `[source,latex]\n----\n${trimmedCode}\n----`;
}
// Handle PlantUML diagrams
if (langLower === 'plantuml' || langLower === 'puml') {
// Check if it already has @startuml/@enduml or @startbpmn/@endbpmn
if (trimmedCode.includes('@start') || trimmedCode.includes('@end')) {
return `[plantuml]\n----\n${trimmedCode}\n----`;
}
// If not, wrap it in @startuml/@enduml
return `[plantuml]\n----\n@startuml\n${trimmedCode}\n@enduml\n----`;
}
// Handle BPMN diagrams (using PlantUML BPMN syntax)
if (langLower === 'bpmn') {
// Check if it already has @startbpmn/@endbpmn
if (trimmedCode.includes('@startbpmn') && trimmedCode.includes('@endbpmn')) {
return `[plantuml]\n----\n${trimmedCode}\n----`;
}
// If not, wrap it in @startbpmn/@endbpmn
return `[plantuml]\n----\n@startbpmn\n${trimmedCode}\n@endbpmn\n----`;
}
// Check if it's ABC notation (starts with X:)
if (!lang && /^X:\s*\d+/m.test(trimmedCode)) {
// ABC notation - keep as plain text block, will be processed by music processor
return `----\n${trimmedCode}\n----`;
}
const hasCodePatterns = /[{}();=<>]|function|class|import|export|def |if |for |while |return |const |let |var |public |private |static |console\.log/.test(trimmedCode);
const isLikelyText = /^[A-Za-z\s.,!?\-'"]+$/.test(trimmedCode) && trimmedCode.length > 50;
const hasTooManySpaces = (trimmedCode.match(/\s{3,}/g) || []).length > 3;
const hasMarkdownPatterns = /^#{1,6}\s|^\*\s|^\d+\.\s|^\>\s|^\|.*\|/.test(trimmedCode);
if ((!hasCodePatterns && trimmedCode.length > 100) || isLikelyText || hasTooManySpaces || hasMarkdownPatterns) {
return _match;
}
return `[source${lang ? ',' + lang : ''}]\n----\n${trimmedCode}\n----`;
});
// Handle inline code: LaTeX formulas in inline code should be rendered as math
// Pattern: `$formula$` should become $formula$ (math), not code
// Handle escaped brackets: `$[ ... \]$` and `$[\sqrt{...}\]$`
asciidoc = asciidoc.replace(/`(\$[^`]+\$)`/g, (match, formula) => {
// Extract the formula (remove the $ signs)
const mathContent = formula.slice(1, -1);
return `$${mathContent}$`; // Return as math, not code
});
asciidoc = asciidoc.replace(/`([^`]+)`/g, '`$1`'); // Regular inline code
// Convert nested image links first: [![alt](img)](url) - image wrapped in link
// This must come before regular image processing
asciidoc = asciidoc.replace(/\[!\[([^\]]*)\]\(([^)]+?)\)\]\(([^)]+?)\)/g, (match, alt, imgUrl, linkUrl) => {
const cleanImgUrl = imgUrl.trim();
const cleanLinkUrl = linkUrl.trim();
const cleanAlt = alt.trim();
// Check if linkUrl is a media URL
if (cleanLinkUrl.startsWith('MEDIA:')) {
return cleanLinkUrl; // Return the placeholder as-is
}
// Create a link with an image inside - don't escape brackets in URLs
// AsciiDoc can handle URLs with brackets if they're in the URL part
return `link:${cleanLinkUrl}[image:${cleanImgUrl}[${cleanAlt ? cleanAlt : 'link'}]]`;
});
// Convert images (but not nested ones, which we already processed)
// Match: ![alt text](url) or ![](url) - handle empty alt text
// Use negative lookbehind to avoid matching nested image links
// Format: image::url[alt,width=100%] - matching jumble's format
asciidoc = asciidoc.replace(/(?<!\[)!\[([^\]]*)\]\(([^)]+?)\)/g, (match, alt, url) => {
let processedUrl = url.trim();
const cleanAlt = alt.trim();
// Check if it's already a MEDIA: placeholder (processed by processMediaUrlsInMarkdown)
if (processedUrl.startsWith('MEDIA:')) {
return processedUrl; // Return the placeholder as-is
}
// Clean URL (remove tracking parameters)
processedUrl = cleanUrl(processedUrl);
// Regular image - match jumble's format: image::url[alt,width=100%]
// Don't escape brackets - AsciiDoc handles URLs properly
return `image::${processedUrl}[${cleanAlt ? cleanAlt + ',' : ''}width=100%]`;
});
// Convert anchor links: [text](#section-id) - these are internal links
asciidoc = asciidoc.replace(/(?<!!)\[([^\]]+)\]\(#([^)]+)\)/g, (match, text, anchor) => {
const cleanText = text.trim();
const cleanAnchor = anchor.trim();
// AsciiDoc uses # for anchor links, but we need to normalize the anchor ID
// Convert to lowercase and replace spaces/special chars with hyphens
const normalizedAnchor = cleanAnchor.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '');
const escapedText = cleanText.replace(/([\[\]])/g, '\\$1');
return `<<${normalizedAnchor},${escapedText}>>`;
});
// Convert links (but not images or anchor links, which we already processed)
// Match: [text](url) - use negative lookbehind to avoid matching images
// Use non-greedy matching for URL to stop at first closing paren
// This ensures we don't capture trailing punctuation
asciidoc = asciidoc.replace(/(?<!!)\[([^\]]+)\]\(([^)]+?)\)/g, (match, text, url) => {
let processedUrl = url.trim();
const cleanText = text.trim();
// Check if it's already a MEDIA: placeholder (processed by processMediaUrlsInMarkdown)
if (processedUrl.startsWith('MEDIA:')) {
return processedUrl; // Return the placeholder as-is
}
// Clean URL (remove tracking parameters)
processedUrl = cleanUrl(processedUrl);
// Handle WSS URLs: convert wss:// to https:// for display
if (processedUrl.startsWith('wss://')) {
processedUrl = processedUrl.replace(/^wss:\/\//, 'https://');
}
// Regular link - don't escape brackets in URLs (AsciiDoc handles them)
// Only escape brackets in the link text if needed
const escapedText = cleanText.replace(/([\[\]])/g, '\\$1');
return `link:${processedUrl}[${escapedText}]`;
});
// Convert horizontal rules
asciidoc = asciidoc.replace(/^---$/gm, '\'\'\'');
asciidoc = asciidoc.replace(/^\*\*\*$/gm, '\'\'\''); // Also handle ***
// Convert lists - need to process them as blocks to preserve structure
// First, convert task lists (before regular lists)
// Task lists: - [x] or - [ ] or * [x] or * [ ]
asciidoc = asciidoc.replace(/^(\s*)([-*])\s+\[([ x])\]\s+(.+)$/gm, (_match, indent, bullet, checked, text) => {
// Use AsciiDoc checkbox syntax: * [x] Task text
// The checkbox will be rendered by AsciiDoctor
return `${indent}* [${checked === 'x' ? 'x' : ' '}] ${text}`;
});
// Convert lists - process entire list blocks to ensure proper AsciiDoc formatting
// AsciiDoc lists need to be on their own lines with proper spacing
// Process lists in blocks to handle nested lists correctly
const lines = asciidoc.split('\n');
const processedLines = [];
let inList = false;
let listType = null;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const isEmpty = line.trim() === '';
const prevLine = i > 0 ? processedLines[processedLines.length - 1] : '';
const prevLineIsEmpty = prevLine.trim() === '';
// Check if this line is a list item (but not a task list, which we already processed)
const unorderedMatch = line.match(/^(\s*)([-*+])\s+(.+)$/);
const orderedMatch = line.match(/^(\s*)(\d+)\.\s+(.+)$/);
const isTaskList = line.match(/^(\s*)([-*])\s+\[([ x])\]\s+(.+)$/);
if (unorderedMatch && !isTaskList) {
const [, indent, , text] = unorderedMatch;
const indentLevel = indent.length;
// AsciiDoc uses 4 spaces per indentation level
// Markdown typically uses 2 or 4 spaces per level
// 2 spaces = 1 level (4 spaces), 4 spaces = 1 level (4 spaces)
const asciidocIndent = ' '.repeat(Math.ceil(indentLevel / 4));
// Add blank line before list if not already in a list
// But don't add blank line if we're switching list types within the same list context
if (!inList) {
// Starting a new list - add blank line if previous line has content
if (processedLines.length > 0 && !prevLineIsEmpty) {
processedLines.push('');
}
inList = true;
listType = 'unordered';
}
else if (listType !== 'unordered') {
// Switching list types - don't add blank line, just change type
listType = 'unordered';
}
processedLines.push(`${asciidocIndent}* ${text}`);
}
else if (orderedMatch) {
const [, indent, , text] = orderedMatch;
const indentLevel = indent.length;
// AsciiDoc uses 4 spaces per indentation level
// Markdown typically uses 2 or 4 spaces per level
// 2 spaces = 1 level (4 spaces), 4 spaces = 1 level (4 spaces)
const asciidocIndent = ' '.repeat(Math.ceil(indentLevel / 4));
// Add blank line before list if not already in a list
// But don't add blank line if we're switching list types within the same list context
if (!inList) {
// Starting a new list - add blank line if previous line has content
if (processedLines.length > 0 && !prevLineIsEmpty) {
processedLines.push('');
}
inList = true;
listType = 'ordered';
}
else if (listType !== 'ordered') {
// Switching list types - don't add blank line, just change type
listType = 'ordered';
}
processedLines.push(`${asciidocIndent}. ${text}`);
}
else {
// Not a list item
if (inList && !isEmpty) {
// End of list - add blank line after if the next line is not empty
if (i < lines.length - 1 && lines[i + 1].trim() !== '') {
processedLines.push('');
}
inList = false;
listType = null;
}
processedLines.push(line);
}
}
asciidoc = processedLines.join('\n');
// Convert blockquotes with attribution
asciidoc = asciidoc.replace(/^(>\s+.+(?:\n>\s+.+)*)/gm, (match) => {
const lines = match.split('\n').map(line => line.replace(/^>\s*/, ''));
let quoteBodyLines = [];
let attributionLine;
for (let i = lines.length - 1; i >= 0; i--) {
const line = lines[i].trim();
if (line.startsWith('—') || line.startsWith('--')) {
attributionLine = line;
quoteBodyLines = lines.slice(0, i);
break;
}
}
const quoteContent = quoteBodyLines.filter(l => l.trim() !== '').join('\n').trim();
if (attributionLine) {
let cleanedAttribution = attributionLine.replace(/^[—-]+/, '').trim();
let author = '';
let source = '';
const linkMatch = cleanedAttribution.match(/^(.*?),?\s*link:([^[\\]]+)\[([^\\]]+)\]$/);
if (linkMatch) {
author = linkMatch[1].trim();
source = `link:${linkMatch[2].trim()}[${linkMatch[3].trim()}]`;
}
else {
const parts = cleanedAttribution.split(',').map(p => p.trim());
author = parts[0];
if (parts.length > 1) {
source = parts.slice(1).join(', ').trim();
}
}
return `[quote, ${author}, ${source}]\n____\n${quoteContent}\n____`;
}
else {
return `____\n${quoteContent}\n____`;
}
});
// Convert tables with alignment support
asciidoc = asciidoc.replace(/(\|.*\|[\r\n]+\|[\s\-\|:]*[\r\n]+(\|.*\|[\r\n]+)*)/g, (match) => {
const lines = match.trim().split('\n').filter(line => line.trim());
if (lines.length < 2)
return match;
const headerRow = lines[0];
const separatorRow = lines[1];
const dataRows = lines.slice(2);
if (!separatorRow.includes('-'))
return match;
// Parse alignment from separator row
// :--- = left, :----: = center, ---: = right, --- = default
const cells = separatorRow.split('|').filter(c => c.trim());
const alignments = [];
cells.forEach((cell, index) => {
const trimmed = cell.trim();
if (trimmed.startsWith(':') && trimmed.endsWith(':')) {
alignments[index] = '^'; // center (AsciiDoc uses ^ for center)
}
else if (trimmed.endsWith(':')) {
alignments[index] = '>'; // right
}
else if (trimmed.startsWith(':')) {
alignments[index] = '<'; // left (explicit)
}
else {
alignments[index] = '<'; // default left
}
});
// Build cols attribute with alignments
const colsAttr = alignments.length > 0
? `[cols="${alignments.join(',')}"]`
: '';
let tableAsciidoc = colsAttr ? `${colsAttr}\n` : '';
tableAsciidoc += '|===\n';
tableAsciidoc += headerRow + '\n';
dataRows.forEach(row => {
tableAsciidoc += row + '\n';
});
tableAsciidoc += '|===';
return tableAsciidoc;
});
// Convert footnotes
const footnoteDefinitions = {};
let tempAsciidoc = asciidoc;
tempAsciidoc = tempAsciidoc.replace(/^\[\^([^\]]+)\]:\s*([\s\S]*?)(?=\n\[\^|\n---|\n##|\n###|\n####|\n#####|\n######|$)/gm, (_, id, text) => {
footnoteDefinitions[id] = text.trim();
return '';
});
asciidoc = tempAsciidoc.replace(/\[\^([^\]]+)\]/g, (match, id) => {
if (footnoteDefinitions[id]) {
return `footnote:[${footnoteDefinitions[id]}]`;
}
return match;
});
return asciidoc;
}
/**
* Converts plain text to AsciiDoc format
* Preserves line breaks by converting single newlines to line continuations
*/
function convertPlainTextToAsciidoc(content) {
// Preserve double newlines (paragraph breaks)
// Convert single newlines to line continuations ( +\n)
return content
.replace(/\r\n/g, '\n') // Normalize line endings
.replace(/\n\n+/g, '\n\n') // Normalize multiple newlines to double
.replace(/([^\n])\n([^\n])/g, '$1 +\n$2'); // Single newlines become line continuations
}
/**
* Normalizes text to d-tag format
*/
function normalizeDtag(text) {
return text
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-+|-+$/g, '');
}
/**
* Processes wikilinks: [[target]] or [[target|display text]]
* Converts to WIKILINK: placeholder format to protect from AsciiDoc processing
*/
function processWikilinks(content, linkBaseURL) {
// Process bookstr macro wikilinks: [[book::...]]
content = content.replace(/\[\[book::([^\]]+)\]\]/g, (_match, bookContent) => {
const cleanContent = bookContent.trim();
return `BOOKSTR:${cleanContent}`;
});
// Process standard wikilinks: [[Target Page]] or [[target page|see this]]
// Use placeholder format to prevent AsciiDoc from processing the brackets
content = content.replace(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g, (_match, target, displayText) => {
const cleanTarget = target.trim();
const cleanDisplay = displayText ? displayText.trim() : cleanTarget;
const dTag = normalizeDtag(cleanTarget);
// Use placeholder format: WIKILINK:dtag|display
// This prevents AsciiDoc from interpreting the brackets
return `WIKILINK:${dTag}|${cleanDisplay}`;
});
return content;
}
/**
* Processes nostr: addresses
* Only processes addresses with "nostr:" prefix - bare addresses are left as plaintext
* Converts to link:nostr:...[...] format
* Valid bech32 prefixes: npub, nprofile, nevent, naddr, note
*/
function processNostrAddresses(content, linkBaseURL) {
// Match nostr: followed by valid bech32 prefix and identifier
// Bech32 format: prefix + separator (1) + data (at least 6 chars for valid identifiers)
// Only match if it has "nostr:" prefix - bare addresses should remain as plaintext
const nostrPattern = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
return content.replace(nostrPattern, (_match, bech32Id) => {
return `link:nostr:${bech32Id}[${bech32Id}]`;
});
}
/**
* Processes media URLs in markdown links and images
* Converts them to MEDIA: placeholders before markdown conversion
*/
function processMediaUrlsInMarkdown(content) {
let processed = content;
// Process YouTube URLs in markdown links: [text](youtube-url)
processed = processed.replace(/\[([^\]]+)\]\((?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:watch\?v=|embed\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?\)/gi, (_match, text, videoId) => {
return `MEDIA:youtube:${videoId}`;
});
// Process Spotify URLs in markdown links: [text](spotify-url)
processed = processed.replace(/\[([^\]]+)\]\((?:https?:\/\/)?(?:open\.)?spotify\.com\/(track|album|playlist|artist|episode|show)\/([a-zA-Z0-9]+)(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?\)/gi, (_match, text, type, id) => {
return `MEDIA:spotify:${type}:${id}`;
});
// Process video files in markdown links/images: [text](video-url) or ![alt](video-url)
processed = processed.replace(/[!]?\[([^\]]*)\]\((https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv))(?:\?[^\s<>"{}|\\^`\[\]()]*)?\)/gi, (_match, altOrText, url) => {
const cleanUrl = url.replace(/\?.*$/, ''); // Remove query params
return `MEDIA:video:${cleanUrl}`;
});
// Process audio files in markdown links/images: [text](audio-url) or ![alt](audio-url)
processed = processed.replace(/[!]?\[([^\]]*)\]\((https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp3|m4a|ogg|wav|flac|aac|opus|wma))(?:\?[^\s<>"{}|\\^`\[\]()]*)?\)/gi, (_match, altOrText, url) => {
const cleanUrl = url.replace(/\?.*$/, ''); // Remove query params
return `MEDIA:audio:${cleanUrl}`;
});
return processed;
}
/**
* Processes media URLs (YouTube, Spotify, video, audio files) in bare URLs
* Converts them to placeholders that will be rendered as embeds/players
*/
function processMediaUrls(content) {
// Process YouTube URLs
// Match: youtube.com/watch?v=, youtu.be/, youtube.com/embed/, youtube.com/v/
content = content.replace(/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:watch\?v=|embed\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?/gi, (match, videoId) => {
return `MEDIA:youtube:${videoId}`;
});
// Process Spotify URLs
// Match: open.spotify.com/track/, open.spotify.com/album/, open.spotify.com/playlist/, open.spotify.com/artist/
content = content.replace(/(?:https?:\/\/)?(?:open\.)?spotify\.com\/(track|album|playlist|artist|episode|show)\/([a-zA-Z0-9]+)(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?/gi, (match, type, id) => {
return `MEDIA:spotify:${type}:${id}`;
});
// Process video files (mp4, webm, ogg, m4v, mov, avi, etc.)
content = content.replace(/(?:https?:\/\/[^\s<>"{}|\\^`\[\]()]+)\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv)(?:\?[^\s<>"{}|\\^`\[\]()]*)?/gi, (match, ext) => {
const url = match.replace(/\?.*$/, ''); // Remove query params for cleaner URL
return `MEDIA:video:${url}`;
});
// Process audio files (mp3, m4a, ogg, wav, flac, aac, etc.)
content = content.replace(/(?:https?:\/\/[^\s<>"{}|\\^`\[\]()]+)\.(mp3|m4a|ogg|wav|flac|aac|opus|wma)(?:\?[^\s<>"{}|\\^`\[\]()]*)?/gi, (match, ext) => {
const url = match.replace(/\?.*$/, ''); // Remove query params for cleaner URL
return `MEDIA:audio:${url}`;
});
return content;
}
/**
* Processes bare URLs and converts them to AsciiDoc links
* Matches http://, https://, wss://, and www. URLs that aren't already in markdown links
* Also handles bare image URLs (converts to images)
* Skips URLs inside code blocks (---- blocks) and inline code (backticks)
*/
function processBareUrls(content) {
// Protect code blocks and inline code from URL processing
// We'll process URLs, then restore code blocks
const codeBlockPlaceholders = [];
const inlineCodePlaceholders = [];
// Replace code blocks with placeholders
content = content.replace(/\[source[^\]]*\]\n----\n([\s\S]*?)\n----/g, (match, code) => {
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
codeBlockPlaceholders.push(match);
return placeholder;
});
// Also handle plain code blocks (without [source])
content = content.replace(/----\n([\s\S]*?)\n----/g, (match, code) => {
// Check if this is already a placeholder
if (match.includes('__CODEBLOCK_')) {
return match;
}
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
codeBlockPlaceholders.push(match);
return placeholder;
});
// Replace inline code with placeholders
content = content.replace(/`([^`]+)`/g, (match, code) => {
const placeholder = `__INLINECODE_${inlineCodePlaceholders.length}__`;
inlineCodePlaceholders.push(match);
return placeholder;
});
// First, handle bare image URLs (before regular URLs)
// Match image URLs: .jpg, .png, .gif, .webp, .svg, etc.
// Format: image::url[width=100%] - matching jumble's format
const imageUrlPattern = /(?<!\]\()\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(jpe?g|png|gif|webp|svg|bmp|ico))(?:\?[^\s<>"{}|\\^`\[\]()]*)?/gi;
content = content.replace(imageUrlPattern, (match, url) => {
// Clean URL (remove tracking parameters)
const cleanedUrl = cleanUrl(url);
// Don't escape brackets - AsciiDoc handles URLs properly
return `image::${cleanedUrl}[width=100%]`;
});
// Match URLs that aren't already in markdown link format
// Pattern: http://, https://, wss://, or www. followed by valid URL characters
// Use word boundary to avoid matching URLs that are part of other text
// Don't match if immediately after colon-space (like "hyperlink: www.example.com")
const urlPattern = /(?<!\]\()(?<!:\s)\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+|wss:\/\/[^\s<>"{}|\\^`\[\]()]+|www\.[^\s<>"{}|\\^`\[\]()]+)/gi;
content = content.replace(urlPattern, (match, url) => {
// Skip if this URL was already converted to an image
if (match.includes('image::')) {
return match;
}
// Ensure URL starts with http:// or https://
let fullUrl = url;
if (url.startsWith('www.')) {
fullUrl = 'https://' + url;
}
else if (url.startsWith('wss://')) {
// Convert wss:// to https:// for display
fullUrl = url.replace(/^wss:\/\//, 'https://');
}
// Clean URL (remove tracking parameters)
fullUrl = cleanUrl(fullUrl);
// Don't escape brackets in URLs - AsciiDoc handles them properly
// The URL is in the link: part, brackets in URLs are valid
// Use proper AsciiDoc link syntax: link:url[text]
return `link:${fullUrl}[${url}]`;
});
// Restore inline code
inlineCodePlaceholders.forEach((code, index) => {
content = content.replace(`__INLINECODE_${index}__`, code);
});
// Restore code blocks
codeBlockPlaceholders.forEach((code, index) => {
content = content.replace(`__CODEBLOCK_${index}__`, code);
});
return content;
}
/**
* Processes hashtags
* Converts to hashtag:tag[#tag] format
* Handles hashtags at the beginning of lines to prevent line breaks
*/
function processHashtags(content) {
// Match # followed by word characters
// Match at word boundary OR at start of line OR after whitespace
// This ensures we don't match # in URLs or code, but do match at line start
return content.replace(/(^|\s|>)#([a-zA-Z0-9_]+)(?![a-zA-Z0-9_])/g, (match, before, hashtag) => {
const normalizedHashtag = hashtag.toLowerCase();
// Preserve the space or line start before the hashtag to prevent line breaks
// Add a zero-width space or ensure proper spacing
const prefix = before === '' ? '' : before;
return `${prefix}hashtag:${normalizedHashtag}[#${hashtag}]`;
});
}

330
src/converters/to-asciidoc.ts

@ -1,330 +0,0 @@ @@ -1,330 +0,0 @@
import { ContentFormat } from '../types';
export interface ConvertOptions {
enableNostrAddresses?: boolean;
}
/**
* Converts content from various formats (Markdown, Wikipedia, Plain) to AsciiDoc
*
* Processing order:
* 1. Convert special syntax (wikilinks, hashtags, nostr links) to placeholders
* 2. Process media URLs (YouTube, Spotify, video, audio)
* 3. Process images (Markdown and bare URLs)
* 4. Process links (Markdown and bare URLs)
* 5. Clean URLs (remove tracking parameters)
*/
export function convertToAsciidoc(
content: string,
format: ContentFormat,
linkBaseURL?: string,
options: ConvertOptions = {}
): string {
let processed = content;
// Step 1: Convert special syntax to placeholders (before other processing)
processed = convertWikilinks(processed);
processed = convertHashtags(processed);
if (options.enableNostrAddresses !== false) {
processed = convertNostrLinks(processed);
}
// Step 2: Process media URLs (before link processing to avoid conflicts)
processed = processMediaUrls(processed);
// Step 3: Process images (before links to avoid conflicts)
processed = processImages(processed, format);
// Step 4: Process links (Markdown and bare URLs)
processed = processLinks(processed, format);
// Step 5: Convert format-specific syntax
if (format === ContentFormat.Markdown) {
processed = convertMarkdownToAsciidoc(processed);
} else if (format === ContentFormat.Wikipedia) {
processed = convertWikipediaToAsciidoc(processed);
}
return processed;
}
/**
* Convert wikilinks [[target]] or [[target|display]] to WIKILINK:dtag|display
*/
function convertWikilinks(content: string): string {
return content.replace(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g, (_match, target, display) => {
const dtag = normalizeDtag(target.trim());
const displayText = display ? display.trim() : target.trim();
return `WIKILINK:${dtag}|${displayText}`;
});
}
/**
* Normalize dtag (lowercase, replace spaces with hyphens)
*/
function normalizeDtag(dtag: string): string {
return dtag.toLowerCase().replace(/\s+/g, '-');
}
/**
* Convert hashtags #topic to hashtag:topic[topic]
* Skip hashtags in URLs, code blocks, and inline code
*/
function convertHashtags(content: string): string {
// Protect code blocks
const codeBlocks: string[] = [];
content = content.replace(/```[\s\S]*?```/g, (match) => {
const placeholder = `__CODEBLOCK_${codeBlocks.length}__`;
codeBlocks.push(match);
return placeholder;
});
// Protect inline code
const inlineCode: string[] = [];
content = content.replace(/`[^`]+`/g, (match) => {
const placeholder = `__INLINECODE_${inlineCode.length}__`;
inlineCode.push(match);
return placeholder;
});
// Convert hashtags (not in URLs)
content = content.replace(/(?<!https?:\/\/[^\s]*)#([a-zA-Z0-9_]+)/g, (_match, topic) => {
const normalized = topic.toLowerCase();
return `hashtag:${normalized}[#${topic}]`;
});
// Restore inline code
inlineCode.forEach((code, index) => {
content = content.replace(`__INLINECODE_${index}__`, code);
});
// Restore code blocks
codeBlocks.forEach((block, index) => {
content = content.replace(`__CODEBLOCK_${index}__`, block);
});
return content;
}
/**
* Convert nostr: links to link:nostr:...[...]
*/
function convertNostrLinks(content: string): string {
// Match nostr:npub1..., nostr:note1..., etc.
return content.replace(/nostr:([a-z0-9]+[a-z0-9]{50,})/gi, (match, bech32Id) => {
// Extract display text (first few chars)
const display = bech32Id.substring(0, 8) + '...';
return `link:nostr:${bech32Id}[${display}]`;
});
}
/**
* Process media URLs and convert to MEDIA: placeholders
*/
function processMediaUrls(content: string): string {
let processed = content;
// YouTube URLs
processed = processed.replace(
/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]+)/g,
(_match, videoId) => `MEDIA:youtube:${videoId}`
);
// Spotify URLs
processed = processed.replace(
/(?:https?:\/\/)?(?:open\.)?spotify\.com\/(track|album|playlist|artist|episode|show)\/([a-zA-Z0-9]+)/g,
(_match, type, id) => `MEDIA:spotify:${type}:${id}`
);
// Video files
processed = processed.replace(
/(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv))/gi,
(_match, url) => `MEDIA:video:${url}`
);
// Audio files
processed = processed.replace(
/(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp3|m4a|wav|flac|aac|opus|wma|ogg))/gi,
(_match, url) => `MEDIA:audio:${url}`
);
return processed;
}
/**
* Process images (Markdown syntax and bare URLs)
*/
function processImages(content: string, format: ContentFormat): string {
let processed = content;
// Markdown image syntax: ![alt](url)
processed = processed.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
const cleanedUrl = cleanUrl(url);
const cleanAlt = alt.trim();
return `image::${cleanedUrl}[${cleanAlt ? cleanAlt + ',' : ''}width=100%]`;
});
// Bare image URLs (only if not already in a link or image tag)
if (format === ContentFormat.Markdown || format === ContentFormat.Plain) {
const imageUrlPattern = /(?<!\]\()(?<!image::)(?<!link:)(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(jpeg|jpg|png|gif|webp|svg))/gi;
processed = processed.replace(imageUrlPattern, (match, url) => {
const cleanedUrl = cleanUrl(url);
return `image::${cleanedUrl}[width=100%]`;
});
}
return processed;
}
/**
* Process links (Markdown syntax and bare URLs)
*/
function processLinks(content: string, format: ContentFormat): string {
let processed = content;
// Markdown link syntax: [text](url)
processed = processed.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
// Skip if this is already processed as an image
if (text.startsWith('!')) {
return _match;
}
const cleanedUrl = cleanUrl(url);
return `link:${cleanedUrl}[${text}]`;
});
// Bare URLs (only for Markdown and Plain formats)
if (format === ContentFormat.Markdown || format === ContentFormat.Plain) {
processed = processBareUrls(processed);
}
return processed;
}
/**
* Process bare URLs and convert to link: macros
* Handles http://, https://, www., and wss:// URLs
*/
function processBareUrls(content: string): string {
// URL pattern: matches http://, https://, www., and wss://
// Negative lookbehind to avoid matching URLs after ":" (e.g., "hyperlink: www.example.com")
const urlPattern = /(?<!:\s)(?<!\]\()\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+|wss:\/\/[^\s<>"{}|\\^`\[\]()]+|www\.[^\s<>"{}|\\^`\[\]()]+)/gi;
return content.replace(urlPattern, (match, url) => {
// Skip if already in a link or image macro
if (match.includes('link:') || match.includes('image::')) {
return match;
}
let fullUrl = url;
let displayText = url;
// Handle www. URLs
if (url.startsWith('www.')) {
fullUrl = 'https://' + url;
displayText = url;
}
// Handle wss:// URLs - convert to https:// for the link, but keep wss:// in display
else if (url.startsWith('wss://')) {
fullUrl = url.replace(/^wss:\/\//, 'https://');
displayText = url; // Keep wss:// in display text
}
// Clean the URL (remove tracking parameters)
fullUrl = cleanUrl(fullUrl);
// Create AsciiDoc link macro
return `link:${fullUrl}[${displayText}]`;
});
}
/**
* Clean URL by removing tracking parameters
*/
function cleanUrl(url: string): string {
try {
const parsedUrl = new URL(url);
// List of tracking parameters to remove
const trackingParams = [
// Google Analytics & Ads
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
// Facebook
'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
// Twitter/X
'twclid', 'twsrc',
// Microsoft/Bing
'msclkid', 'mc_cid', 'mc_eid',
// Adobe
'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
// Mailchimp
'mc_cid', 'mc_eid',
// HubSpot
'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
// Marketo
'mkt_tok',
// YouTube
'si', 'feature', 'kw', 'pp',
// Other common tracking
'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
// Mobile app tracking
'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
// Amazon
'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
// Affiliate tracking
'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
// Social media share tracking
'share', 'shared', 'sharesource'
];
// Remove all tracking parameters
trackingParams.forEach(param => {
parsedUrl.searchParams.delete(param);
});
// Remove any parameter that starts with utm_ or _
Array.from(parsedUrl.searchParams.keys()).forEach(key => {
if (key.startsWith('utm_') || key.startsWith('_')) {
parsedUrl.searchParams.delete(key);
}
});
return parsedUrl.toString();
} catch {
// If URL parsing fails, return original URL
return url;
}
}
/**
* Convert Markdown-specific syntax to AsciiDoc
*/
function convertMarkdownToAsciidoc(content: string): string {
// Most Markdown syntax is handled by AsciiDoctor's markdown support
// This function can be extended for additional conversions if needed
return content;
}
/**
* Convert Wikipedia-specific syntax to AsciiDoc
*/
function convertWikipediaToAsciidoc(content: string): string {
// Wikipedia-specific conversions can be added here
return content;
}

70
src/detector.js

@ -1,70 +0,0 @@ @@ -1,70 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.detectFormat = detectFormat;
const types_1 = require("./types");
/**
* Detects the content format based on content patterns
*/
function detectFormat(content) {
// Check for AsciiDoc indicators
const asciidocIndicators = [
'= ', // Title
'== ', // Section
'=== ', // Subsection
'include::', // Include directive
'image::', // Image block
'[source', // Source block
'----', // Listing block
'....', // Literal block
'|===', // Table
'link:', // AsciiDoc link format
'wikilink:', // Wikilink macro
'hashtag:', // Hashtag macro
];
let asciidocScore = 0;
for (const indicator of asciidocIndicators) {
if (content.includes(indicator)) {
asciidocScore++;
}
}
// Check for Wikipedia markup indicators (== Heading == format)
const wikipediaIndicators = [
/^==+\s+.+?\s+==+$/m, // Wikipedia headings: == Heading ==
/\[\[[^\]]+\]\]/, // Wikipedia links: [[Page]]
/''[^']+''/, // Wikipedia bold: ''text''
/'[^']+'/, // Wikipedia italic: 'text'
];
let wikipediaScore = 0;
for (const indicator of wikipediaIndicators) {
if (indicator.test(content)) {
wikipediaScore++;
}
}
// Check for Markdown indicators (more specific patterns to avoid false positives)
const markdownIndicators = [
/^#{1,6}\s+/m, // Heading at start of line
/```[\s\S]*?```/, // Code block
/\*\*[^*]+\*\*/, // Bold text
/^[-*+]\s+/m, // List item at start of line
/!\[[^\]]*\]\([^)]+\)/, // Image syntax
/\[[^\]]+\]\([^)]+\)/, // Link syntax
];
let markdownScore = 0;
for (const indicator of markdownIndicators) {
if (indicator.test(content)) {
markdownScore++;
}
}
// Determine format based on scores
// Wikipedia format takes precedence if detected (it's more specific)
if (wikipediaScore > 0 && wikipediaScore >= 2) {
return types_1.ContentFormat.Wikipedia;
}
else if (asciidocScore > markdownScore && asciidocScore >= 2) {
return types_1.ContentFormat.AsciiDoc;
}
else if (markdownScore > 0) {
return types_1.ContentFormat.Markdown;
}
return types_1.ContentFormat.Plain;
}

101
src/detector.ts

@ -4,70 +4,55 @@ import { ContentFormat } from './types'; @@ -4,70 +4,55 @@ import { ContentFormat } from './types';
* Detects the content format based on content patterns
*/
export function detectFormat(content: string): ContentFormat {
// Check for AsciiDoc indicators
const asciidocIndicators = [
'= ', // Title
'== ', // Section
'=== ', // Subsection
'include::', // Include directive
'image::', // Image block
'[source', // Source block
'----', // Listing block
'....', // Literal block
'|===', // Table
'link:', // AsciiDoc link format
'wikilink:', // Wikilink macro
'hashtag:', // Hashtag macro
];
let asciidocScore = 0;
for (const indicator of asciidocIndicators) {
if (content.includes(indicator)) {
asciidocScore++;
}
}
// Check for Wikipedia markup indicators (== Heading == format)
const wikipediaIndicators = [
/^==+\s+.+?\s+==+$/m, // Wikipedia headings: == Heading ==
/\[\[[^\]]+\]\]/, // Wikipedia links: [[Page]]
/''[^']+''/, // Wikipedia bold: ''text''
/'[^']+'/, // Wikipedia italic: 'text'
];
let wikipediaScore = 0;
for (const indicator of wikipediaIndicators) {
if (indicator.test(content)) {
wikipediaScore++;
}
if (!content || content.trim().length === 0) {
return ContentFormat.Plain;
}
// Check for Markdown indicators (more specific patterns to avoid false positives)
const markdownIndicators = [
/^#{1,6}\s+/m, // Heading at start of line
/```[\s\S]*?```/, // Code block
/\*\*[^*]+\*\*/, // Bold text
/^[-*+]\s+/m, // List item at start of line
/!\[[^\]]*\]\([^)]+\)/, // Image syntax
/\[[^\]]+\]\([^)]+\)/, // Link syntax
];
const trimmed = content.trim();
let markdownScore = 0;
for (const indicator of markdownIndicators) {
if (indicator.test(content)) {
markdownScore++;
}
// Check for AsciiDoc indicators
// - Document title: = Title
// - Section headers: ==, ===, etc.
// - AsciiDoc attributes: :attribute: value
// - AsciiDoc blocks: [source,lang], [abc], [plantuml]
// - AsciiDoc macros: image::, video::, audio::, link:
if (
/^=+\s+/.test(trimmed) ||
/^:[\w-]+:/.test(trimmed) ||
/\[source,[\w-]+\]/.test(content) ||
/\[abc\]/.test(content) ||
/\[plantuml\]/.test(content) ||
/image::/.test(content) ||
/video::/.test(content) ||
/audio::/.test(content) ||
/link:/.test(content) ||
/\[cols=/.test(content) ||
/\|\|===/.test(content) ||
/footnote:\[/.test(content) ||
/\[highlight\]/.test(content) ||
/\[line-through\]/.test(content) ||
/\[quote\]/.test(content)
) {
return ContentFormat.AsciiDoc;
}
// Determine format based on scores
// Wikipedia format takes precedence if detected (it's more specific)
if (wikipediaScore > 0 && wikipediaScore >= 2) {
return ContentFormat.Wikipedia;
} else if (asciidocScore > markdownScore && asciidocScore >= 2) {
return ContentFormat.AsciiDoc;
} else if (markdownScore > 0) {
// Check for Markdown indicators
// - YAML frontmatter: --- at start
// - Markdown headers: #, ##, etc.
// - Markdown code blocks: ```lang
// - Markdown links: [text](url)
// - Markdown images: ![alt](url)
if (
/^---\s*$/.test(trimmed.split('\n')[0]) ||
/^#{1,6}\s+/.test(trimmed) ||
/^```[\w-]*/.test(trimmed) ||
/\[.*?\]\(.*?\)/.test(content) ||
/!\[.*?\]\(.*?\)/.test(content) ||
/^\|\s*\|/.test(trimmed) ||
/^>\s+/.test(trimmed)
) {
return ContentFormat.Markdown;
}
return ContentFormat.Plain;
return ContentFormat.Unknown;
}

160
src/extractors/frontmatter.js

@ -1,160 +0,0 @@ @@ -1,160 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractFrontmatter = extractFrontmatter;
/**
* Extracts front matter from content
* Handles both YAML front matter (--- ... ---) and AsciiDoc document header attributes (:key: value)
* Returns the front matter object and the content
* For YAML: removes front matter from content
* For AsciiDoc: removes header from content and extracts as metadata (prevents header from appearing in rendered output)
*/
function extractFrontmatter(content) {
// First, try to match YAML front matter: ---\n...\n---
const yamlFrontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/;
const yamlMatch = content.match(yamlFrontmatterRegex);
if (yamlMatch) {
const yamlContent = yamlMatch[1];
const contentWithoutFrontmatter = yamlMatch[2];
// Simple YAML parser for basic key-value pairs and arrays
// This is a basic implementation - for complex YAML, consider using a library
const frontmatter = {};
const lines = yamlContent.split('\n');
let currentKey = null;
let inArray = false;
let arrayKey = null;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const trimmed = line.trim();
// Skip empty lines and comments
if (!trimmed || trimmed.startsWith('#')) {
if (inArray && trimmed === '') {
// Empty line might end the array
inArray = false;
arrayKey = null;
}
continue;
}
// Array item (line starting with -)
if (trimmed.startsWith('- ')) {
const item = trimmed.substring(2).trim();
const cleanItem = item.replace(/^["']|["']$/g, '');
if (arrayKey && frontmatter[arrayKey]) {
frontmatter[arrayKey].push(cleanItem);
}
else if (currentKey) {
// Start new array
arrayKey = currentKey;
inArray = true;
frontmatter[currentKey] = [cleanItem];
}
continue;
}
// Key-value pair
const keyValueMatch = trimmed.match(/^(\w+):\s*(.+)$/);
if (keyValueMatch) {
const key = keyValueMatch[1];
let value = keyValueMatch[2].trim();
// Remove quotes if present
if ((value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))) {
value = value.slice(1, -1);
}
frontmatter[key] = value;
currentKey = key;
inArray = false;
arrayKey = null;
continue;
}
}
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutFrontmatter };
}
// If no YAML front matter, try to extract AsciiDoc document header attributes
// AsciiDoc format: = Title\nAuthor\nRevision\n:attribute: value\n...
// Match header lines until we hit a blank line (which separates header from body)
// The header consists of: title line, optional author/revision lines, and attribute lines
const lines = content.split('\n');
let headerEndIndex = 0;
// Find where the header ends (first blank line after title/attributes)
if (lines[0] && lines[0].match(/^=+\s+/)) {
// We have a title line, now find where header ends
let i = 1;
// Skip author and revision lines (non-empty lines that don't start with :)
while (i < lines.length && lines[i].trim() && !lines[i].trim().startsWith(':')) {
i++;
}
// Now skip attribute lines (lines starting with :)
while (i < lines.length && lines[i].trim().startsWith(':')) {
i++;
}
// Skip the blank line that separates header from body
if (i < lines.length && lines[i].trim() === '') {
i++;
}
headerEndIndex = i;
}
// If we found a header, extract it
if (headerEndIndex > 0) {
const headerLines = lines.slice(0, headerEndIndex);
const headerContent = headerLines.join('\n');
const contentWithoutHeader = lines.slice(headerEndIndex).join('\n');
const frontmatter = {};
const headerLinesArray = headerContent.split('\n');
// Extract title (first line starting with =)
const titleMatch = headerLinesArray[0].match(/^=+\s+(.+)$/);
if (titleMatch) {
frontmatter.title = titleMatch[1].trim();
}
// Extract author (line after title, if it doesn't start with :)
if (headerLinesArray.length > 1 && !headerLinesArray[1].trim().startsWith(':')) {
const authorLine = headerLinesArray[1].trim();
if (authorLine && !authorLine.match(/^[\d.,\s:]+$/)) {
// Not a revision line (which has numbers, commas, colons)
frontmatter.author = authorLine;
}
}
// Extract revision (line with version, date, remark format: "2.9, October 31, 2021: Fall incarnation")
for (let i = 1; i < headerLinesArray.length; i++) {
const line = headerLinesArray[i].trim();
if (line.match(/^[\d.,\s:]+$/)) {
// This looks like a revision line
const revisionMatch = line.match(/^([^,]+),\s*([^:]+)(?::\s*(.+))?$/);
if (revisionMatch) {
frontmatter.version = revisionMatch[1].trim();
frontmatter.date = revisionMatch[2].trim();
if (revisionMatch[3]) {
frontmatter.revision = revisionMatch[3].trim();
}
}
break;
}
}
// Extract AsciiDoc attributes (:key: value)
for (const line of headerLinesArray) {
const trimmed = line.trim();
if (trimmed.startsWith(':') && trimmed.includes(':')) {
const attrMatch = trimmed.match(/^:([^:]+):\s*(.+)$/);
if (attrMatch) {
const key = attrMatch[1].trim();
let value = attrMatch[2].trim();
// Remove quotes if present
if ((value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))) {
value = value.slice(1, -1);
}
// Handle comma-separated values (like keywords)
if (value.includes(',') && !value.includes(' ')) {
frontmatter[key] = value.split(',').map((v) => v.trim());
}
else {
frontmatter[key] = value;
}
}
}
}
// For AsciiDoc, remove the header from content to prevent it from appearing in rendered output
// AsciiDoctor can work without the header, and we've already extracted the metadata
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutHeader };
}
// No front matter found
return { content };
}

177
src/extractors/frontmatter.ts

@ -1,177 +0,0 @@ @@ -1,177 +0,0 @@
/**
* Extracts front matter from content
* Handles both YAML front matter (--- ... ---) and AsciiDoc document header attributes (:key: value)
* Returns the front matter object and the content
* For YAML: removes front matter from content
* For AsciiDoc: removes header from content and extracts as metadata (prevents header from appearing in rendered output)
*/
export function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } {
// First, try to match YAML front matter: ---\n...\n---
const yamlFrontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/;
const yamlMatch = content.match(yamlFrontmatterRegex);
if (yamlMatch) {
const yamlContent = yamlMatch[1];
const contentWithoutFrontmatter = yamlMatch[2];
// Simple YAML parser for basic key-value pairs and arrays
// This is a basic implementation - for complex YAML, consider using a library
const frontmatter: Record<string, any> = {};
const lines = yamlContent.split('\n');
let currentKey: string | null = null;
let inArray = false;
let arrayKey: string | null = null;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const trimmed = line.trim();
// Skip empty lines and comments
if (!trimmed || trimmed.startsWith('#')) {
if (inArray && trimmed === '') {
// Empty line might end the array
inArray = false;
arrayKey = null;
}
continue;
}
// Array item (line starting with -)
if (trimmed.startsWith('- ')) {
const item = trimmed.substring(2).trim();
const cleanItem = item.replace(/^["']|["']$/g, '');
if (arrayKey && frontmatter[arrayKey]) {
frontmatter[arrayKey].push(cleanItem);
} else if (currentKey) {
// Start new array
arrayKey = currentKey;
inArray = true;
frontmatter[currentKey] = [cleanItem];
}
continue;
}
// Key-value pair
const keyValueMatch = trimmed.match(/^(\w+):\s*(.+)$/);
if (keyValueMatch) {
const key = keyValueMatch[1];
let value = keyValueMatch[2].trim();
// Remove quotes if present
if ((value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))) {
value = value.slice(1, -1);
}
frontmatter[key] = value;
currentKey = key;
inArray = false;
arrayKey = null;
continue;
}
}
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutFrontmatter };
}
// If no YAML front matter, try to extract AsciiDoc document header attributes
// AsciiDoc format: = Title\nAuthor\nRevision\n:attribute: value\n...
// Match header lines until we hit a blank line (which separates header from body)
// The header consists of: title line, optional author/revision lines, and attribute lines
const lines = content.split('\n');
let headerEndIndex = 0;
// Find where the header ends (first blank line after title/attributes)
if (lines[0] && lines[0].match(/^=+\s+/)) {
// We have a title line, now find where header ends
let i = 1;
// Skip author and revision lines (non-empty lines that don't start with :)
while (i < lines.length && lines[i].trim() && !lines[i].trim().startsWith(':')) {
i++;
}
// Now skip attribute lines (lines starting with :)
while (i < lines.length && lines[i].trim().startsWith(':')) {
i++;
}
// Skip the blank line that separates header from body
if (i < lines.length && lines[i].trim() === '') {
i++;
}
headerEndIndex = i;
}
// If we found a header, extract it
if (headerEndIndex > 0) {
const headerLines = lines.slice(0, headerEndIndex);
const headerContent = headerLines.join('\n');
const contentWithoutHeader = lines.slice(headerEndIndex).join('\n');
const frontmatter: Record<string, any> = {};
const headerLinesArray = headerContent.split('\n');
// Extract title (first line starting with =)
const titleMatch = headerLinesArray[0].match(/^=+\s+(.+)$/);
if (titleMatch) {
frontmatter.title = titleMatch[1].trim();
}
// Extract author (line after title, if it doesn't start with :)
if (headerLinesArray.length > 1 && !headerLinesArray[1].trim().startsWith(':')) {
const authorLine = headerLinesArray[1].trim();
if (authorLine && !authorLine.match(/^[\d.,\s:]+$/)) {
// Not a revision line (which has numbers, commas, colons)
frontmatter.author = authorLine;
}
}
// Extract revision (line with version, date, remark format: "2.9, October 31, 2021: Fall incarnation")
for (let i = 1; i < headerLinesArray.length; i++) {
const line = headerLinesArray[i].trim();
if (line.match(/^[\d.,\s:]+$/)) {
// This looks like a revision line
const revisionMatch = line.match(/^([^,]+),\s*([^:]+)(?::\s*(.+))?$/);
if (revisionMatch) {
frontmatter.version = revisionMatch[1].trim();
frontmatter.date = revisionMatch[2].trim();
if (revisionMatch[3]) {
frontmatter.revision = revisionMatch[3].trim();
}
}
break;
}
}
// Extract AsciiDoc attributes (:key: value)
for (const line of headerLinesArray) {
const trimmed = line.trim();
if (trimmed.startsWith(':') && trimmed.includes(':')) {
const attrMatch = trimmed.match(/^:([^:]+):\s*(.+)$/);
if (attrMatch) {
const key = attrMatch[1].trim();
let value = attrMatch[2].trim();
// Remove quotes if present
if ((value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))) {
value = value.slice(1, -1);
}
// Handle comma-separated values (like keywords)
if (value.includes(',') && !value.includes(' ')) {
frontmatter[key] = value.split(',').map((v: string) => v.trim());
} else {
frontmatter[key] = value;
}
}
}
}
// For AsciiDoc, remove the header from content to prevent it from appearing in rendered output
// AsciiDoctor can work without the header, and we've already extracted the metadata
return { frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined, content: contentWithoutHeader };
}
// No front matter found
return { content };
}

243
src/extractors/metadata.js

@ -1,243 +0,0 @@ @@ -1,243 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractMetadata = extractMetadata;
/**
* Extracts metadata from content before processing
*/
function extractMetadata(content, linkBaseURL) {
return {
nostrLinks: extractNostrLinks(content),
wikilinks: extractWikilinks(content),
hashtags: extractHashtags(content),
links: extractLinks(content, linkBaseURL),
media: extractMedia(content),
};
}
/**
* Extract Nostr links from content
*/
function extractNostrLinks(content) {
const nostrLinks = [];
const seen = new Set();
// Extract nostr: prefixed links (valid bech32 format)
const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || [];
nostrMatches.forEach(match => {
const id = match.substring(6); // Remove 'nostr:'
const type = getNostrType(id);
if (type && !seen.has(id)) {
seen.add(id);
nostrLinks.push({
type,
id,
text: match,
bech32: id,
});
}
});
return nostrLinks;
}
/**
* Extract wikilinks from content
*/
function extractWikilinks(content) {
const wikilinks = [];
const seen = new Set();
// Match [[target]] or [[target|display]]
const wikilinkPattern = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g;
let match;
while ((match = wikilinkPattern.exec(content)) !== null) {
const target = match[1].trim();
const display = match[2] ? match[2].trim() : target;
const dtag = normalizeDtag(target);
const key = `${dtag}|${display}`;
if (!seen.has(key)) {
seen.add(key);
wikilinks.push({
dtag,
display,
original: match[0],
});
}
}
return wikilinks;
}
/**
* Extract hashtags from content
* Excludes hashtags in URLs, code blocks, and inline code
*/
function extractHashtags(content) {
const hashtags = [];
const seen = new Set();
// Remove code blocks first to avoid matching inside them
const codeBlockPattern = /```[\s\S]*?```/g;
const inlineCodePattern = /`[^`]+`/g;
const urlPattern = /https?:\/\/[^\s<>"']+/g;
let processedContent = content
.replace(codeBlockPattern, '') // Remove code blocks
.replace(inlineCodePattern, '') // Remove inline code
.replace(urlPattern, ''); // Remove URLs
// Extract hashtags: #hashtag (word boundary to avoid matching in URLs)
const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g;
let match;
while ((match = hashtagPattern.exec(processedContent)) !== null) {
const tag = match[1].toLowerCase();
if (!seen.has(tag)) {
hashtags.push(tag);
seen.add(tag);
}
}
return hashtags;
}
/**
* Extract regular links from content
*/
function extractLinks(content, linkBaseURL) {
const links = [];
const seen = new Set();
// Extract markdown links: [text](url) - optimized to avoid double matching
const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
let markdownMatch;
while ((markdownMatch = markdownLinkPattern.exec(content)) !== null) {
const [, text, url] = markdownMatch;
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
url,
text,
isExternal: isExternalUrl(url, linkBaseURL),
});
}
}
// Extract asciidoc links: link:url[text] - optimized to avoid double matching
const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g;
let asciidocMatch;
while ((asciidocMatch = asciidocLinkPattern.exec(content)) !== null) {
const [, url, text] = asciidocMatch;
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
url,
text,
isExternal: isExternalUrl(url, linkBaseURL),
});
}
}
// Extract raw URLs (basic pattern)
const urlPattern = /https?:\/\/[^\s<>"']+/g;
const rawUrls = content.match(urlPattern) || [];
rawUrls.forEach(url => {
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
url,
text: url,
isExternal: isExternalUrl(url, linkBaseURL),
});
}
});
return links;
}
/**
* Extract media URLs from content
*/
function extractMedia(content) {
const media = [];
const seen = new Set();
// Extract markdown images: ![alt](url) - optimized to avoid double matching
const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g;
let markdownImageMatch;
while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) {
const url = markdownImageMatch[1];
if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url);
seen.add(url);
}
}
}
// Extract asciidoc images: image::url[alt] - optimized to avoid double matching
const asciidocImagePattern = /image::([^\[]+)\[/g;
let asciidocImageMatch;
while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) {
const url = asciidocImageMatch[1];
if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url);
seen.add(url);
}
}
}
// Extract raw image/video URLs
const urlPattern = /https?:\/\/[^\s<>"']+/g;
const rawUrls = content.match(urlPattern) || [];
rawUrls.forEach(url => {
if (!seen.has(url) && (isImageUrl(url) || isVideoUrl(url))) {
media.push(url);
seen.add(url);
}
});
return media;
}
/**
* Get Nostr identifier type
*/
function getNostrType(id) {
if (id.startsWith('npub'))
return 'npub';
if (id.startsWith('nprofile'))
return 'nprofile';
if (id.startsWith('nevent'))
return 'nevent';
if (id.startsWith('naddr'))
return 'naddr';
if (id.startsWith('note'))
return 'note';
return null;
}
/**
* Normalize text to d-tag format
*/
function normalizeDtag(text) {
return text
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-+|-+$/g, '');
}
/**
* Check if URL is external
*/
function isExternalUrl(url, linkBaseURL) {
if (!linkBaseURL)
return true;
try {
// Use a simple string-based check for Node.js compatibility
// Extract hostname from URL string
const urlMatch = url.match(/^https?:\/\/([^\/]+)/);
const baseMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
if (urlMatch && baseMatch) {
return urlMatch[1] !== baseMatch[1];
}
return true;
}
catch {
return true;
}
}
/**
* Check if URL is a Nostr URL
*/
function isNostrUrl(url) {
return url.startsWith('nostr:') || getNostrType(url) !== null;
}
/**
* Check if URL is an image
*/
function isImageUrl(url) {
return /\.(jpeg|jpg|png|gif|webp|svg)$/i.test(url);
}
/**
* Check if URL is a video
*/
function isVideoUrl(url) {
return /\.(mp4|webm|ogg)$/i.test(url);
}

396
src/extractors/metadata.ts

@ -1,396 +0,0 @@ @@ -1,396 +0,0 @@
import { NostrLink, Wikilink } from '../types';
export interface ExtractedMetadata {
nostrLinks: NostrLink[];
wikilinks: Wikilink[];
hashtags: string[];
links: Array<{ url: string; text: string; isExternal: boolean }>;
media: string[];
}
/**
* Extracts metadata from content before processing
*/
export function extractMetadata(content: string, linkBaseURL: string): ExtractedMetadata {
return {
nostrLinks: extractNostrLinks(content),
wikilinks: extractWikilinks(content),
hashtags: extractHashtags(content),
links: extractLinks(content, linkBaseURL),
media: extractMedia(content),
};
}
/**
* Extract Nostr links from content
*/
function extractNostrLinks(content: string): NostrLink[] {
const nostrLinks: NostrLink[] = [];
const seen = new Set<string>();
// Extract nostr: prefixed links (valid bech32 format)
const nostrMatches = content.match(/nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi) || [];
nostrMatches.forEach(match => {
const id = match.substring(6); // Remove 'nostr:'
const type = getNostrType(id);
if (type && !seen.has(id)) {
seen.add(id);
nostrLinks.push({
type,
id,
text: match,
bech32: id,
});
}
});
return nostrLinks;
}
/**
* Extract wikilinks from content
*/
function extractWikilinks(content: string): Wikilink[] {
const wikilinks: Wikilink[] = [];
const seen = new Set<string>();
// Match [[target]] or [[target|display]]
const wikilinkPattern = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g;
let match;
while ((match = wikilinkPattern.exec(content)) !== null) {
const target = match[1].trim();
const display = match[2] ? match[2].trim() : target;
const dtag = normalizeDtag(target);
const key = `${dtag}|${display}`;
if (!seen.has(key)) {
seen.add(key);
wikilinks.push({
dtag,
display,
original: match[0],
});
}
}
return wikilinks;
}
/**
* Extract hashtags from content
* Excludes hashtags in URLs, code blocks, and inline code
*/
function extractHashtags(content: string): string[] {
const hashtags: string[] = [];
const seen = new Set<string>();
// Remove code blocks first to avoid matching inside them
const codeBlockPattern = /```[\s\S]*?```/g;
const inlineCodePattern = /`[^`]+`/g;
const urlPattern = /https?:\/\/[^\s<>"']+/g;
let processedContent = content
.replace(codeBlockPattern, '') // Remove code blocks
.replace(inlineCodePattern, '') // Remove inline code
.replace(urlPattern, ''); // Remove URLs
// Extract hashtags: #hashtag (word boundary to avoid matching in URLs)
const hashtagPattern = /\B#([a-zA-Z0-9_]+)/g;
let match;
while ((match = hashtagPattern.exec(processedContent)) !== null) {
const tag = match[1].toLowerCase();
if (!seen.has(tag)) {
hashtags.push(tag);
seen.add(tag);
}
}
return hashtags;
}
/**
* Extract regular links from content
*/
function extractLinks(content: string, linkBaseURL: string): Array<{ url: string; text: string; isExternal: boolean }> {
const links: Array<{ url: string; text: string; isExternal: boolean }> = [];
const seen = new Set<string>();
// Remove code blocks and inline code to avoid matching URLs inside them
const codeBlockPattern = /```[\s\S]*?```/g;
const inlineCodePattern = /`[^`]+`/g;
let processedContent = content
.replace(codeBlockPattern, '') // Remove code blocks
.replace(inlineCodePattern, ''); // Remove inline code
// Extract markdown links: [text](url) - but NOT images ![alt](url)
// First, extract nested image links: [![alt](image-url)](link-url)
// These should extract the outer link with the alt text
// We also need to mark the inner image URL as seen so it doesn't get extracted as a raw URL
const nestedImageLinkPattern = /\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)/g;
let nestedMatch;
const nestedImageUrls = new Set<string>(); // Track inner image URLs to exclude them
while ((nestedMatch = nestedImageLinkPattern.exec(processedContent)) !== null) {
const [, altText, imageUrl, linkUrl] = nestedMatch;
const cleanLinkUrl = linkUrl.trim().replace(/[)\].,;:!?`]+$/, '');
const cleanImageUrl = imageUrl.trim().replace(/[)\].,;:!?`]+$/, '');
// Mark the inner image URL as seen so it doesn't get extracted as a raw URL
nestedImageUrls.add(cleanImageUrl);
// Also mark it in the seen set to prevent it from being extracted as a regular link
seen.add(cleanImageUrl);
if (cleanLinkUrl && cleanLinkUrl.match(/^https?:\/\//i) && !isNostrUrl(cleanLinkUrl) && !seen.has(cleanLinkUrl)) {
seen.add(cleanLinkUrl);
links.push({
url: cleanLinkUrl,
text: altText.trim() || 'Image link', // Use the alt text from the image (e.g., "Youtube link with pic")
isExternal: isExternalUrl(cleanLinkUrl, linkBaseURL),
});
}
}
// Now extract regular markdown links: [text](url) - but NOT images ![alt](url)
// Use a pattern that explicitly excludes images by checking before the match
const markdownLinkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
let markdownMatch;
while ((markdownMatch = markdownLinkPattern.exec(processedContent)) !== null) {
// Check if this is an image (preceded by !)
// We need to check the character immediately before the opening bracket
const matchIndex = markdownMatch.index;
if (matchIndex > 0) {
const charBefore = processedContent[matchIndex - 1];
if (charBefore === '!') {
continue; // Skip images - this is ![alt](url), not [text](url)
}
}
let [, text, url] = markdownMatch;
// Skip if this is a nested image link (we already extracted those above)
if (text.trim().startsWith('![') && text.includes('](')) {
continue; // Already handled by nestedImageLinkPattern
}
// Handle AsciiDoc image syntax in markdown links: [image::url[alt,width=100%]](link-url)
// This happens when AsciiDoc content is converted to markdown-style links
if (text.trim().startsWith('image::') || text.trim().startsWith('image:')) {
// Match image::url[alt,attributes] or image:url[alt,attributes]
const imageMatch = text.match(/^image:?:[^\[]+\[([^\],]+)/);
if (imageMatch) {
text = imageMatch[1].trim(); // Use just the alt text (e.g., "Youtube link with pic")
} else {
// If we can't extract alt text, use a default
text = 'Image link';
}
}
// Clean up URL - remove trailing punctuation that might have been captured
// But preserve parentheses that are part of the URL (like in query strings)
// Only remove trailing punctuation that's clearly not part of the URL
url = url.trim();
// Remove trailing punctuation that's likely not part of the URL
// But be careful - URLs can end with ) if they're in markdown like [text](url))
// We'll be conservative and only remove if it's clearly punctuation
url = url.replace(/[)\].,;:!?`]+$/, '');
// Clean up text - remove stray punctuation and whitespace
text = text.trim();
// Skip if URL is empty or invalid
if (!url || !url.match(/^https?:\/\//i)) {
continue;
}
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
url,
text,
isExternal: isExternalUrl(url, linkBaseURL),
});
}
}
// Extract asciidoc links: link:url[text] - optimized to avoid double matching
// Handle nested image links: link:url[image::image-url[alt,width=100%]]
const asciidocLinkPattern = /link:([^\[]+)\[([^\]]+)\]/g;
let asciidocMatch;
while ((asciidocMatch = asciidocLinkPattern.exec(processedContent)) !== null) {
let [, url, text] = asciidocMatch;
// Clean up URL
url = url.trim();
// Handle nested image syntax in AsciiDoc: image::url[alt,width=100%]
// Extract just the alt text from the image syntax
if (text.trim().startsWith('image::') || text.trim().startsWith('image:')) {
// Match image::url[alt,attributes] or image:url[alt,attributes]
const imageMatch = text.match(/^image:?:[^\[]+\[([^\],]+)/);
if (imageMatch) {
text = imageMatch[1].trim(); // Use just the alt text
} else {
// If we can't extract alt text, skip this link (it's an image, not a text link)
continue;
}
}
// Clean up text
text = text.trim();
// Skip if URL is empty or invalid
if (!url || !url.match(/^https?:\/\//i)) {
continue;
}
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
url,
text,
isExternal: isExternalUrl(url, linkBaseURL),
});
}
}
// Extract raw URLs (basic pattern) - but exclude those already in markdown/asciidoc links
// More restrictive pattern to avoid capturing trailing punctuation
const urlPattern = /https?:\/\/[^\s<>"'`()\[\]]+/g;
const rawUrls = processedContent.match(urlPattern) || [];
rawUrls.forEach(url => {
// Remove trailing punctuation that might have been captured
url = url.replace(/[)\].,;:!?`]+$/, '');
// Skip if URL is too short or invalid
if (!url || url.length < 10 || !url.match(/^https?:\/\/[^\s]+$/i)) {
return;
}
// Skip if this is an inner image URL from a nested image link
if (nestedImageUrls.has(url)) {
return;
}
if (!seen.has(url) && !isNostrUrl(url)) {
seen.add(url);
links.push({
url,
text: url,
isExternal: isExternalUrl(url, linkBaseURL),
});
}
});
return links;
}
/**
* Extract media URLs from content
*/
function extractMedia(content: string): string[] {
const media: string[] = [];
const seen = new Set<string>();
// Extract markdown images: ![alt](url) - optimized to avoid double matching
const markdownImagePattern = /!\[[^\]]*\]\(([^)]+)\)/g;
let markdownImageMatch;
while ((markdownImageMatch = markdownImagePattern.exec(content)) !== null) {
const url = markdownImageMatch[1];
if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url);
seen.add(url);
}
}
}
// Extract asciidoc images: image::url[alt] - optimized to avoid double matching
const asciidocImagePattern = /image::([^\[]+)\[/g;
let asciidocImageMatch;
while ((asciidocImageMatch = asciidocImagePattern.exec(content)) !== null) {
const url = asciidocImageMatch[1];
if (url && !seen.has(url)) {
if (isImageUrl(url) || isVideoUrl(url)) {
media.push(url);
seen.add(url);
}
}
}
// Extract raw image/video URLs
const urlPattern = /https?:\/\/[^\s<>"']+/g;
const rawUrls = content.match(urlPattern) || [];
rawUrls.forEach(url => {
if (!seen.has(url) && (isImageUrl(url) || isVideoUrl(url))) {
media.push(url);
seen.add(url);
}
});
return media;
}
/**
* Get Nostr identifier type
*/
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
if (id.startsWith('npub')) return 'npub';
if (id.startsWith('nprofile')) return 'nprofile';
if (id.startsWith('nevent')) return 'nevent';
if (id.startsWith('naddr')) return 'naddr';
if (id.startsWith('note')) return 'note';
return null;
}
/**
* Normalize text to d-tag format
*/
function normalizeDtag(text: string): string {
return text
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-+|-+$/g, '');
}
/**
* Check if URL is external
*/
function isExternalUrl(url: string, linkBaseURL: string): boolean {
if (!linkBaseURL) return true;
try {
// Use a simple string-based check for Node.js compatibility
// Extract hostname from URL string
const urlMatch = url.match(/^https?:\/\/([^\/]+)/);
const baseMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
if (urlMatch && baseMatch) {
return urlMatch[1] !== baseMatch[1];
}
return true;
} catch {
return true;
}
}
/**
* Check if URL is a Nostr URL
*/
function isNostrUrl(url: string): boolean {
return url.startsWith('nostr:') || getNostrType(url) !== null;
}
/**
* Check if URL is an image
*/
function isImageUrl(url: string): boolean {
return /\.(jpeg|jpg|png|gif|webp|svg)$/i.test(url);
}
/**
* Check if URL is a video
*/
function isVideoUrl(url: string): boolean {
return /\.(mp4|webm|ogg)$/i.test(url);
}

92
src/parser.js

@ -1,92 +0,0 @@ @@ -1,92 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Parser = void 0;
exports.defaultOptions = defaultOptions;
exports.process = process;
const detector_1 = require("./detector");
const to_asciidoc_1 = require("./converters/to-asciidoc");
const asciidoc_1 = require("./processors/asciidoc");
const metadata_1 = require("./extractors/metadata");
const frontmatter_1 = require("./extractors/frontmatter");
/**
* Default parser options
*/
function defaultOptions() {
return {
linkBaseURL: '',
enableAsciiDoc: true,
enableMarkdown: true,
enableCodeHighlighting: true,
enableLaTeX: true,
enableMusicalNotation: true,
enableNostrAddresses: true,
};
}
/**
* Main parser for Nostr event content
* Handles multiple content formats: AsciiDoc, Markdown, code syntax,
* LaTeX, musical notation, and nostr: prefixed addresses
*
* Everything is converted to AsciiDoc first, then processed through AsciiDoctor
*/
class Parser {
constructor(options = {}) {
const defaults = defaultOptions();
this.options = {
linkBaseURL: options.linkBaseURL ?? defaults.linkBaseURL ?? '',
enableAsciiDoc: options.enableAsciiDoc ?? defaults.enableAsciiDoc ?? true,
enableMarkdown: options.enableMarkdown ?? defaults.enableMarkdown ?? true,
enableCodeHighlighting: options.enableCodeHighlighting ?? defaults.enableCodeHighlighting ?? true,
enableLaTeX: options.enableLaTeX ?? defaults.enableLaTeX ?? true,
enableMusicalNotation: options.enableMusicalNotation ?? defaults.enableMusicalNotation ?? true,
enableNostrAddresses: options.enableNostrAddresses ?? defaults.enableNostrAddresses ?? true,
wikilinkUrl: options.wikilinkUrl ?? defaults.wikilinkUrl,
hashtagUrl: options.hashtagUrl ?? defaults.hashtagUrl,
};
}
/**
* Process Nostr event content and return HTML
* Automatically detects the content format and processes accordingly
* Everything is converted to AsciiDoc first, then processed through AsciiDoctor
*/
async process(content) {
// Extract frontmatter first (before any other processing)
const { frontmatter, content: contentWithoutFrontmatter } = (0, frontmatter_1.extractFrontmatter)(content);
// Extract metadata from content (after removing frontmatter)
const metadata = (0, metadata_1.extractMetadata)(contentWithoutFrontmatter, this.options.linkBaseURL);
// Detect content format (on content without frontmatter)
const format = (0, detector_1.detectFormat)(contentWithoutFrontmatter);
// Convert everything to AsciiDoc format first
const asciidocContent = (0, to_asciidoc_1.convertToAsciidoc)(contentWithoutFrontmatter, format, this.options.linkBaseURL, {
enableNostrAddresses: this.options.enableNostrAddresses,
});
// Process through AsciiDoctor
const result = await (0, asciidoc_1.processAsciidoc)(asciidocContent, {
enableCodeHighlighting: this.options.enableCodeHighlighting,
enableLaTeX: this.options.enableLaTeX,
enableMusicalNotation: this.options.enableMusicalNotation,
originalContent: contentWithoutFrontmatter, // Pass original for LaTeX detection
linkBaseURL: this.options.linkBaseURL, // Pass linkBaseURL for link processing
wikilinkUrl: this.options.wikilinkUrl, // Pass wikilink URL format
hashtagUrl: this.options.hashtagUrl, // Pass hashtag URL format
});
// Combine with extracted metadata and frontmatter
return {
...result,
frontmatter,
nostrLinks: metadata.nostrLinks,
wikilinks: metadata.wikilinks,
hashtags: metadata.hashtags,
links: metadata.links,
media: metadata.media,
};
}
}
exports.Parser = Parser;
/**
* Convenience function to process content with default options
*/
async function process(content, options) {
const parser = new Parser(options);
return parser.process(content);
}

233
src/parser.ts

@ -1,98 +1,127 @@ @@ -1,98 +1,127 @@
import { ParserOptions, ProcessResult, ContentFormat } from './types';
import { ParserOptions, ProcessResult, ContentFormat, Wikilink } from './types';
import { detectFormat } from './detector';
import { convertToAsciidoc } from './converters/to-asciidoc';
import { processAsciidoc } from './processors/asciidoc';
import { extractMetadata } from './extractors/metadata';
import { extractFrontmatter } from './extractors/frontmatter';
import { processAsciiDoc } from './processors/asciidoc';
import { processMarkdown } from './processors/markdown';
import { postProcess } from './post-processor';
import { preProcessAsciiDoc, restorePlaceholders } from './pre-processor';
/**
* Default parser options
*/
export function defaultOptions(): ParserOptions {
return {
linkBaseURL: '',
linkBaseURL: undefined,
enableAsciiDoc: true,
enableMarkdown: true,
enableCodeHighlighting: true,
enableLaTeX: true,
enableMusicalNotation: true,
enableNostrAddresses: true,
wikilinkUrl: undefined,
hashtagUrl: undefined
};
}
/**
* Main parser for Nostr event content
* Handles multiple content formats: AsciiDoc, Markdown, code syntax,
* LaTeX, musical notation, and nostr: prefixed addresses
*
* Everything is converted to AsciiDoc first, then processed through AsciiDoctor
* Handles multiple content formats: AsciiDoc, Markdown
* Post-processes wikilinks, hashtags, and nostr: addresses
*/
export class Parser {
private options: Required<Omit<ParserOptions, 'wikilinkUrl' | 'hashtagUrl'>> & Pick<ParserOptions, 'wikilinkUrl' | 'hashtagUrl'>;
constructor(options: ParserOptions = {}) {
const defaults = defaultOptions();
this.options = {
linkBaseURL: options.linkBaseURL ?? defaults.linkBaseURL ?? '',
enableAsciiDoc: options.enableAsciiDoc ?? defaults.enableAsciiDoc ?? true,
enableMarkdown: options.enableMarkdown ?? defaults.enableMarkdown ?? true,
enableCodeHighlighting: options.enableCodeHighlighting ?? defaults.enableCodeHighlighting ?? true,
enableLaTeX: options.enableLaTeX ?? defaults.enableLaTeX ?? true,
enableMusicalNotation: options.enableMusicalNotation ?? defaults.enableMusicalNotation ?? true,
enableNostrAddresses: options.enableNostrAddresses ?? defaults.enableNostrAddresses ?? true,
wikilinkUrl: options.wikilinkUrl ?? defaults.wikilinkUrl,
hashtagUrl: options.hashtagUrl ?? defaults.hashtagUrl,
};
private options: ParserOptions;
constructor(options?: ParserOptions) {
this.options = { ...defaultOptions(), ...options };
}
/**
* Process Nostr event content and return HTML
* Automatically detects the content format and processes accordingly
* Everything is converted to AsciiDoc first, then processed through AsciiDoctor
*/
async process(content: string): Promise<ProcessResult> {
// Extract frontmatter first (before any other processing)
const { frontmatter, content: contentWithoutFrontmatter } = extractFrontmatter(content);
// Extract metadata from content (after removing frontmatter)
const metadata = extractMetadata(contentWithoutFrontmatter, this.options.linkBaseURL);
// Detect content format (on content without frontmatter)
const format = detectFormat(contentWithoutFrontmatter);
// Convert everything to AsciiDoc format first
const asciidocContent = convertToAsciidoc(
contentWithoutFrontmatter,
format,
this.options.linkBaseURL,
{
enableNostrAddresses: this.options.enableNostrAddresses,
}
);
// Process through AsciiDoctor
const result = await processAsciidoc(
asciidocContent,
{
enableCodeHighlighting: this.options.enableCodeHighlighting,
enableLaTeX: this.options.enableLaTeX,
enableMusicalNotation: this.options.enableMusicalNotation,
originalContent: contentWithoutFrontmatter, // Pass original for LaTeX detection
linkBaseURL: this.options.linkBaseURL, // Pass linkBaseURL for link processing
wikilinkUrl: this.options.wikilinkUrl, // Pass wikilink URL format
hashtagUrl: this.options.hashtagUrl, // Pass hashtag URL format
}
);
// Combine with extracted metadata and frontmatter
if (!content || content.trim().length === 0) {
return this.emptyResult();
}
// Detect format
const format = detectFormat(content);
// Process based on format
let html: string;
let tableOfContents = '';
let hasLaTeX = false;
let hasMusicalNotation = false;
let frontmatter: Record<string, any> | undefined;
let preProcessWikilinks: Wikilink[] = [];
let preProcessHashtags: string[] = [];
if (format === ContentFormat.AsciiDoc && this.options.enableAsciiDoc !== false) {
// Pre-process to handle wikilinks and hashtags before AsciiDoc conversion
const preProcessResult = preProcessAsciiDoc(content, this.options);
preProcessWikilinks = preProcessResult.wikilinks;
preProcessHashtags = preProcessResult.hashtags;
const result = processAsciiDoc(preProcessResult.content, this.options);
// Restore wikilinks and hashtags from placeholders
html = restorePlaceholders(result.html, preProcessResult.wikilinks, preProcessResult.hashtags, this.options);
tableOfContents = result.tableOfContents;
hasLaTeX = result.hasLaTeX;
hasMusicalNotation = result.hasMusicalNotation;
} else if (format === ContentFormat.Markdown && this.options.enableMarkdown !== false) {
const result = processMarkdown(content, this.options);
html = result.html;
frontmatter = result.frontmatter;
hasLaTeX = result.hasLaTeX;
hasMusicalNotation = result.hasMusicalNotation;
} else {
// Plain text or unknown format - just escape and wrap
html = `<p>${escapeHtml(content)}</p>`;
}
// Post-process for nostr: addresses and handle any remaining processing
// Note: wikilinks and hashtags are already processed for AsciiDoc
const postProcessResult = postProcess(html, this.options, format === ContentFormat.AsciiDoc);
// Extract additional metadata
const links = extractLinks(postProcessResult.html);
const media = extractMedia(postProcessResult.html);
// Merge pre-processed and post-processed wikilinks/hashtags
const allWikilinks = preProcessWikilinks.length > 0
? preProcessWikilinks
: postProcessResult.wikilinks;
const allHashtags = preProcessHashtags.length > 0
? preProcessHashtags
: postProcessResult.hashtags;
return {
...result,
content: postProcessResult.html,
tableOfContents,
hasLaTeX,
hasMusicalNotation,
frontmatter,
nostrLinks: metadata.nostrLinks,
wikilinks: metadata.wikilinks,
hashtags: metadata.hashtags,
links: metadata.links,
media: metadata.media,
nostrLinks: postProcessResult.nostrLinks,
wikilinks: allWikilinks,
hashtags: allHashtags,
links,
media
};
}
private emptyResult(): ProcessResult {
return {
content: '',
tableOfContents: '',
hasLaTeX: false,
hasMusicalNotation: false,
nostrLinks: [],
wikilinks: [],
hashtags: [],
links: [],
media: []
};
}
}
@ -104,3 +133,75 @@ export async function process(content: string, options?: ParserOptions): Promise @@ -104,3 +133,75 @@ export async function process(content: string, options?: ParserOptions): Promise
const parser = new Parser(options);
return parser.process(content);
}
/**
* Extract regular links from HTML
*/
function extractLinks(html: string): Array<{ url: string; text: string; isExternal: boolean }> {
const links: Array<{ url: string; text: string; isExternal: boolean }> = [];
const linkRegex = /<a[^>]+href=["']([^"']+)["'][^>]*>([^<]*)<\/a>/gi;
let match;
while ((match = linkRegex.exec(html)) !== null) {
const url = match[1];
const text = match[2] || url;
const isExternal = url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//');
// Skip nostr links, wikilinks, and hashtags (already extracted)
if (url.includes('nostr-') || url.includes('events?d=') || url.includes('data-topic')) {
continue;
}
links.push({ url, text, isExternal });
}
return links;
}
/**
* Extract media URLs from HTML
*/
function extractMedia(html: string): string[] {
const media: string[] = [];
// Extract image sources
const imgRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
let match;
while ((match = imgRegex.exec(html)) !== null) {
media.push(match[1]);
}
// Extract video sources
const videoRegex = /<video[^>]+src=["']([^"']+)["'][^>]*>/gi;
while ((match = videoRegex.exec(html)) !== null) {
media.push(match[1]);
}
// Extract audio sources
const audioRegex = /<audio[^>]+src=["']([^"']+)["'][^>]*>/gi;
while ((match = audioRegex.exec(html)) !== null) {
media.push(match[1]);
}
// Extract source tags
const sourceRegex = /<source[^>]+src=["']([^"']+)["'][^>]*>/gi;
while ((match = sourceRegex.exec(html)) !== null) {
media.push(match[1]);
}
return media;
}
/**
* Escape HTML special characters
*/
function escapeHtml(text: string): string {
const map: Record<string, string> = {
'&': '&amp;',
'<': '&lt;',
'>': '&gt;',
'"': '&quot;',
"'": '&#039;'
};
return text.replace(/[&<>"']/g, (m) => map[m]);
}

481
src/post-processor.ts

@ -0,0 +1,481 @@ @@ -0,0 +1,481 @@
import { ParserOptions, NostrLink, Wikilink } from './types';
/**
* Extract and process wikilinks, hashtags, and nostr: addresses from HTML
*/
export interface PostProcessResult {
html: string;
nostrLinks: NostrLink[];
wikilinks: Wikilink[];
hashtags: string[];
}
/**
* Post-process HTML to convert wikilinks, hashtags, and nostr: addresses
* @param skipWikilinksAndHashtags - If true, skip processing wikilinks and hashtags (already processed)
*/
export function postProcess(html: string, options: ParserOptions, skipWikilinksAndHashtags: boolean = false): PostProcessResult {
let processed = html;
const nostrLinks: NostrLink[] = [];
const wikilinks: Wikilink[] = [];
const hashtags: string[] = [];
// First, mark code blocks to avoid processing inside them
const codeBlockMarkers: Array<{ start: number; end: number }> = [];
const codeBlockRegex = /<(pre|code)[^>]*>[\s\S]*?<\/\1>/gi;
let match;
while ((match = codeBlockRegex.exec(html)) !== null) {
codeBlockMarkers.push({ start: match.index, end: match.index + match[0].length });
}
function isInCodeBlock(index: number): boolean {
return codeBlockMarkers.some(marker => index >= marker.start && index < marker.end);
}
// Process nostr: addresses (but not in code blocks)
if (options.enableNostrAddresses !== false) {
const nostrRegex = /nostr:([np][a-z0-9]+1[a-z0-9]+)/gi;
const replacements: Array<{ match: string; replacement: string; index: number }> = [];
while ((match = nostrRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
const bech32 = match[1];
const type = getNostrType(bech32);
if (!type) continue;
const link: NostrLink = {
type,
id: bech32,
text: match[0],
bech32: bech32
};
nostrLinks.push(link);
const url = options.linkBaseURL
? `${options.linkBaseURL}/nostr/${bech32}`
: `#nostr-${bech32}`;
replacements.push({
match: match[0],
replacement: `<a href="${escapeHtml(url)}" class="nostr-link" data-nostr-type="${type}" data-nostr-id="${escapeHtml(bech32)}">${escapeHtml(match[0])}</a>`,
index: match.index
});
}
// Apply replacements in reverse order to preserve indices
replacements.reverse().forEach(({ match, replacement, index }) => {
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
});
}
// Process wikilinks: [[dtag]] or [[dtag|display]] (but not in code blocks)
// Skip if already processed (for AsciiDoc)
if (!skipWikilinksAndHashtags) {
const wikilinkRegex = /\[\[([^\]]+)\]\]/g;
const wikilinkReplacements: Array<{ match: string; replacement: string; index: number }> = [];
while ((match = wikilinkRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
// Skip if already inside a link tag
const beforeMatch = processed.substring(0, match.index);
const lastOpenTag = beforeMatch.lastIndexOf('<a');
const lastCloseTag = beforeMatch.lastIndexOf('</a>');
if (lastOpenTag > lastCloseTag) continue; // Inside a link
const content = match[1];
const parts = content.split('|');
const dtag = parts[0].trim();
const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag;
const wikilink: Wikilink = {
dtag,
display,
original: match[0]
};
wikilinks.push(wikilink);
let url: string;
if (typeof options.wikilinkUrl === 'function') {
url = options.wikilinkUrl(dtag);
} else if (typeof options.wikilinkUrl === 'string') {
url = options.wikilinkUrl.replace('{dtag}', encodeURIComponent(dtag));
} else {
url = options.linkBaseURL
? `${options.linkBaseURL}/events?d=${encodeURIComponent(dtag)}`
: `#${encodeURIComponent(dtag)}`;
}
wikilinkReplacements.push({
match: match[0],
replacement: `<a href="${escapeHtml(url)}" class="wikilink" data-dtag="${escapeHtml(dtag)}">${escapeHtml(display)}</a>`,
index: match.index
});
}
// Apply wikilink replacements in reverse order
wikilinkReplacements.reverse().forEach(({ match, replacement, index }) => {
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
});
// Process hashtags: #hashtag (but not in code blocks or inside HTML tags)
const hashtagRegex = /(^|\s|>)(#[\w-]+)/g;
const hashtagReplacements: Array<{ match: string; replacement: string; index: number }> = [];
while ((match = hashtagRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
// Check if we're inside an HTML tag
const beforeMatch = processed.substring(0, match.index);
const lastOpenTag = beforeMatch.lastIndexOf('<');
const lastCloseTag = beforeMatch.lastIndexOf('>');
if (lastOpenTag > lastCloseTag) continue; // Inside a tag
// Skip if already inside a link or span
const lastLinkOpen = beforeMatch.lastIndexOf('<a');
const lastLinkClose = beforeMatch.lastIndexOf('</a>');
const lastSpanOpen = beforeMatch.lastIndexOf('<span');
const lastSpanClose = beforeMatch.lastIndexOf('</span>');
if (lastLinkOpen > lastLinkClose || lastSpanOpen > lastSpanClose) continue;
const hashtag = match[2];
const prefix = match[1];
const topic = hashtag.substring(1);
if (!hashtags.includes(topic)) {
hashtags.push(topic);
}
let url: string | undefined;
if (typeof options.hashtagUrl === 'function') {
url = options.hashtagUrl(topic);
} else if (typeof options.hashtagUrl === 'string') {
url = options.hashtagUrl.replace('{topic}', encodeURIComponent(topic));
}
const replacement = url
? `${prefix}<a href="${escapeHtml(url)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</a>`
: `${prefix}<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</span>`;
hashtagReplacements.push({
match: match[0],
replacement,
index: match.index
});
}
// Apply hashtag replacements in reverse order
hashtagReplacements.reverse().forEach(({ match, replacement, index }) => {
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
});
}
// Extract wikilinks and hashtags from already-processed HTML (for AsciiDoc)
if (skipWikilinksAndHashtags) {
// Extract wikilinks from existing links
const wikilinkLinkRegex = /<a[^>]+class="wikilink"[^>]+data-dtag="([^"]+)"[^>]*>([^<]+)<\/a>/g;
while ((match = wikilinkLinkRegex.exec(processed)) !== null) {
wikilinks.push({
dtag: match[1],
display: match[2],
original: match[0]
});
}
// Extract hashtags from existing spans/links
const hashtagRegex = /<(?:a|span)[^>]+class="hashtag"[^>]+data-topic="([^"]+)"[^>]*>#\1<\/\w+>/g;
while ((match = hashtagRegex.exec(processed)) !== null) {
const topic = match[1];
if (!hashtags.includes(topic)) {
hashtags.push(topic);
}
}
}
// Remove links inside code blocks (both <code> and <pre> tags)
// This ensures URLs in code blocks remain as plain text
const codeBlockLinkRegex = /(<(?:code|pre)[^>]*>)([\s\S]*?)(<\/(?:code|pre)>)/gi;
processed = processed.replace(codeBlockLinkRegex, (match, openTag, content, closeTag) => {
// Remove all <a> tags inside code blocks, keeping only the text content
const cleanedContent = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
return openTag + cleanedContent + closeTag;
});
// Process YouTube URLs - ORDER IS CRITICAL to avoid double-parsing
// 1. FIRST: Fix video tags that contain YouTube URLs (before they get processed as bare URLs)
// AsciiDoc's video:: macro creates <video> tags, but YouTube URLs should be iframes
const youtubeVideoTagRegex = /<video[^>]+src="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>[\s\S]*?<\/video>/gi;
processed = processed.replace(youtubeVideoTagRegex, (match, url, videoId) => {
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
});
// 2. SECOND: Process YouTube links in <a> tags
// IMPORTANT: Be very specific with YouTube regex to avoid matching Spotify URLs
const youtubeLinkRegex = /<a[^>]+href="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>.*?<\/a>/gi;
processed = processed.replace(youtubeLinkRegex, (match, url, videoId) => {
if (isInCodeBlock(processed.indexOf(match))) return match;
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
});
// 3. THIRD: Fix malformed YouTube iframes from AsciiDoc video:: macro
// AsciiDoc sometimes creates iframes with malformed YouTube URLs (watch?v= or shorts/ instead of embed/)
// Match the entire iframe element including closing tag to avoid duplicates
const malformedYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube[^"]*(?:watch\?v=|shorts\/)([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
processed = processed.replace(malformedYoutubeIframeRegex, (match, videoId) => {
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
});
// 3.5: Fix YouTube iframes with embed URLs but wrong parameters or missing required attributes
// AsciiDoc's video:: macro creates iframes with ?rel=0 or missing allow/referrerpolicy attributes
// Match iframes with embed URLs that don't have enablejsapi=1 or are missing required attributes
const incompleteYoutubeIframeRegex = /<iframe[^>]+src="https?:\/\/(?:www\.)?youtube\.com\/embed\/([a-zA-Z0-9_-]+)(\?[^"]*)?"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
processed = processed.replace(incompleteYoutubeIframeRegex, (match, videoId, params) => {
// Check if this iframe already has the correct format (has enablejsapi=1 and required attributes)
if (match.includes('enablejsapi=1') &&
match.includes('allow=') &&
match.includes('referrerpolicy=') &&
match.includes('class="youtube-embed"')) {
return match; // Already correct, don't modify
}
// Fix the iframe with proper attributes
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
});
// 4. FOURTH: Fix any existing YouTube iframes that have malformed embed URLs (AsciiDoc sometimes creates broken embed URLs)
// Match the entire iframe element including closing tag to avoid duplicates
const brokenYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube\.com\/embed\/[^"]*watch\?v=([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
processed = processed.replace(brokenYoutubeIframeRegex, (match, videoId) => {
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
});
// 5. LAST: Handle bare YouTube URLs (not in links, video tags, or iframes)
// IMPORTANT: Match must be specific to youtube.com or youtu.be to avoid matching Spotify
// This must come AFTER processing video tags and links to avoid double-parsing
const bareYoutubeRegex = /(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)(?:\?[^"\s<>]*)?)/gi;
const youtubeReplacements: Array<{ match: string; replacement: string; index: number }> = [];
while ((match = bareYoutubeRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
// Check if it's already in a tag (link, iframe, video, etc.)
// Simple approach: check if we're inside quotes (attribute value) or between <tag and >
const before = processed.substring(Math.max(0, match.index - 500), match.index);
const after = processed.substring(match.index, match.index + match[0].length + 100);
// Check if URL is inside quotes (attribute value like src="..." or href="...")
const beforeContext = before.substring(Math.max(0, before.length - 100));
if (beforeContext.match(/<(iframe|video|a|img|audio|source)[^>]*\s+(src|href)="[^"]*$/i)) {
continue; // Inside an attribute value, skip
}
// Check if we're between an opening tag and its closing bracket
const lastOpenTag = before.lastIndexOf('<');
const lastCloseBracket = before.lastIndexOf('>');
if (lastOpenTag > lastCloseBracket) {
// We're inside a tag, check what kind
const tagContent = before.substring(lastOpenTag);
if (/<(iframe|video|a|img|audio|source)[^>]*$/i.test(tagContent)) {
continue; // Skip URLs inside these tags
}
}
const videoId = match[2];
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
youtubeReplacements.push({
match: match[0],
replacement: `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`,
index: match.index
});
}
youtubeReplacements.reverse().forEach(({ match, replacement, index }) => {
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
});
// Fix double-closed iframes (safety net)
processed = processed.replace(/<\/iframe><\/iframe>/gi, '</iframe>');
// Spotify: https://open.spotify.com/episode/ID or https://open.spotify.com/track/ID or https://open.spotify.com/album/ID
const spotifyLinkRegex = /<a[^>]+href="(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+))[^"]*"[^>]*>.*?<\/a>/gi;
processed = processed.replace(spotifyLinkRegex, (match, url, type, id) => {
if (isInCodeBlock(processed.indexOf(match))) return match;
const embedUrl = `https://open.spotify.com/embed/${type}/${id}`;
return `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`;
});
// Also handle bare Spotify URLs (not in links)
const bareSpotifyRegex = /(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)(?:\?[^"\s<>]*)?)/gi;
const spotifyReplacements: Array<{ match: string; replacement: string; index: number }> = [];
while ((match = bareSpotifyRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
// Check if it's already in a tag
const before = processed.substring(0, match.index);
const lastOpenTag = before.lastIndexOf('<');
const lastCloseTag = before.lastIndexOf('>');
if (lastOpenTag > lastCloseTag) continue; // Inside a tag
const type = match[2];
const id = match[3];
const embedUrl = `https://open.spotify.com/embed/${type}/${id}`;
spotifyReplacements.push({
match: match[0],
replacement: `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`,
index: match.index
});
}
spotifyReplacements.reverse().forEach(({ match, replacement, index }) => {
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
});
// Process bare image/media URLs that aren't already in tags
// First, convert bare links (class="bare") that contain image/video/audio URLs to actual media elements
// This handles cases where AsciiDoc has already converted URLs to links
// IMPORTANT: Check YouTube FIRST, then Spotify, BEFORE checking file extensions to avoid conflicts
const bareLinkRegex = /<a[^>]+href="(https?:\/\/[^"]+)"[^>]*class="[^"]*bare[^"]*"[^>]*>([^<]*)<\/a>/gi;
processed = processed.replace(bareLinkRegex, (match, url, linkText) => {
if (isInCodeBlock(processed.indexOf(match))) return match;
// Check YouTube URLs FIRST (be very specific - must be youtube.com or youtu.be)
// This prevents accidentally matching Spotify URLs
const youtubeMatch = url.match(/https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)/);
if (youtubeMatch) {
const videoId = youtubeMatch[1];
const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
return `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
}
// Check Spotify URLs (be very specific - must be open.spotify.com)
const spotifyMatch = url.match(/https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)/);
if (spotifyMatch) {
const type = spotifyMatch[1];
const id = spotifyMatch[2];
const embedUrl = `https://open.spotify.com/embed/${type}/${id}`;
return `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`;
}
// Check if it's an image URL
if (/\.(jpg|jpeg|png|gif|webp|svg|bmp)(\?|$)/i.test(url)) {
return `<img src="${escapeHtml(url)}" alt="${escapeHtml(linkText)}" class="bare-image" />`;
}
// Check if it's a video URL (but not YouTube)
if (/\.(mp4|webm|ogg|mov|avi)(\?|$)/i.test(url)) {
return `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`;
}
// Check if it's an audio URL (but not Spotify)
if (/\.(mp3|wav|ogg|flac|aac|m4a)(\?|$)/i.test(url)) {
return `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`;
}
// Not a media URL, return as-is
return match;
});
// Now process bare URLs that aren't in any tags at all
// IMPORTANT: Skip YouTube and Spotify URLs - they're already processed above
const imageUrlRegex = /(https?:\/\/[^\s<>"']+\.(jpg|jpeg|png|gif|webp|svg|bmp))(?![^<]*>)/gi;
const videoUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp4|webm|ogg|mov|avi))(?![^<]*>)/gi;
const audioUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp3|wav|ogg|flac|aac|m4a))(?![^<]*>)/gi;
// Check if URL is already in a tag
function isUrlInTag(url: string, index: number): boolean {
const before = processed.substring(0, index);
const after = processed.substring(index);
// Check if it's inside an existing tag
const lastOpenTag = before.lastIndexOf('<');
const lastCloseTag = before.lastIndexOf('>');
if (lastOpenTag > lastCloseTag) {
const tagContent = processed.substring(lastOpenTag, index + url.length);
if (/<(img|video|audio|a|source|iframe)[^>]*>/i.test(tagContent)) {
return true;
}
}
return false;
}
const mediaReplacements: Array<{ match: string; replacement: string; index: number }> = [];
// Process images
while ((match = imageUrlRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
if (isUrlInTag(match[0], match.index)) continue;
const url = match[0];
mediaReplacements.push({
match: url,
replacement: `<img src="${escapeHtml(url)}" alt="" class="bare-image" />`,
index: match.index
});
}
// Process videos (but skip YouTube URLs - they're handled above)
while ((match = videoUrlRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
if (isUrlInTag(match[0], match.index)) continue;
// Skip YouTube URLs - they should be embeds, not video tags
if (/youtube\.com|youtu\.be/i.test(match[0])) continue;
const url = match[0];
mediaReplacements.push({
match: url,
replacement: `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`,
index: match.index
});
}
// Process audio
while ((match = audioUrlRegex.exec(processed)) !== null) {
if (isInCodeBlock(match.index)) continue;
if (isUrlInTag(match[0], match.index)) continue;
const url = match[0];
mediaReplacements.push({
match: url,
replacement: `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`,
index: match.index
});
}
// Apply media replacements in reverse order
mediaReplacements.reverse().forEach(({ match, replacement, index }) => {
processed = processed.substring(0, index) + replacement + processed.substring(index + match.length);
});
return {
html: processed,
nostrLinks,
wikilinks,
hashtags
};
}
/**
* Get Nostr identifier type from bech32 string
*/
function getNostrType(bech32: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
if (bech32.startsWith('npub')) return 'npub';
if (bech32.startsWith('nprofile')) return 'nprofile';
if (bech32.startsWith('nevent')) return 'nevent';
if (bech32.startsWith('naddr')) return 'naddr';
if (bech32.startsWith('note')) return 'note';
return null;
}
/**
* Escape HTML special characters
*/
function escapeHtml(text: string): string {
const map: Record<string, string> = {
'&': '&amp;',
'<': '&lt;',
'>': '&gt;',
'"': '&quot;',
"'": '&#039;'
};
return text.replace(/[&<>"']/g, (m) => map[m]);
}

175
src/pre-processor.ts

@ -0,0 +1,175 @@ @@ -0,0 +1,175 @@
import { ParserOptions, Wikilink } from './types';
import * as emoji from 'node-emoji';
/**
* Pre-process raw content to handle wikilinks and hashtags before AsciiDoc conversion
* This prevents AsciiDoc from converting them to anchors or other formats
*/
export interface PreProcessResult {
content: string;
wikilinks: Wikilink[];
hashtags: string[];
}
/**
* Pre-process content to convert wikilinks and hashtags to placeholders
* that will be processed after HTML conversion
*/
export function preProcessAsciiDoc(content: string, options: ParserOptions): PreProcessResult {
let processed = content;
const wikilinks: Wikilink[] = [];
const hashtags: string[] = [];
// Process emojis first
processed = emoji.emojify(processed);
// Process wikilinks: [[dtag]] or [[dtag|display]]
// Replace with a placeholder that AsciiDoc won't touch
const wikilinkRegex = /\[\[([^\]]+)\]\]/g;
const wikilinkPlaceholders: Map<string, Wikilink> = new Map();
let placeholderCounter = 0;
processed = processed.replace(wikilinkRegex, (match, content) => {
const parts = content.split('|');
const dtag = parts[0].trim();
const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag;
const wikilink: Wikilink = {
dtag,
display,
original: match
};
wikilinks.push(wikilink);
// Use a unique placeholder that won't be processed by AsciiDoc
// Use angle brackets to avoid AsciiDoc formatting interpretation
const placeholder = `<WIKILINK_PLACEHOLDER_${placeholderCounter}>`;
wikilinkPlaceholders.set(placeholder, wikilink);
placeholderCounter++;
return placeholder;
});
// Process hashtags: #hashtag (but not in code blocks)
// Mark code blocks first
const codeBlockMarkers: Array<{ start: number; end: number }> = [];
const codeBlockRegex = /\[source,[^\]]+\]|\[abc\]|\[plantuml\]|```|`[^`]+`/g;
let match;
while ((match = codeBlockRegex.exec(processed)) !== null) {
// Find the end of the code block
const start = match.index;
let end = start + match[0].length;
// For source blocks, find the closing ----
if (match[0].startsWith('[source')) {
const afterStart = processed.substring(end);
const closeMatch = afterStart.match(/^[\s\S]*?----/);
if (closeMatch) {
end = start + match[0].length + closeMatch[0].length;
}
}
codeBlockMarkers.push({ start, end });
}
function isInCodeBlock(index: number): boolean {
return codeBlockMarkers.some(marker => index >= marker.start && index < marker.end);
}
// Process hashtags
const hashtagPlaceholders: Map<string, string> = new Map();
let hashtagCounter = 0;
// Match hashtags at start of line, after whitespace, or after > (for blockquotes)
const hashtagRegex = /(^|\s|>)(#[\w-]+)/gm;
processed = processed.replace(hashtagRegex, (match, prefix, hashtag, offset) => {
if (isInCodeBlock(offset)) return match;
const topic = hashtag.substring(1);
if (!hashtags.includes(topic)) {
hashtags.push(topic);
}
// Use angle brackets to avoid AsciiDoc formatting interpretation
const placeholder = `<HASHTAG_PLACEHOLDER_${hashtagCounter}>`;
hashtagPlaceholders.set(placeholder, topic);
hashtagCounter++;
return `${prefix}${placeholder}`;
});
return {
content: processed,
wikilinks,
hashtags
};
}
/**
* Restore wikilinks and hashtags from placeholders in HTML
*/
export function restorePlaceholders(
html: string,
wikilinks: Wikilink[],
hashtags: string[],
options: ParserOptions
): string {
let processed = html;
// Restore wikilinks (handle both escaped and unescaped placeholders)
const wikilinkPlaceholderRegex = /&lt;WIKILINK_PLACEHOLDER_(\d+)&gt;|<WIKILINK_PLACEHOLDER_(\d+)>/g;
processed = processed.replace(wikilinkPlaceholderRegex, (match, escapedIndex, unescapedIndex) => {
const index = escapedIndex !== undefined ? parseInt(escapedIndex) : parseInt(unescapedIndex);
const wikilink = wikilinks[index];
if (!wikilink) return match;
let url: string;
if (typeof options.wikilinkUrl === 'function') {
url = options.wikilinkUrl(wikilink.dtag);
} else if (typeof options.wikilinkUrl === 'string') {
url = options.wikilinkUrl.replace('{dtag}', encodeURIComponent(wikilink.dtag));
} else {
url = options.linkBaseURL
? `${options.linkBaseURL}/events?d=${encodeURIComponent(wikilink.dtag)}`
: `#${encodeURIComponent(wikilink.dtag)}`;
}
return `<a href="${escapeHtml(url)}" class="wikilink" data-dtag="${escapeHtml(wikilink.dtag)}">${escapeHtml(wikilink.display)}</a>`;
});
// Restore hashtags (handle both escaped and unescaped placeholders)
const hashtagPlaceholderRegex = /&lt;HASHTAG_PLACEHOLDER_(\d+)&gt;|<HASHTAG_PLACEHOLDER_(\d+)>/g;
processed = processed.replace(hashtagPlaceholderRegex, (match, escapedIndex, unescapedIndex) => {
const index = escapedIndex !== undefined ? parseInt(escapedIndex) : parseInt(unescapedIndex);
const topic = hashtags[index];
if (!topic) return match;
let url: string | undefined;
if (typeof options.hashtagUrl === 'function') {
url = options.hashtagUrl(topic);
} else if (typeof options.hashtagUrl === 'string') {
url = options.hashtagUrl.replace('{topic}', encodeURIComponent(topic));
}
const hashtag = `#${topic}`;
if (url) {
return `<a href="${escapeHtml(url)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</a>`;
} else {
return `<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</span>`;
}
});
return processed;
}
function escapeHtml(text: string): string {
const map: Record<string, string> = {
'&': '&amp;',
'<': '&lt;',
'>': '&gt;',
'"': '&quot;',
"'": '&#039;'
};
return text.replace(/[&<>"']/g, (m) => map[m]);
}

148
src/processors/asciidoc.js

@ -1,148 +0,0 @@ @@ -1,148 +0,0 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.processAsciidoc = processAsciidoc;
const core_1 = __importDefault(require("@asciidoctor/core"));
const html_utils_1 = require("./html-utils");
const html_postprocess_1 = require("./html-postprocess");
const asciidoctorInstance = (0, core_1.default)();
/**
* Processes AsciiDoc content to HTML using AsciiDoctor
* Uses AsciiDoctor's built-in highlight.js and LaTeX support
*/
async function processAsciidoc(content, options = {}) {
const { enableCodeHighlighting = true, enableLaTeX = true, enableMusicalNotation = true, } = options;
// Check if content starts with level 3+ headers
// Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
// If content starts with level 3+, use book doctype
const firstHeaderMatch = content.match(/^(={1,6})\s+/m);
let doctype = 'article';
if (firstHeaderMatch) {
const firstHeaderLevel = firstHeaderMatch[1].length;
if (firstHeaderLevel >= 3) {
doctype = 'book';
}
}
try {
const result = asciidoctorInstance.convert(content, {
safe: 'safe',
backend: 'html5',
doctype: doctype,
attributes: {
'showtitle': true,
'sectanchors': true,
'sectlinks': true,
'toc': 'left',
'toclevels': 6,
'toc-title': 'Table of Contents',
'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none',
'stem': enableLaTeX ? 'latexmath' : 'none',
'plantuml': 'plantuml', // Enable PlantUML diagram support
'data-uri': true,
'imagesdir': '',
'linkcss': false,
'stylesheet': '',
'stylesdir': '',
'prewrap': true,
'sectnums': false,
'sectnumlevels': 6,
'experimental': true,
'compat-mode': false,
'attribute-missing': 'warn',
'attribute-undefined': 'warn',
'skip-front-matter': true,
'source-indent': 0,
'indent': 0,
'tabsize': 2,
'tabwidth': 2,
'hardbreaks': false,
'paragraph-rewrite': 'normal',
'sectids': true,
'idprefix': '',
'idseparator': '-',
'sectidprefix': '',
'sectidseparator': '-'
}
});
const htmlString = typeof result === 'string' ? result : result.toString();
// Extract table of contents from HTML
const { toc, contentWithoutTOC } = (0, html_utils_1.extractTOC)(htmlString);
// Sanitize HTML to prevent XSS
const sanitized = (0, html_utils_1.sanitizeHTML)(contentWithoutTOC);
// Post-process HTML: convert macros to HTML, add styling, etc.
const processed = (0, html_postprocess_1.postProcessHtml)(sanitized, {
enableMusicalNotation,
linkBaseURL: options.linkBaseURL,
wikilinkUrl: options.wikilinkUrl,
hashtagUrl: options.hashtagUrl,
});
// Process links: add target="_blank" to external links
const processedWithLinks = options.linkBaseURL
? (0, html_utils_1.processLinks)(processed, options.linkBaseURL)
: processed;
// Also process TOC
const tocSanitized = (0, html_utils_1.sanitizeHTML)(toc);
const tocProcessed = (0, html_postprocess_1.postProcessHtml)(tocSanitized, {
enableMusicalNotation: false, // Don't process music in TOC
linkBaseURL: options.linkBaseURL,
wikilinkUrl: options.wikilinkUrl,
hashtagUrl: options.hashtagUrl,
});
// Process links in TOC as well
const tocProcessedWithLinks = options.linkBaseURL
? (0, html_utils_1.processLinks)(tocProcessed, options.linkBaseURL)
: tocProcessed;
// Check for LaTeX in original content (more reliable than checking HTML)
const contentToCheck = options.originalContent || content;
const hasLaTeX = enableLaTeX && hasMathContent(contentToCheck);
// Check for musical notation in processed HTML
const hasMusicalNotation = enableMusicalNotation && (/class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/.test(processed));
return {
content: processedWithLinks,
tableOfContents: tocProcessedWithLinks,
hasLaTeX,
hasMusicalNotation,
nostrLinks: [], // Will be populated by metadata extraction
wikilinks: [],
hashtags: [],
links: [],
media: [],
};
}
catch (error) {
// Fallback to plain text with error logging
const errorMessage = error instanceof Error ? error.message : String(error);
// Use process.stderr.write for Node.js compatibility instead of console.error
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const nodeProcess = globalThis.process;
if (nodeProcess?.stderr) {
nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
}
// Escape HTML in content for safe display
const escapedContent = (0, html_utils_1.sanitizeHTML)(content);
return {
content: `<p>${escapedContent}</p>`,
tableOfContents: '',
hasLaTeX: false,
hasMusicalNotation: false,
nostrLinks: [],
wikilinks: [],
hashtags: [],
links: [],
media: [],
};
}
}
/**
* Check if content has LaTeX math
* Based on jumble's detection pattern
*/
function hasMathContent(content) {
// Check for inline math: $...$ or \(...\)
const inlineMath = /\$[^$]+\$|\\\([^)]+\\\)/.test(content);
// Check for block math: $$...$$ or \[...\]
const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content);
return inlineMath || blockMath;
}

209
src/processors/asciidoc.ts

@ -1,193 +1,56 @@ @@ -1,193 +1,56 @@
import { ProcessResult } from '../types';
import { extractTOC, sanitizeHTML, processLinks } from './html-utils';
import { postProcessHtml } from './html-postprocess';
import asciidoctor from '@asciidoctor/core';
import { ParserOptions } from '../types';
import * as emoji from 'node-emoji';
// Lazy-load AsciiDoctor instance to avoid issues with Jest module transformation
// Use require() for CommonJS modules to avoid Jest transformation issues
let asciidoctorInstance: any = null;
function getAsciidoctorInstance() {
if (!asciidoctorInstance) {
// Use require() instead of import() to avoid Jest transformation issues with Opal runtime
// eslint-disable-next-line @typescript-eslint/no-require-imports
const asciidoctor = require('@asciidoctor/core');
asciidoctorInstance = asciidoctor.default();
}
return asciidoctorInstance;
}
export interface ProcessOptions {
enableCodeHighlighting?: boolean;
enableLaTeX?: boolean;
enableMusicalNotation?: boolean;
originalContent?: string; // Original content for LaTeX detection
linkBaseURL?: string; // Base URL for link processing
wikilinkUrl?: string | ((dtag: string) => string); // Custom URL format for wikilinks
hashtagUrl?: string | ((topic: string) => string); // Custom URL format for hashtags
export interface AsciiDocResult {
html: string;
tableOfContents: string;
hasLaTeX: boolean;
hasMusicalNotation: boolean;
}
/**
* Processes AsciiDoc content to HTML using AsciiDoctor
* Uses AsciiDoctor's built-in highlight.js and LaTeX support
* Process AsciiDoc content to HTML
*/
export async function processAsciidoc(
content: string,
options: ProcessOptions = {}
): Promise<ProcessResult> {
const {
enableCodeHighlighting = true,
enableLaTeX = true,
enableMusicalNotation = true,
} = options;
export function processAsciiDoc(content: string, options: ParserOptions): AsciiDocResult {
const hasLaTeX = /\[source,latex\]|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content);
const hasMusicalNotation = /\[abc\]|\[source,abc\]/i.test(content);
// Check if content starts with level 3+ headers
// Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
// If content starts with level 3+, use book doctype
const firstHeaderMatch = content.match(/^(={1,6})\s+/m);
let doctype: 'article' | 'book' = 'article';
// Process emojis before AsciiDoc conversion
const processedContent = emoji.emojify(content);
if (firstHeaderMatch) {
const firstHeaderLevel = firstHeaderMatch[1].length;
if (firstHeaderLevel >= 3) {
doctype = 'book';
}
}
try {
const instance = getAsciidoctorInstance();
const result = instance.convert(content, {
safe: 'safe',
backend: 'html5',
doctype: doctype,
const asciidoctorOptions: any = {
safe: 'unsafe',
attributes: {
'showtitle': true,
'sectanchors': true,
'sectlinks': true,
'icons': 'font',
'source-highlighter': options.enableCodeHighlighting !== false ? 'highlight.js' : undefined,
'highlightjs-theme': 'github',
'toc': 'left',
'toclevels': 6,
'toc-title': 'Table of Contents',
'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none',
'stem': enableLaTeX ? 'latexmath' : 'none',
'plantuml': 'plantuml', // Enable PlantUML diagram support
'data-uri': true,
'imagesdir': '',
'linkcss': false,
'stylesheet': '',
'stylesdir': '',
'prewrap': true,
'sectnums': false,
'sectnumlevels': 6,
'experimental': true,
'compat-mode': false,
'attribute-missing': 'warn',
'attribute-undefined': 'warn',
'skip-front-matter': true,
'source-indent': 0,
'indent': 0,
'tabsize': 2,
'tabwidth': 2,
'hardbreaks': false,
'paragraph-rewrite': 'normal',
'sectids': true,
'idprefix': '',
'idseparator': '-',
'sectidprefix': '',
'sectidseparator': '-'
'sectanchors': true,
'sectlinks': true,
'idprefix': '_',
'idseparator': '_'
}
});
const htmlString = typeof result === 'string' ? result : result.toString();
// Extract table of contents from HTML
const { toc, contentWithoutTOC } = extractTOC(htmlString);
// Sanitize HTML to prevent XSS
const sanitized = sanitizeHTML(contentWithoutTOC);
// Post-process HTML: convert macros to HTML, add styling, etc.
const processed = postProcessHtml(sanitized, {
enableMusicalNotation,
linkBaseURL: options.linkBaseURL,
wikilinkUrl: options.wikilinkUrl,
hashtagUrl: options.hashtagUrl,
});
// Process links: add target="_blank" to external links
const processedWithLinks = options.linkBaseURL
? processLinks(processed, options.linkBaseURL)
: processed;
// Also process TOC
const tocSanitized = sanitizeHTML(toc);
const tocProcessed = postProcessHtml(tocSanitized, {
enableMusicalNotation: false, // Don't process music in TOC
linkBaseURL: options.linkBaseURL,
wikilinkUrl: options.wikilinkUrl,
hashtagUrl: options.hashtagUrl,
});
};
// Process links in TOC as well
const tocProcessedWithLinks = options.linkBaseURL
? processLinks(tocProcessed, options.linkBaseURL)
: tocProcessed;
// Convert to HTML
const Asciidoctor = asciidoctor();
const htmlResult = Asciidoctor.convert(processedContent, asciidoctorOptions);
const html = typeof htmlResult === 'string' ? htmlResult : htmlResult.toString();
// Check for LaTeX in original content (more reliable than checking HTML)
const contentToCheck = options.originalContent || content;
const hasLaTeX = enableLaTeX && hasMathContent(contentToCheck);
// Extract table of contents if present
const tocMatch = html.match(/<div id="toc"[^>]*>([\s\S]*?)<\/div>/);
const tableOfContents = tocMatch ? tocMatch[1] : '';
// Check for musical notation in processed HTML
const hasMusicalNotation = enableMusicalNotation && (
/class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/.test(processed)
);
// Remove TOC from main content if present
const contentWithoutToc = html.replace(/<div id="toc"[^>]*>[\s\S]*?<\/div>/, '');
return {
content: processedWithLinks,
tableOfContents: tocProcessedWithLinks,
html: contentWithoutToc,
tableOfContents,
hasLaTeX,
hasMusicalNotation,
nostrLinks: [], // Will be populated by metadata extraction
wikilinks: [],
hashtags: [],
links: [],
media: [],
};
} catch (error) {
// Fallback to plain text with error logging
const errorMessage = error instanceof Error ? error.message : String(error);
// Use process.stderr.write for Node.js compatibility instead of console.error
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const nodeProcess = (globalThis as any).process;
if (nodeProcess?.stderr) {
nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
}
// Escape HTML in content for safe display
const escapedContent = sanitizeHTML(content);
return {
content: `<p>${escapedContent}</p>`,
tableOfContents: '',
hasLaTeX: false,
hasMusicalNotation: false,
nostrLinks: [],
wikilinks: [],
hashtags: [],
links: [],
media: [],
hasMusicalNotation
};
}
}
/**
* Check if content has LaTeX math
* Based on jumble's detection pattern
*/
function hasMathContent(content: string): boolean {
// Check for inline math: $...$ or \(...\)
const inlineMath = /\$[^$]+\$|\\\([^)]+\\\)/.test(content);
// Check for block math: $$...$$ or \[...\]
const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content);
return inlineMath || blockMath;
}

693
src/processors/html-postprocess.js

@ -1,693 +0,0 @@ @@ -1,693 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.postProcessHtml = postProcessHtml;
const music_1 = require("./music");
/**
* Post-processes HTML output from AsciiDoctor
* Converts AsciiDoc macros to HTML with data attributes and CSS classes
*/
function postProcessHtml(html, options = {}) {
let processed = html;
// Convert bookstr markers to HTML placeholders
processed = processed.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => {
const escaped = bookContent.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`;
});
// Convert hashtag links to HTML
processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
// HTML escape the display text
const escapedDisplay = displayText
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// If hashtagUrl is configured, make it a clickable link
if (options.hashtagUrl) {
let url;
if (typeof options.hashtagUrl === 'function') {
url = options.hashtagUrl(normalizedHashtag);
}
else {
// String template with {topic} placeholder
url = options.hashtagUrl.replace(/{topic}/g, normalizedHashtag);
}
// Escape URL for HTML attribute
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${normalizedHashtag.replace(/"/g, '&quot;')}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
}
else {
// Default: Use span instead of <a> tag - same color as links but no underline and not clickable
return `<span class="hashtag-link">${escapedDisplay}</span>`;
}
});
// Convert WIKILINK:dtag|display placeholder format to HTML
// Match WIKILINK:dtag|display, ensuring we don't match across HTML tags
processed = processed.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => {
const escapedDtag = dTag.trim().replace(/"/g, '&quot;');
const escapedDisplay = displayText.trim()
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Generate URL using custom format or default
let url;
if (options.wikilinkUrl) {
if (typeof options.wikilinkUrl === 'function') {
url = options.wikilinkUrl(dTag.trim());
}
else {
// String template with {dtag} placeholder
url = options.wikilinkUrl.replace(/{dtag}/g, dTag.trim());
}
}
else {
// Default format
url = `/events?d=${escapedDtag}`;
}
// Escape URL for HTML attribute
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
});
// Convert any leftover link: macros that AsciiDoctor didn't convert
// This MUST run before processOpenGraphLinks which removes "link:" prefixes
// This handles cases where AsciiDoctor couldn't parse the link (e.g., link text with special chars)
// Pattern: link:url[text] where url is http/https and text can contain any characters
// Match link: macros that are still in the HTML as plain text (not converted by AsciiDoctor)
// Also handle HTML-escaped versions that might appear
processed = processed.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => {
// Unescape if already HTML-escaped (but be careful not to unescape actual content)
let unescapedUrl = url;
// Only unescape if it looks like it was escaped (contains &amp; or &quot;)
if (url.includes('&amp;') || url.includes('&quot;') || url.includes('&#39;')) {
unescapedUrl = url
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
}
let unescapedText = text;
// Only unescape if it looks like it was escaped
if (text.includes('&amp;') || text.includes('&lt;') || text.includes('&gt;') || text.includes('&quot;') || text.includes('&#39;')) {
unescapedText = text
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
}
// Escape URL for HTML attribute (fresh escape, no double-escaping)
const escapedUrl = unescapedUrl
.replace(/&/g, '&amp;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Escape text content for HTML (fresh escape, no double-escaping)
const escapedText = unescapedText
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Check if link text contains wss:// or ws:// - these are relay URLs, don't add OpenGraph
const isRelayUrl = /wss?:\/\//i.test(unescapedText);
if (isRelayUrl) {
// Simple link without OpenGraph wrapper
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
}
else {
// Regular link - will be processed by OpenGraph handler if external
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
}
});
// Convert nostr: links to HTML
processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
const nostrType = getNostrType(bech32Id);
if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
// Render as embedded event placeholder
const escaped = bech32Id.replace(/"/g, '&quot;');
return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`;
}
else if (nostrType === 'npub' || nostrType === 'nprofile') {
// Render as user handle
const escaped = bech32Id.replace(/"/g, '&quot;');
return `<span class="user-handle" data-pubkey="${escaped}">@${displayText}</span>`;
}
else {
// Fallback to regular link
const escaped = bech32Id.replace(/"/g, '&quot;');
return `<a href="nostr:${bech32Id}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${displayText}</a>`;
}
});
// Process media URLs (YouTube, Spotify, video, audio)
processed = processMedia(processed);
// Fix double-escaped quotes in href attributes FIRST (before any other processing)
// This fixes href="&quot;url&quot;" -> href="url"
processed = processed.replace(/href\s*=\s*["']&quot;(https?:\/\/[^"']+)&quot;["']/gi, (_match, url) => {
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `href="${escapedUrl}"`;
});
// Process OpenGraph links (external links that should have rich previews)
processed = processOpenGraphLinks(processed, options.linkBaseURL);
// Process images: add max-width styling and data attributes
processed = processImages(processed);
// Process musical notation if enabled
if (options.enableMusicalNotation) {
processed = (0, music_1.processMusicalNotation)(processed);
}
// Clean up any escaped HTML that appears as text (e.g., &lt;a href=...&gt;)
// This can happen when AsciiDoctor escapes link macros that it couldn't parse
// Pattern: &lt;a href="url"&gt;text&lt;/a&gt; should be converted to actual HTML
// Use a more flexible pattern that handles text with special characters like ://
// Fix regular escaped HTML links
processed = processed.replace(/&lt;a\s+href=["'](https?:\/\/[^"']+)["']\s*&gt;([^<]+)&lt;\/a&gt;/gi, (_match, url, text) => {
// Unescape the URL and text
const unescapedUrl = url
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
const unescapedText = text
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>');
// Re-escape properly for HTML
const escapedUrl = unescapedUrl
.replace(/&/g, '&amp;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
const escapedText = unescapedText
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;');
// Check if link text contains wss:// or ws:// - these are relay URLs
const isRelayUrl = /wss?:\/\//i.test(unescapedText);
if (isRelayUrl) {
// Simple link without OpenGraph wrapper
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
}
else {
// Regular link
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
}
});
// Clean up any leftover markdown syntax
processed = cleanupMarkdown(processed);
// Add styling classes
processed = addStylingClasses(processed);
// Hide raw ToC text
processed = hideRawTocText(processed);
return processed;
}
/**
* Get Nostr identifier type
*/
function getNostrType(id) {
if (id.startsWith('npub'))
return 'npub';
if (id.startsWith('nprofile'))
return 'nprofile';
if (id.startsWith('nevent'))
return 'nevent';
if (id.startsWith('naddr'))
return 'naddr';
if (id.startsWith('note'))
return 'note';
return null;
}
/**
* Process media URLs (YouTube, Spotify, video, audio)
* Converts MEDIA: placeholders to HTML embeds/players
*/
function processMedia(html) {
let processed = html;
// Process YouTube embeds
processed = processed.replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, (_match, videoId) => {
const escapedId = videoId.replace(/"/g, '&quot;');
return `<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">
<iframe
style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
src="https://www.youtube.com/embed/${escapedId}"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
allowfullscreen
loading="lazy">
</iframe>
</div>`;
});
// Process Spotify embeds
processed = processed.replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, (_match, type, id) => {
const escapedType = type.replace(/"/g, '&quot;');
const escapedId = id.replace(/"/g, '&quot;');
return `<div class="media-embed spotify-embed" style="margin: 1rem 0;">
<iframe
style="border-radius: 12px; width: 100%; max-width: 100%;"
src="https://open.spotify.com/embed/${escapedType}/${escapedId}?utm_source=generator"
width="100%"
height="352"
frameborder="0"
allowfullscreen=""
allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"
loading="lazy">
</iframe>
</div>`;
});
// Process video files
processed = processed.replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
const escapedUrl = url
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
return `<div class="media-embed video-embed" style="margin: 1rem 0;">
<video
controls
preload="metadata"
style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;"
class="media-player">
<source src="${escapedUrl}" type="video/mp4">
Your browser does not support the video tag.
</video>
</div>`;
});
// Process audio files
processed = processed.replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
const escapedUrl = url
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
return `<div class="media-embed audio-embed" style="margin: 1rem 0;">
<audio
controls
preload="metadata"
style="width: 100%; max-width: 100%;"
class="media-player">
<source src="${escapedUrl}">
Your browser does not support the audio tag.
</audio>
</div>`;
});
return processed;
}
/**
* Process OpenGraph links - mark external links for OpenGraph preview fetching
*/
function processOpenGraphLinks(html, linkBaseURL) {
// First, clean up any corrupted HTML fragments that might interfere
// Remove "link:" prefixes that appear before links (AsciiDoc syntax that shouldn't be in HTML)
// This happens when AsciiDoctor doesn't fully convert link:url[text] syntax or when
// there's literal text like "should render like link:" before an anchor tag
let processed = html;
// Remove "link:" that appears immediately before anchor tags (most common case)
// Match "link:" followed by optional whitespace and then <a
processed = processed.replace(/link:\s*<a/gi, '<a');
// Remove "link:" that appears as plain text in HTML (shouldn't be there)
// Be careful not to match "link:" inside HTML attributes or tags
// Match "link:" that's not inside quotes or tags
processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2');
// Also handle cases where "link:" appears with whitespace before anchor tags
processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' ');
// Clean up any corrupted href attributes that contain HTML fragments or double-escaped quotes
// Fix href attributes with escaped quotes: href="&quot;url&quot;" -> href="url"
processed = processed.replace(/href\s*=\s*["']&quot;(https?:\/\/[^"']+)&quot;["']/gi, (match, url) => {
// Extract the clean URL and properly escape it
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `href="${escapedUrl}"`;
});
// Clean up href attributes that contain HTML fragments
processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => {
// If href contains HTML tags, extract just the URL part
const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i);
if (urlMatch) {
const escapedUrl = urlMatch[1].replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `href="${escapedUrl}"`;
}
return match; // If we can't fix it, leave it (will be skipped by validation)
});
// Clean up any malformed anchor tag fragments that might cause issues
processed = processed.replace(/<a\s+href=["']([^"'>]*<[^"'>]*)["']/gi, (match, corruptedHref) => {
// Skip corrupted anchor tags - they'll be handled by the main regex with validation
return match;
});
// Clean up links inside code blocks - AsciiDoctor creates them but they should be plain text
// Remove <a> tags inside <code> blocks, keeping only the link text
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match, content) => {
// Remove any <a> tags inside code blocks, keeping only the text content
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
return `<code>${cleaned}</code>`;
});
// Also clean up links inside pre blocks
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match, content) => {
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
return `<pre>${cleaned}</pre>`;
});
// Now protect code blocks and pre blocks by replacing them with placeholders
const codeBlockPlaceholders = [];
const preBlockPlaceholders = [];
// Replace pre blocks first (they can contain code blocks)
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => {
const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`;
preBlockPlaceholders.push(match);
return placeholder;
});
// Replace code blocks
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => {
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
codeBlockPlaceholders.push(match);
return placeholder;
});
// Extract base domain from linkBaseURL if provided
let baseDomain = null;
if (linkBaseURL) {
try {
const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
if (urlMatch) {
baseDomain = urlMatch[1];
}
}
catch {
// Ignore parsing errors
}
}
// Before processing, remove any corrupted opengraph containers that might have been created
// These have malformed data-og-url attributes containing HTML fragments
// Match all spans with data-og-url and check if they're corrupted
// Use a pattern that matches spans with data-og-url, then check the attribute value
processed = processed.replace(/<span[^>]*data-og-url=["']([^"']+)["'][^>]*>[\s\S]*?<\/span>/gi, (match) => {
// This span has a corrupted data-og-url (contains <)
// Extract the clean URL from the beginning of the attribute value
const dataOgUrlMatch = match.match(/data-og-url=["']([^"']+)["']/i);
if (dataOgUrlMatch && dataOgUrlMatch[1]) {
// Extract just the URL part (everything before the first <)
const urlMatch = dataOgUrlMatch[1].match(/(https?:\/\/[^\s<>"']+)/i);
if (urlMatch) {
const cleanUrl = urlMatch[1];
// Extract the link text from inside the span
const linkMatch = match.match(/<a[^>]*>(.*?)<\/a>/i);
const linkText = linkMatch ? linkMatch[1] : cleanUrl;
// Return a clean opengraph container with the fixed URL
const escapedUrl = cleanUrl.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
<div class="opengraph-preview" data-og-loading="true" style="display: none;">
<div class="opengraph-card">
<div class="opengraph-image-container">
<img class="opengraph-image" src="" alt="" style="display: none;" />
</div>
<div class="opengraph-content">
<div class="opengraph-site"></div>
<div class="opengraph-title"></div>
<div class="opengraph-description"></div>
</div>
</div>
</div>
</span>`;
}
// If we can't extract a clean URL, just remove the corrupted span and keep any text
const textMatch = match.match(/>([^<]+)</);
return textMatch ? textMatch[1] : '';
}
return match; // Keep valid spans
});
// Match external links (http/https) that aren't media, nostr, or wikilinks
// Skip links that are already in media embeds or special containers
// Use a stricter regex that only matches valid, complete anchor tags
// The regex must match a complete <a> tag with proper structure
processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => {
// CRITICAL: Validate href FIRST - if it contains ANY HTML tags or fragments, skip immediately
// This prevents corrupted HTML from being created
if (!href) {
return match; // Skip if no href
}
// Skip if href contains HTML tags or looks corrupted - be very strict
// Check for common HTML fragments that indicate corruption
if (href.includes('<') || href.includes('>') || href.includes('href=') || href.includes('</a>') || href.includes('<a') || href.includes('"') || href.includes("'")) {
return match; // Skip if href looks corrupted
}
// Additional validation: href should only contain URL-safe characters
// URLs shouldn't contain unescaped quotes or HTML tags
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) {
return match; // Skip if href doesn't match clean URL pattern
}
// Validate href is a proper URL (starts with http:// or https:// and doesn't contain invalid chars)
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) {
return match; // Skip if href doesn't match URL pattern
}
// Skip if the match contains unclosed tags or corrupted HTML
const openATags = (match.match(/<a\s/g) || []).length;
const closeATags = (match.match(/<\/a>/g) || []).length;
if (openATags !== closeATags || openATags !== 1) {
return match; // Multiple or mismatched <a> tags = corrupted
}
// Skip if match contains nested HTML that looks corrupted
if (match.includes('href="') && match.split('href="').length > 2) {
return match; // Multiple href attributes = corrupted
}
// Skip if it's already a media embed, nostr link, wikilink, or opengraph link
if (match.includes('class="wikilink"') ||
match.includes('class="nostr-link"') ||
match.includes('class="opengraph-link"') ||
match.includes('data-embedded-note') ||
match.includes('youtube-embed') ||
match.includes('spotify-embed') ||
match.includes('media-embed') ||
match.includes('opengraph-link-container')) {
return match;
}
// Skip if it's a media file URL
if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) {
return match;
}
// Skip if it's YouTube or Spotify (already handled as media)
if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) {
return match;
}
// Skip if link text contains wss:// or ws:// - these are relay URLs, not web pages
// They don't need OpenGraph previews
if (/wss?:\/\//i.test(linkText)) {
return match;
}
// Check if it's an external link (not same domain)
let isExternal = true;
if (baseDomain) {
try {
const hrefMatch = href.match(/^https?:\/\/([^\/]+)/);
if (hrefMatch && hrefMatch[1] === baseDomain) {
isExternal = false;
}
}
catch {
// If parsing fails, assume external
}
}
// Only process external links
if (!isExternal) {
return match;
}
// Escape the URL for data attribute
const escapedUrl = href
.replace(/&/g, '&amp;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Add data attribute for OpenGraph fetching and wrap in container
// The actual OpenGraph fetching will be done client-side via JavaScript
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
<div class="opengraph-preview" data-og-loading="true" style="display: none;">
<div class="opengraph-card">
<div class="opengraph-image-container">
<img class="opengraph-image" src="" alt="" style="display: none;" />
</div>
<div class="opengraph-content">
<div class="opengraph-site"></div>
<div class="opengraph-title"></div>
<div class="opengraph-description"></div>
</div>
</div>
</div>
</span>`;
});
// Restore code blocks
codeBlockPlaceholders.forEach((codeBlock, index) => {
processed = processed.replace(`__CODEBLOCK_${index}__`, codeBlock);
});
// Restore pre blocks
preBlockPlaceholders.forEach((preBlock, index) => {
processed = processed.replace(`__PREBLOCK_${index}__`, preBlock);
});
return processed;
}
/**
* Process images: add max-width styling and data attributes
*/
function processImages(html) {
const imageUrls = [];
const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
let match;
while ((match = imageUrlRegex.exec(html)) !== null) {
const url = match[1];
if (url && !imageUrls.includes(url)) {
imageUrls.push(url);
}
}
return html.replace(/<img([^>]+)>/gi, (imgTag, attributes) => {
const srcMatch = attributes.match(/src=["']([^"']+)["']/i);
if (!srcMatch)
return imgTag;
const src = srcMatch[1];
const currentIndex = imageUrls.indexOf(src);
let updatedAttributes = attributes;
if (updatedAttributes.match(/class=["']/i)) {
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => {
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
const newClasses = cleanedClasses
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
: 'max-w-[400px] object-contain cursor-zoom-in';
return `class="${newClasses}"`;
});
}
else {
updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`;
}
updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '&quot;')}"`;
return `<img${updatedAttributes}>`;
});
}
/**
* Clean URL by removing tracking parameters
* Based on jumble's cleanUrl function
*/
function cleanUrl(url) {
try {
const parsedUrl = new URL(url);
// List of tracking parameter prefixes and exact names to remove
const trackingParams = [
// Google Analytics & Ads
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
// Facebook
'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
// Twitter/X
'twclid', 'twsrc',
// Microsoft/Bing
'msclkid', 'mc_cid', 'mc_eid',
// Adobe
'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
// Mailchimp
'mc_cid', 'mc_eid',
// HubSpot
'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
// Marketo
'mkt_tok',
// YouTube
'si', 'feature', 'kw', 'pp',
// Other common tracking
'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
// Mobile app tracking
'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
// Amazon
'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
// Affiliate tracking
'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
// Social media share tracking
'share', 'shared', 'sharesource'
];
// Remove all tracking parameters
trackingParams.forEach(param => {
parsedUrl.searchParams.delete(param);
});
// Remove any parameter that starts with utm_ or _
Array.from(parsedUrl.searchParams.keys()).forEach(key => {
if (key.startsWith('utm_') || key.startsWith('_')) {
parsedUrl.searchParams.delete(key);
}
});
return parsedUrl.toString();
}
catch {
// If URL parsing fails, return original URL
return url;
}
}
/**
* Clean up leftover markdown syntax
*/
function cleanupMarkdown(html) {
let cleaned = html;
// Clean up markdown image syntax
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
const altText = alt || '';
// Clean URL (remove tracking parameters)
const cleanedUrl = cleanUrl(url);
// Escape for HTML attribute
const escapedUrl = cleanedUrl.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<img src="${escapedUrl}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`;
});
// Clean up markdown link syntax
// Skip if the link is already inside an HTML tag or is part of escaped HTML
cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
// Skip if this markdown link is already inside an HTML tag
// Check if there's an <a> tag nearby that might have been created from this
if (cleaned.includes(`href="${url}"`) || cleaned.includes(`href='${url}'`)) {
return _match;
}
// Skip if the text contains HTML entities or looks like it's already processed
if (text.includes('&lt;') || text.includes('&gt;') || text.includes('&amp;')) {
return _match;
}
// Skip if the URL is already in an href attribute (check for escaped versions too)
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
if (cleaned.includes(`href="${escapedUrl}"`) || cleaned.includes(`href='${escapedUrl}'`)) {
return _match;
}
// Clean URL (remove tracking parameters)
const cleanedUrl = cleanUrl(url);
// Escape for HTML attribute (but don't double-escape)
const finalEscapedUrl = cleanedUrl
.replace(/&amp;/g, '&') // Unescape if already escaped
.replace(/&/g, '&amp;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Escape text for HTML (but don't double-escape)
const escapedText = text
.replace(/&amp;/g, '&') // Unescape if already escaped
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;');
return `<a href="${finalEscapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
});
return cleaned;
}
/**
* Add proper CSS classes for styling
*/
function addStylingClasses(html) {
let styled = html;
// Add strikethrough styling
styled = styled.replace(/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>');
// Add subscript styling
styled = styled.replace(/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>');
// Add superscript styling
styled = styled.replace(/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>');
// Add code highlighting classes
styled = styled.replace(/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">');
styled = styled.replace(/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">');
return styled;
}
/**
* Hide raw AsciiDoc ToC text
*/
function hideRawTocText(html) {
let cleaned = html;
cleaned = cleaned.replace(/<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi, '');
cleaned = cleaned.replace(/<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi, '');
cleaned = cleaned.replace(/<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi, '');
return cleaned;
}

599
src/processors/html-postprocess.ts

@ -1,599 +0,0 @@ @@ -1,599 +0,0 @@
import { processMusicalNotation } from './music';
export interface PostProcessOptions {
enableMusicalNotation?: boolean;
linkBaseURL?: string;
/** Custom URL format for wikilinks */
wikilinkUrl?: string | ((dtag: string) => string);
/** Custom URL format for hashtags */
hashtagUrl?: string | ((topic: string) => string);
}
/**
* Post-processes HTML output from AsciiDoctor
*
* Processing order (critical for correct rendering):
* 1. Convert placeholders to HTML (BOOKSTR, hashtags, wikilinks, nostr links, media, link macros)
* 2. Fix corrupted HTML (double-escaped quotes, escaped HTML as text, broken links)
* 3. Process OpenGraph links (external links with previews)
* 4. Process images (add styling)
* 5. Process musical notation
* 6. Clean up leftover markdown syntax
* 7. Add styling classes
* 8. Hide raw ToC text
*/
export function postProcessHtml(html: string, options: PostProcessOptions = {}): string {
let processed = html;
// ============================================
// STEP 1: Convert placeholders to HTML
// ============================================
processed = convertBookstrMarkers(processed);
processed = convertHashtags(processed, options);
processed = convertWikilinks(processed, options);
processed = convertNostrLinks(processed);
processed = convertMediaPlaceholders(processed);
processed = convertLinkMacros(processed);
// ============================================
// STEP 2: Fix corrupted HTML
// ============================================
processed = fixDoubleEscapedQuotes(processed);
processed = fixEscapedHtmlLinks(processed);
processed = fixBrokenLinkPatterns(processed);
// ============================================
// STEP 3: Process OpenGraph links
// ============================================
processed = processOpenGraphLinks(processed, options.linkBaseURL);
// ============================================
// STEP 4: Process images
// ============================================
processed = processImages(processed);
// ============================================
// STEP 5: Process musical notation
// ============================================
if (options.enableMusicalNotation) {
processed = processMusicalNotation(processed);
}
// ============================================
// STEP 6: Clean up leftover markdown
// ============================================
processed = cleanupMarkdown(processed);
// ============================================
// STEP 7: Add styling classes
// ============================================
processed = addStylingClasses(processed);
// ============================================
// STEP 8: Hide raw ToC text
// ============================================
processed = hideRawTocText(processed);
return processed;
}
// ============================================
// STEP 1: Convert placeholders to HTML
// ============================================
/**
* Convert BOOKSTR markers to HTML placeholders
*/
function convertBookstrMarkers(html: string): string {
return html.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => {
const escaped = escapeHtmlAttr(bookContent);
return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`;
});
}
/**
* Convert hashtag placeholders to HTML
*/
function convertHashtags(html: string, options: PostProcessOptions): string {
return html.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
const escapedDisplay = escapeHtml(displayText);
if (options.hashtagUrl) {
let url: string;
if (typeof options.hashtagUrl === 'function') {
url = options.hashtagUrl(normalizedHashtag);
} else {
url = options.hashtagUrl.replace(/{topic}/g, normalizedHashtag);
}
const escapedUrl = escapeHtmlAttr(url);
const escapedTopic = escapeHtmlAttr(normalizedHashtag);
return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${escapedTopic}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
} else {
return `<span class="hashtag-link">${escapedDisplay}</span>`;
}
});
}
/**
* Convert wikilink placeholders to HTML
*/
function convertWikilinks(html: string, options: PostProcessOptions): string {
return html.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => {
const escapedDtag = escapeHtmlAttr(dTag.trim());
const escapedDisplay = escapeHtml(displayText.trim());
let url: string;
if (options.wikilinkUrl) {
if (typeof options.wikilinkUrl === 'function') {
url = options.wikilinkUrl(dTag.trim());
} else {
url = options.wikilinkUrl.replace(/{dtag}/g, dTag.trim());
}
} else {
url = `/events?d=${escapedDtag}`;
}
const escapedUrl = escapeHtmlAttr(url);
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
});
}
/**
* Convert nostr: links to HTML
*/
function convertNostrLinks(html: string): string {
return html.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
const nostrType = getNostrType(bech32Id);
const escaped = escapeHtmlAttr(bech32Id);
const escapedDisplay = escapeHtml(displayText);
if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`;
} else if (nostrType === 'npub' || nostrType === 'nprofile') {
return `<span class="user-handle" data-pubkey="${escaped}">@${escapedDisplay}</span>`;
} else {
return `<a href="nostr:${bech32Id}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${escapedDisplay}</a>`;
}
});
}
/**
* Get Nostr identifier type
*/
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
if (id.startsWith('npub')) return 'npub';
if (id.startsWith('nprofile')) return 'nprofile';
if (id.startsWith('nevent')) return 'nevent';
if (id.startsWith('naddr')) return 'naddr';
if (id.startsWith('note')) return 'note';
return null;
}
/**
* Convert media placeholders to HTML embeds
*/
function convertMediaPlaceholders(html: string): string {
let processed = html;
// YouTube embeds
processed = processed.replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, (_match, videoId) => {
const escapedId = escapeHtmlAttr(videoId);
return `<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">
<iframe
style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
src="https://www.youtube.com/embed/${escapedId}"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
allowfullscreen
loading="lazy">
</iframe>
</div>`;
});
// Spotify embeds
processed = processed.replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, (_match, type, id) => {
const escapedType = escapeHtmlAttr(type);
const escapedId = escapeHtmlAttr(id);
return `<div class="media-embed spotify-embed" style="margin: 1rem 0;">
<iframe
style="border-radius: 12px; width: 100%; max-width: 100%;"
src="https://open.spotify.com/embed/${escapedType}/${escapedId}?utm_source=generator"
width="100%"
height="352"
frameborder="0"
allowfullscreen=""
allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"
loading="lazy">
</iframe>
</div>`;
});
// Video files
processed = processed.replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
const escapedUrl = escapeHtmlAttr(url);
return `<div class="media-embed video-embed" style="margin: 1rem 0;">
<video
controls
preload="metadata"
style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;"
class="media-player">
<source src="${escapedUrl}" type="video/mp4">
Your browser does not support the video tag.
</video>
</div>`;
});
// Audio files
processed = processed.replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, (_match, url) => {
const escapedUrl = escapeHtmlAttr(url);
return `<div class="media-embed audio-embed" style="margin: 1rem 0;">
<audio
controls
preload="metadata"
style="width: 100%; max-width: 100%;"
class="media-player">
<source src="${escapedUrl}">
Your browser does not support the audio tag.
</audio>
</div>`;
});
return processed;
}
/**
* Convert link: macros that AsciiDoctor didn't convert
* This handles cases where AsciiDoctor couldn't parse the link (e.g., link text with special chars)
*/
function convertLinkMacros(html: string): string {
return html.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => {
// Unescape if already HTML-escaped
const unescapedUrl = unescapeHtml(url);
const unescapedText = unescapeHtml(text);
// Re-escape properly for HTML
const escapedUrl = escapeHtmlAttr(unescapedUrl);
const escapedText = escapeHtml(unescapedText);
// Check if link text contains wss:// or ws:// - these are relay URLs, don't add OpenGraph
const isRelayUrl = /wss?:\/\//i.test(unescapedText);
// Create link (OpenGraph processing will handle it later if needed)
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
});
}
// ============================================
// STEP 2: Fix corrupted HTML
// ============================================
/**
* Fix double-escaped quotes in href attributes: href="&quot;url&quot;" -> href="url"
*/
function fixDoubleEscapedQuotes(html: string): string {
return html.replace(/href\s*=\s*["']&quot;(https?:\/\/[^"']+)&quot;["']/gi, (_match, url) => {
const escapedUrl = escapeHtmlAttr(url);
return `href="${escapedUrl}"`;
});
}
/**
* Fix escaped HTML links: &lt;a href="..."&gt;text&lt;/a&gt; -> <a href="...">text</a>
*/
function fixEscapedHtmlLinks(html: string): string {
return html.replace(/&lt;a\s+href=["'](https?:\/\/[^"']+)["']\s*&gt;([^<]+)&lt;\/a&gt;/gi, (_match, url, text) => {
const unescapedUrl = unescapeHtml(url);
const unescapedText = unescapeHtml(text);
const escapedUrl = escapeHtmlAttr(unescapedUrl);
const escapedText = escapeHtml(unescapedText);
const isRelayUrl = /wss?:\/\//i.test(unescapedText);
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
});
}
/**
* Fix broken link patterns where attributes appear as text before escaped HTML
* Pattern: " target=...&gt;&lt;a href=...&gt;text&lt;/a&gt;
*/
function fixBrokenLinkPatterns(html: string): string {
return html.replace(/"\s+target=["'][^"']*["']\s+rel=["'][^"']*["']\s+class=["'][^"']*["']\s*&gt;&lt;a\s+href=["'](https?:\/\/[^"']+)["']\s*&gt;([^<]+)&lt;\/a&gt;/gi, (_match, url, text) => {
const unescapedUrl = unescapeHtml(url);
const unescapedText = unescapeHtml(text);
const escapedUrl = escapeHtmlAttr(unescapedUrl);
const escapedText = escapeHtml(unescapedText);
const isRelayUrl = /wss?:\/\//i.test(unescapedText);
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
});
}
// ============================================
// STEP 3: Process OpenGraph links
// ============================================
/**
* Process OpenGraph links - mark external links for OpenGraph preview fetching
*/
function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
let processed = html;
// Remove "link:" prefixes that might appear before anchor tags
processed = processed.replace(/link:\s*<a/gi, '<a');
processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2');
processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' ');
// Clean up corrupted href attributes
processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => {
const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i);
if (urlMatch) {
const escapedUrl = escapeHtmlAttr(urlMatch[1]);
return `href="${escapedUrl}"`;
}
return match;
});
// Protect code blocks and pre blocks
const codeBlockPlaceholders: string[] = [];
const preBlockPlaceholders: string[] = [];
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => {
const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`;
preBlockPlaceholders.push(match);
return placeholder;
});
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => {
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
codeBlockPlaceholders.push(match);
return placeholder;
});
// Extract base domain
let baseDomain: string | null = null;
if (linkBaseURL) {
const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
if (urlMatch) {
baseDomain = urlMatch[1];
}
}
// Process external links
processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => {
// Validate href
if (!href || href.includes('<') || href.includes('>') || !/^https?:\/\/[^\s<>"']+$/i.test(href)) {
return match;
}
// Skip if already processed
if (match.includes('class="wikilink"') ||
match.includes('class="nostr-link"') ||
match.includes('class="opengraph-link"') ||
match.includes('data-embedded-note') ||
match.includes('media-embed') ||
match.includes('opengraph-link-container')) {
return match;
}
// Skip media files
if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) {
return match;
}
// Skip YouTube/Spotify (already handled as media)
if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) {
return match;
}
// Skip if link text contains wss:// or ws:// - these are relay URLs, not web pages
if (/wss?:\/\//i.test(linkText)) {
return match;
}
// Check if external
let isExternal = true;
if (baseDomain) {
const hrefMatch = href.match(/^https?:\/\/([^\/]+)/);
if (hrefMatch && hrefMatch[1] === baseDomain) {
isExternal = false;
}
}
if (!isExternal) {
return match;
}
// Wrap in OpenGraph container
const escapedUrl = escapeHtmlAttr(href);
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
<div class="opengraph-preview" data-og-loading="true" style="display: none;">
<div class="opengraph-card">
<div class="opengraph-image-container">
<img class="opengraph-image" src="" alt="" style="display: none;" />
</div>
<div class="opengraph-content">
<div class="opengraph-site"></div>
<div class="opengraph-title"></div>
<div class="opengraph-description"></div>
</div>
</div>
</div>
</span>`;
});
// Restore code blocks
codeBlockPlaceholders.forEach((codeBlock, index) => {
processed = processed.replace(`__CODEBLOCK_${index}__`, codeBlock);
});
preBlockPlaceholders.forEach((preBlock, index) => {
processed = processed.replace(`__PREBLOCK_${index}__`, preBlock);
});
return processed;
}
// ============================================
// STEP 4: Process images
// ============================================
/**
* Process images: add max-width styling and data attributes
*/
function processImages(html: string): string {
const imageUrls: string[] = [];
const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
let match;
while ((match = imageUrlRegex.exec(html)) !== null) {
const url = match[1];
if (url && !imageUrls.includes(url)) {
imageUrls.push(url);
}
}
return html.replace(/<img([^>]+)>/gi, (imgTag, attributes) => {
const srcMatch = attributes.match(/src=["']([^"']+)["']/i);
if (!srcMatch) return imgTag;
const src = srcMatch[1];
const currentIndex = imageUrls.indexOf(src);
let updatedAttributes = attributes;
if (updatedAttributes.match(/class=["']/i)) {
updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
const newClasses = cleanedClasses
? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
: 'max-w-[400px] object-contain cursor-zoom-in';
return `class="${newClasses}"`;
});
} else {
updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`;
}
updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${escapeHtmlAttr(src)}"`;
return `<img${updatedAttributes}>`;
});
}
// ============================================
// STEP 6: Clean up leftover markdown
// ============================================
/**
* Clean up leftover markdown syntax
*/
function cleanupMarkdown(html: string): string {
let cleaned = html;
// Clean up markdown image syntax
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
const altText = alt || '';
const escapedUrl = escapeHtmlAttr(url);
return `<img src="${escapedUrl}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`;
});
// Clean up markdown link syntax (skip if already HTML)
cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
// Skip if already processed
if (cleaned.includes(`href="${url}"`) || cleaned.includes(`href='${url}'`)) {
return _match;
}
if (text.includes('&lt;') || text.includes('&gt;') || text.includes('&amp;')) {
return _match;
}
const escapedUrl = escapeHtmlAttr(url);
const escapedText = escapeHtml(text);
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
});
return cleaned;
}
// ============================================
// STEP 7: Add styling classes
// ============================================
/**
* Add proper CSS classes for styling
*/
function addStylingClasses(html: string): string {
let styled = html;
styled = styled.replace(/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>');
styled = styled.replace(/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>');
styled = styled.replace(/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>');
styled = styled.replace(/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">');
styled = styled.replace(/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">');
return styled;
}
// ============================================
// STEP 8: Hide raw ToC text
// ============================================
/**
* Hide raw AsciiDoc ToC text
*/
function hideRawTocText(html: string): string {
let cleaned = html;
cleaned = cleaned.replace(/<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi, '');
cleaned = cleaned.replace(/<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi, '');
cleaned = cleaned.replace(/<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi, '');
return cleaned;
}
// ============================================
// Utility functions
// ============================================
/**
* Escape HTML content
*/
function escapeHtml(text: string): string {
return text
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
}
/**
* Escape HTML attribute value
*/
function escapeHtmlAttr(text: string): string {
return text
.replace(/&/g, '&amp;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
}
/**
* Unescape HTML entities
*/
function unescapeHtml(text: string): string {
return text
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
}

239
src/processors/html-utils.js

@ -1,239 +0,0 @@ @@ -1,239 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractTOC = extractTOC;
exports.sanitizeHTML = sanitizeHTML;
exports.processLinks = processLinks;
/**
* Extracts the table of contents from AsciiDoc HTML output
* Returns the TOC HTML and the content HTML without the TOC
*/
function extractTOC(html) {
// AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
let tocContent = '';
let contentWithoutTOC = html;
// Find the start of the TOC div - try multiple patterns
const tocStartPatterns = [
/<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>/i,
/<div\s+id=["']toc["'][^>]*>/i,
/<div\s+class=["']toc["'][^>]*>/i,
/<nav\s+id=["']toc["'][^>]*>/i,
];
let tocStartIdx = -1;
let tocStartTag = '';
for (const pattern of tocStartPatterns) {
const match = html.match(pattern);
if (match && match.index !== undefined) {
tocStartIdx = match.index;
tocStartTag = match[0];
break;
}
}
if (tocStartIdx === -1) {
// No TOC found
return { toc: '', contentWithoutTOC: html };
}
// Find the matching closing tag by counting div/nav tags
const searchStart = tocStartIdx + tocStartTag.length;
let depth = 1;
let i = searchStart;
while (i < html.length && depth > 0) {
// Look for opening or closing div/nav tags
if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<div') {
// Check if it's a closing tag
if (i + 5 < html.length && html[i + 4] === '/') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1)
break;
i = closeIdx + 1;
}
else {
// Opening tag - find the end (handle attributes and self-closing)
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1)
break;
// Check if it's self-closing (look for /> before the >)
const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++;
}
i = closeIdx + 1;
}
}
else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</div') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1)
break;
i = closeIdx + 1;
}
else if (i + 5 < html.length && html.substring(i, i + 5).toLowerCase() === '</nav') {
depth--;
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1)
break;
i = closeIdx + 1;
}
else if (i + 4 < html.length && html.substring(i, i + 4).toLowerCase() === '<nav') {
// Handle opening nav tags
const closeIdx = html.indexOf('>', i);
if (closeIdx === -1)
break;
const tagContent = html.substring(i, closeIdx);
if (!tagContent.endsWith('/')) {
depth++;
}
i = closeIdx + 1;
}
else {
i++;
}
}
if (depth === 0) {
// Found the matching closing tag
const tocEndIdx = i;
// Extract the TOC content (inner HTML)
const tocFullHTML = html.substring(tocStartIdx, tocEndIdx);
// Extract just the inner content (without the outer div tags)
let innerStart = tocStartTag.length;
let innerEnd = tocFullHTML.length;
// Find the last </div> or </nav>
if (tocFullHTML.endsWith('</div>')) {
innerEnd -= 6;
}
else if (tocFullHTML.endsWith('</nav>')) {
innerEnd -= 7;
}
tocContent = tocFullHTML.substring(innerStart, innerEnd).trim();
// Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
tocContent = tocContent.replace(/<div\s+id=["']toctitle["'][^>]*>.*?<\/div>\s*/gis, '');
tocContent = tocContent.trim();
// Remove the TOC from the content
contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx);
}
// Extract just the body content if the HTML includes full document structure
// AsciiDoctor might return full HTML with <html>, <head>, <body> tags
// Check if this is a full HTML document
const isFullDocument = /^\s*<!DOCTYPE|^\s*<html/i.test(contentWithoutTOC);
if (isFullDocument) {
// Extract body content using a more robust approach
// Find the opening <body> tag
const bodyStartMatch = contentWithoutTOC.match(/<body[^>]*>/i);
if (bodyStartMatch && bodyStartMatch.index !== undefined) {
const bodyStart = bodyStartMatch.index + bodyStartMatch[0].length;
// Find the closing </body> tag by searching backwards from the end
// This is more reliable than regex for nested content
const bodyEndMatch = contentWithoutTOC.lastIndexOf('</body>');
if (bodyEndMatch !== -1 && bodyEndMatch > bodyStart) {
contentWithoutTOC = contentWithoutTOC.substring(bodyStart, bodyEndMatch).trim();
}
}
}
// Remove any remaining document structure tags that might have slipped through
contentWithoutTOC = contentWithoutTOC
.replace(/<html[^>]*>/gi, '')
.replace(/<\/html>/gi, '')
.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '')
.replace(/<body[^>]*>/gi, '')
.replace(/<\/body>/gi, '');
// Clean up any extra whitespace
contentWithoutTOC = contentWithoutTOC.trim();
return { toc: tocContent, contentWithoutTOC };
}
/**
* Performs basic HTML sanitization to prevent XSS
*/
function sanitizeHTML(html) {
// Remove script tags and their content
html = html.replace(/<script[^>]*>.*?<\/script>/gis, '');
// Remove event handlers (onclick, onerror, etc.)
html = html.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, '');
// Remove javascript: protocol in links
html = html.replace(/javascript:/gi, '');
// Remove data: URLs that could be dangerous
html = html.replace(/data:\s*text\/html/gi, '');
return html;
}
/**
* Processes HTML links to add target="_blank" to external links
* This function is available for use but not currently called automatically.
* It can be used in post-processing if needed.
*/
function processLinks(html, linkBaseURL) {
// Extract domain from linkBaseURL for comparison
let linkBaseDomain = '';
if (linkBaseURL) {
try {
// Use URL constructor if available (Node.js 10+)
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const URLConstructor = globalThis.URL;
if (URLConstructor) {
const url = new URLConstructor(linkBaseURL);
linkBaseDomain = url.hostname;
}
else {
throw new Error('URL not available');
}
}
catch {
// Fallback to simple string parsing if URL constructor fails
const url = linkBaseURL.replace(/^https?:\/\//, '');
const parts = url.split('/');
if (parts.length > 0) {
linkBaseDomain = parts[0];
}
}
}
// Regex to match <a> tags with href attributes
const linkRegex = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g;
return html.replace(linkRegex, (match, before, href, after) => {
// Check if it's an external link (starts with http:// or https://)
const isExternal = href.startsWith('http://') || href.startsWith('https://');
if (isExternal) {
// Check if it's pointing to our own domain
if (linkBaseDomain) {
try {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const URLConstructor = globalThis.URL;
if (URLConstructor) {
const hrefUrl = new URLConstructor(href);
if (hrefUrl.hostname === linkBaseDomain) {
// Same domain - open in same tab (remove any existing target attribute)
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
}
else {
throw new Error('URL not available');
}
}
catch {
// If URL parsing fails, use simple string check
if (href.includes(linkBaseDomain)) {
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
}
}
// External link - add target="_blank" and rel="noopener noreferrer" if not already present
if (!match.includes('target=')) {
if (!match.includes('rel=')) {
return match.replace('>', ' target="_blank" rel="noopener noreferrer">');
}
else {
// Update existing rel attribute to include noopener if not present
const updatedMatch = match.replace(/rel\s*=\s*["']([^"']*)["']/gi, (relMatch, relValue) => {
if (!relValue.includes('noopener')) {
return `rel="${relValue} noopener noreferrer"`;
}
return relMatch;
});
return updatedMatch.replace('>', ' target="_blank">');
}
}
}
else {
// Local/relative link - ensure it opens in same tab (remove target if present)
return match.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');
}
return match;
});
}

164
src/processors/html-utils.ts

@ -1,164 +0,0 @@ @@ -1,164 +0,0 @@
/**
* HTML utility functions for processing AsciiDoctor output
*
* Functions:
* - extractTOC: Extract table of contents from HTML
* - sanitizeHTML: Sanitize HTML to prevent XSS attacks
* - processLinks: Add target="_blank" to external links
*/
export interface TOCResult {
toc: string;
contentWithoutTOC: string;
}
/**
* Extract table of contents from AsciiDoctor HTML output
* AsciiDoctor generates a <div id="toc"> with class="toc" containing the TOC
*/
export function extractTOC(html: string): TOCResult {
// Match the TOC div - AsciiDoctor generates it with id="toc" and class="toc"
const tocMatch = html.match(/<div[^>]*id=["']toc["'][^>]*>([\s\S]*?)<\/div>/i);
if (tocMatch) {
const toc = tocMatch[0]; // Full TOC div
const contentWithoutTOC = html.replace(toc, '').trim();
return { toc, contentWithoutTOC };
}
// Fallback: try to match by class="toc"
const tocClassMatch = html.match(/<div[^>]*class=["'][^"']*toc[^"']*["'][^>]*>([\s\S]*?)<\/div>/i);
if (tocClassMatch) {
const toc = tocClassMatch[0];
const contentWithoutTOC = html.replace(toc, '').trim();
return { toc, contentWithoutTOC };
}
// No TOC found
return {
toc: '',
contentWithoutTOC: html,
};
}
/**
* Sanitize HTML to prevent XSS attacks
* Removes dangerous scripts and event handlers while preserving safe HTML
*
* This is a basic sanitizer. For production use, consider using a library like DOMPurify
*/
export function sanitizeHTML(html: string): string {
let sanitized = html;
// Remove script tags and their content
sanitized = sanitized.replace(/<script[\s\S]*?<\/script>/gi, '');
// Remove event handlers from attributes (onclick, onerror, etc.)
sanitized = sanitized.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, '');
sanitized = sanitized.replace(/\s*on\w+\s*=\s*[^\s>]*/gi, '');
// Remove javascript: protocol in href and src attributes
sanitized = sanitized.replace(/href\s*=\s*["']javascript:[^"']*["']/gi, 'href="#"');
sanitized = sanitized.replace(/src\s*=\s*["']javascript:[^"']*["']/gi, 'src=""');
// Remove data: URLs that might contain scripts (allow images)
// This is more permissive - you might want to be stricter
sanitized = sanitized.replace(/src\s*=\s*["']data:text\/html[^"']*["']/gi, 'src=""');
// Remove iframe with dangerous sources
sanitized = sanitized.replace(/<iframe[^>]*src\s*=\s*["']javascript:[^"']*["'][^>]*>[\s\S]*?<\/iframe>/gi, '');
// Remove object and embed tags (often used for XSS)
sanitized = sanitized.replace(/<object[\s\S]*?<\/object>/gi, '');
sanitized = sanitized.replace(/<embed[\s\S]*?>/gi, '');
// Remove style tags with potentially dangerous content
// We keep style attributes but remove <style> tags
sanitized = sanitized.replace(/<style[\s\S]*?<\/style>/gi, '');
// Remove link tags with javascript: or data: URLs
sanitized = sanitized.replace(/<link[^>]*href\s*=\s*["'](javascript|data):[^"']*["'][^>]*>/gi, '');
// Remove meta tags with http-equiv="refresh" (can be used for redirects)
sanitized = sanitized.replace(/<meta[^>]*http-equiv\s*=\s*["']refresh["'][^>]*>/gi, '');
return sanitized;
}
/**
* Process links to add target="_blank" and rel="noreferrer noopener" to external links
*
* External links are links that don't match the base domain.
* Internal links (same domain) are left unchanged.
*/
export function processLinks(html: string, linkBaseURL: string): string {
if (!linkBaseURL) {
return html;
}
// Extract base domain from linkBaseURL
let baseDomain: string | null = null;
try {
const urlMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
if (urlMatch) {
baseDomain = urlMatch[1];
}
} catch {
// If parsing fails, don't process links
return html;
}
if (!baseDomain) {
return html;
}
// Process anchor tags with href attributes
return html.replace(/<a\s+([^>]*\s+)?href\s*=\s*["']([^"']+)["']([^>]*?)>/gi, (match, before, href, after) => {
// Skip if already has target attribute
if (match.includes('target=')) {
return match;
}
// Skip if it's not an http/https link
if (!/^https?:\/\//i.test(href)) {
return match;
}
// Skip if it's already a special link type (nostr, wikilink, etc.)
if (match.includes('class="nostr-link"') ||
match.includes('class="wikilink"') ||
match.includes('class="hashtag-link"')) {
return match;
}
// Check if it's an external link
let isExternal = true;
try {
const hrefMatch = href.match(/^https?:\/\/([^\/]+)/);
if (hrefMatch && hrefMatch[1] === baseDomain) {
isExternal = false;
}
} catch {
// If parsing fails, assume external
}
// Only add target="_blank" to external links
if (isExternal) {
// Check if there's already a rel attribute
if (match.includes('rel=')) {
// Add to existing rel attribute if it doesn't already have noreferrer noopener
if (!match.includes('noreferrer') && !match.includes('noopener')) {
return match.replace(/rel\s*=\s*["']([^"']+)["']/i, 'rel="$1 noreferrer noopener"');
}
// Add target="_blank" before the closing >
return match.replace(/>$/, ' target="_blank">');
} else {
// Add both target and rel
return match.replace(/>$/, ' target="_blank" rel="noreferrer noopener">');
}
}
return match;
});
}

93
src/processors/markdown.ts

@ -0,0 +1,93 @@ @@ -0,0 +1,93 @@
import { marked } from 'marked';
// @ts-ignore - marked is ESM but we need it to work in Jest
import { ParserOptions } from '../types';
import * as emoji from 'node-emoji';
export interface MarkdownResult {
html: string;
frontmatter?: Record<string, any>;
hasLaTeX: boolean;
hasMusicalNotation: boolean;
}
/**
* Extract YAML frontmatter from markdown content
*/
function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } {
const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n/;
const match = content.match(frontmatterRegex);
if (!match) {
return { content };
}
try {
// Simple YAML parser for basic key-value pairs
const yamlContent = match[1];
const frontmatter: Record<string, any> = {};
const lines = yamlContent.split('\n');
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed || trimmed.startsWith('#')) continue;
const colonIndex = trimmed.indexOf(':');
if (colonIndex === -1) continue;
const key = trimmed.substring(0, colonIndex).trim();
let value = trimmed.substring(colonIndex + 1).trim();
// Remove quotes if present
if ((value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))) {
value = value.slice(1, -1);
}
// Handle arrays (simple case)
if (value.startsWith('[') && value.endsWith(']')) {
const arrayContent = value.slice(1, -1);
frontmatter[key] = arrayContent.split(',').map(v => v.trim().replace(/^["']|["']$/g, ''));
} else {
frontmatter[key] = value;
}
}
return {
frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined,
content: content.substring(match[0].length)
};
} catch (e) {
return { content };
}
}
/**
* Process Markdown content to HTML (minimal markdown support)
*/
export function processMarkdown(content: string, options: ParserOptions): MarkdownResult {
// Extract frontmatter
const { frontmatter, content: contentWithoutFrontmatter } = extractFrontmatter(content);
// Detect LaTeX and musical notation
const hasLaTeX = /```latex|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content);
const hasMusicalNotation = /```abc|```music/i.test(content);
// Configure marked for minimal markdown
marked.setOptions({
gfm: true,
breaks: false
});
// Process emoji shortcodes before markdown processing
let processedContent = emoji.emojify(contentWithoutFrontmatter);
// Convert markdown to HTML
const html = marked.parse(processedContent) as string;
return {
html,
frontmatter,
hasLaTeX,
hasMusicalNotation
};
}

143
src/processors/music.js

@ -1,143 +0,0 @@ @@ -1,143 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.processMusicalNotation = processMusicalNotation;
/**
* Processes musical notation in HTML content
* Wraps musical notation in appropriate HTML for rendering
*/
function processMusicalNotation(html) {
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
// These were created by a buggy regex that matched the entire HTML document
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => {
// This is corrupted - extract just the ABC notation from the beginning
let decoded = dataAbc
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// Find the actual ABC notation (starts with X:)
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|&lt;|<\/|sect|div|pre|code)/);
if (abcMatch) {
const cleanAbc = abcMatch[1].trim();
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`;
}
// If we can't extract clean ABC, remove the div entirely
return content;
});
// Clean up code blocks that contain corrupted abc-notation divs inside them
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i);
if (longDataAbcMatch) {
// Extract just the ABC notation from the beginning of the corrupted data-abc value
let decoded = longDataAbcMatch[1]
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// The ABC notation ends where the HTML document starts (</code> or </pre>)
// Extract everything from X: up to (but not including) &lt;/code&gt; or &lt;/pre&gt;
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=&lt;\/code&gt;|&lt;\/pre&gt;)/);
if (abcMatch) {
let cleanAbc = abcMatch[1].trim();
// Remove any trailing HTML entities
cleanAbc = cleanAbc.replace(/&lt;.*$/, '').trim();
// Validate it's reasonable ABC notation
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) {
// Return clean code block - the processing step will wrap it in abc-notation div
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`;
}
}
// If extraction fails, just remove the corrupted div and return empty code block
// This prevents the corrupted data from being rendered
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`;
}
return match;
});
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
// We do NOT auto-detect ABC notation - it must be explicitly marked
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Skip if already processed or corrupted
if (codeContent.includes('abc-notation') ||
codeContent.includes('class="abc-notation"') ||
codeContent.includes('<div') ||
codeContent.includes('</div>') ||
codeContent.length > 5000) {
return match;
}
// Extract ABC content from the code block
let abcContent = codeContent
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#x27;/g, "'")
.replace(/&#x2F;/g, '/');
// Remove any HTML tags
abcContent = abcContent.replace(/<[^>]+>/g, '').trim();
// Only process if it looks like valid ABC notation (starts with X:)
// Since this is explicitly marked as ABC, we trust it's ABC notation
if (abcContent.match(/^X:\s*\d+/m) &&
abcContent.length < 3000 &&
!abcContent.includes('</') &&
!abcContent.includes('<div') &&
!abcContent.includes('sect') &&
!abcContent.includes('class=')) {
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
const lines = abcContent.split('\n');
const abcLines = [];
for (const line of lines) {
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) {
break;
}
if (line.length > 200) {
break;
}
abcLines.push(line);
if (abcLines.join('\n').length > 2000) {
break;
}
}
const cleanAbc = abcLines.join('\n').trim();
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) {
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`;
}
}
return match;
});
// Process LilyPond notation blocks
const lilypondPattern = /(\\relative[^}]+})/gs;
html = html.replace(lilypondPattern, (match) => {
const lilypondContent = match.trim();
return `<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`;
});
// Process inline chord notation: [C], [Am], [F#m7], etc.
const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g;
html = html.replace(chordPattern, (match, chord) => {
return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`;
});
// Process MusicXML-like notation
const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs;
html = html.replace(musicxmlPattern, (match) => {
const musicxmlContent = match.trim();
return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`;
});
return html;
}
/**
* Escapes a string for use in HTML attributes
*/
function escapeForAttr(text) {
return text
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/\n/g, ' ')
.replace(/\r/g, '');
}

152
src/processors/music.ts

@ -1,152 +0,0 @@ @@ -1,152 +0,0 @@
/**
* Processes musical notation in HTML content
* Wraps musical notation in appropriate HTML for rendering
*/
export function processMusicalNotation(html: string): string {
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
// These were created by a buggy regex that matched the entire HTML document
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => {
// This is corrupted - extract just the ABC notation from the beginning
let decoded = dataAbc
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// Find the actual ABC notation (starts with X:)
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|&lt;|<\/|sect|div|pre|code)/);
if (abcMatch) {
const cleanAbc = abcMatch[1].trim();
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`;
}
// If we can't extract clean ABC, remove the div entirely
return content;
});
// Clean up code blocks that contain corrupted abc-notation divs inside them
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i);
if (longDataAbcMatch) {
// Extract just the ABC notation from the beginning of the corrupted data-abc value
let decoded = longDataAbcMatch[1]
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// The ABC notation ends where the HTML document starts (</code> or </pre>)
// Extract everything from X: up to (but not including) &lt;/code&gt; or &lt;/pre&gt;
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=&lt;\/code&gt;|&lt;\/pre&gt;)/);
if (abcMatch) {
let cleanAbc = abcMatch[1].trim();
// Remove any trailing HTML entities
cleanAbc = cleanAbc.replace(/&lt;.*$/, '').trim();
// Validate it's reasonable ABC notation
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) {
// Return clean code block - the processing step will wrap it in abc-notation div
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`;
}
}
// If extraction fails, just remove the corrupted div and return empty code block
// This prevents the corrupted data from being rendered
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`;
}
return match;
});
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
// We do NOT auto-detect ABC notation - it must be explicitly marked
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Skip if already processed or corrupted
if (codeContent.includes('abc-notation') ||
codeContent.includes('class="abc-notation"') ||
codeContent.includes('<div') ||
codeContent.includes('</div>') ||
codeContent.length > 5000) {
return match;
}
// Extract ABC content from the code block
let abcContent = codeContent
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#x27;/g, "'")
.replace(/&#x2F;/g, '/');
// Remove any HTML tags
abcContent = abcContent.replace(/<[^>]+>/g, '').trim();
// Only process if it looks like valid ABC notation (starts with X:)
// Since this is explicitly marked as ABC, we trust it's ABC notation
if (abcContent.match(/^X:\s*\d+/m) &&
abcContent.length < 3000 &&
!abcContent.includes('</') &&
!abcContent.includes('<div') &&
!abcContent.includes('sect') &&
!abcContent.includes('class=')) {
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
const lines = abcContent.split('\n');
const abcLines: string[] = [];
for (const line of lines) {
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) {
break;
}
if (line.length > 200) {
break;
}
abcLines.push(line);
if (abcLines.join('\n').length > 2000) {
break;
}
}
const cleanAbc = abcLines.join('\n').trim();
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) {
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`;
}
}
return match;
});
// Process LilyPond notation blocks
const lilypondPattern = /(\\relative[^}]+})/gs;
html = html.replace(lilypondPattern, (match) => {
const lilypondContent = match.trim();
return `<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`;
});
// Process inline chord notation: [C], [Am], [F#m7], etc.
const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g;
html = html.replace(chordPattern, (match, chord) => {
return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`;
});
// Process MusicXML-like notation
const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs;
html = html.replace(musicxmlPattern, (match) => {
const musicxmlContent = match.trim();
return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`;
});
return html;
}
/**
* Escapes a string for use in HTML attributes
*/
function escapeForAttr(text: string): string {
return text
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/\n/g, ' ')
.replace(/\r/g, '');
}

14
src/types.js

@ -1,14 +0,0 @@ @@ -1,14 +0,0 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ContentFormat = void 0;
/**
* Detected content format
*/
var ContentFormat;
(function (ContentFormat) {
ContentFormat["Unknown"] = "unknown";
ContentFormat["AsciiDoc"] = "asciidoc";
ContentFormat["Markdown"] = "markdown";
ContentFormat["Wikipedia"] = "wikipedia";
ContentFormat["Plain"] = "plain";
})(ContentFormat || (exports.ContentFormat = ContentFormat = {}));

15
src/types.ts

@ -72,7 +72,11 @@ export interface ProcessResult { @@ -72,7 +72,11 @@ export interface ProcessResult {
/** Extracted hashtags */
hashtags: string[];
/** Extracted regular links */
links: Array<{ url: string; text: string; isExternal: boolean }>;
links: Array<{
url: string;
text: string;
isExternal: boolean;
}>;
/** Extracted media URLs */
media: string[];
}
@ -81,9 +85,8 @@ export interface ProcessResult { @@ -81,9 +85,8 @@ export interface ProcessResult {
* Detected content format
*/
export enum ContentFormat {
Unknown = 'unknown',
AsciiDoc = 'asciidoc',
Markdown = 'markdown',
Wikipedia = 'wikipedia',
Plain = 'plain'
Unknown = "unknown",
AsciiDoc = "asciidoc",
Markdown = "markdown",
Plain = "plain"
}

20
src/types/asciidoctor.d.ts vendored

@ -1,20 +0,0 @@ @@ -1,20 +0,0 @@
/**
* Type declarations for @asciidoctor/core
* These are minimal types - the actual types should come from the package
*/
declare module '@asciidoctor/core' {
interface ConvertOptions {
safe?: string;
backend?: string;
doctype?: string;
attributes?: Record<string, any>;
extension_registry?: any;
}
interface Asciidoctor {
convert(content: string, options?: ConvertOptions): string | any;
}
function asciidoctor(): Asciidoctor;
export default asciidoctor;
}

732
src/utils/report-generator.ts

@ -1,732 +0,0 @@ @@ -1,732 +0,0 @@
import { Parser } from '../parser';
import * as fs from 'fs';
import * as path from 'path';
import { ProcessResult } from '../types';
/**
* Shared utilities for generating test reports
*/
export interface TestData {
original: string;
result: ProcessResult;
}
export interface ReportData {
markdown: TestData;
asciidoc: TestData;
}
/**
* Generate HTML test report from parsed documents
*/
export function generateHTMLReport(data: ReportData): string {
const { markdown, asciidoc } = data;
return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GC Parser Test Report</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
line-height: 1.6;
color: #333;
background: #f5f5f5;
padding: 20px;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
color: #2c3e50;
margin-bottom: 10px;
font-size: 2.5em;
}
.subtitle {
color: #7f8c8d;
margin-bottom: 30px;
font-size: 1.1em;
}
.section {
background: white;
border-radius: 8px;
padding: 30px;
margin-bottom: 30px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.section h2 {
color: #34495e;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #3498db;
font-size: 1.8em;
}
.section h3 {
color: #2c3e50;
margin-top: 25px;
margin-bottom: 15px;
font-size: 1.3em;
}
.tabs {
display: flex;
gap: 10px;
margin-bottom: 20px;
border-bottom: 2px solid #e0e0e0;
}
.tab {
padding: 12px 24px;
background: #f8f9fa;
border: none;
border-top-left-radius: 6px;
border-top-right-radius: 6px;
cursor: pointer;
font-size: 1em;
font-weight: 500;
color: #555;
transition: all 0.2s;
}
.tab:hover {
background: #e9ecef;
}
.tab.active {
background: #3498db;
color: white;
}
.tab-content {
display: none;
}
.tab-content.active {
display: block;
}
.metadata-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 15px;
margin-top: 15px;
}
.metadata-item {
background: #f8f9fa;
padding: 12px;
border-radius: 4px;
border-left: 3px solid #3498db;
}
.metadata-item strong {
color: #2c3e50;
display: block;
margin-bottom: 5px;
}
.metadata-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.9em;
}
.code-block {
background: #2d2d2d;
color: #f8f8f2;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.9em;
line-height: 1.5;
margin: 15px 0;
max-height: 400px;
overflow-y: auto;
}
.code-block pre {
margin: 0;
white-space: pre-wrap;
word-wrap: break-word;
}
.rendered-output {
background: white;
border: 1px solid #ddd;
padding: 20px;
border-radius: 6px;
margin: 15px 0;
min-height: 200px;
}
.rendered-output * {
max-width: 100%;
}
.stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 15px;
margin-top: 20px;
}
.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
.stat-card .number {
font-size: 2.5em;
font-weight: bold;
margin-bottom: 5px;
}
.stat-card .label {
font-size: 0.9em;
opacity: 0.9;
}
.list-item {
background: #f8f9fa;
padding: 8px 12px;
margin: 5px 0;
border-radius: 4px;
border-left: 3px solid #95a5a6;
}
.list-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.85em;
}
.success-badge {
display: inline-block;
background: #27ae60;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}
.warning-badge {
display: inline-block;
background: #f39c12;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}
.comparison {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 20px;
margin-top: 20px;
}
@media (max-width: 768px) {
.comparison {
grid-template-columns: 1fr;
}
}
.json-view {
background: #f8f9fa;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.85em;
max-height: 300px;
overflow-y: auto;
}
</style>
</head>
<body>
<div class="container">
<h1>GC Parser Test Report</h1>
<p class="subtitle">Generated: ${new Date().toLocaleString()}</p>
<!-- Markdown Section -->
<div class="section">
<h2>Markdown Document Test <span class="success-badge"> Parsed</span></h2>
<div class="tabs">
<button class="tab active" onclick="showTab('md-overview')">Overview</button>
<button class="tab" onclick="showTab('md-original')">Original Content</button>
<button class="tab" onclick="showTab('md-rendered')">Rendered Output</button>
<button class="tab" onclick="showTab('md-metadata')">Metadata</button>
</div>
<div id="md-overview" class="tab-content active">
<div class="stats">
<div class="stat-card">
<div class="number">${markdown.result.nostrLinks.length}</div>
<div class="label">Nostr Links</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.wikilinks.length}</div>
<div class="label">Wikilinks</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hashtags.length}</div>
<div class="label">Hashtags</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.links.length}</div>
<div class="label">Links</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.media.length}</div>
<div class="label">Media URLs</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hasLaTeX ? 'Yes' : 'No'}</div>
<div class="label">Has LaTeX</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hasMusicalNotation ? 'Yes' : 'No'}</div>
<div class="label">Has Music</div>
</div>
</div>
<h3>Frontmatter</h3>
${markdown.result.frontmatter ? `
<div class="metadata-grid">
${Object.entries(markdown.result.frontmatter).map(([key, value]) => `
<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>
`).join('')}
</div>
` : '<p><em>No frontmatter found</em></p>'}
</div>
<div id="md-original" class="tab-content">
<h3>Original Markdown Content</h3>
<div class="code-block">
<pre>${escapeHtml(markdown.original)}</pre>
</div>
</div>
<div id="md-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${cleanHtmlContent(markdown.result.content)}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML (Final Processed State)</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(markdown.result.content)}</pre>
</div>
</details>
</div>
<div id="md-metadata" class="tab-content">
<h3>Extracted Metadata</h3>
${markdown.result.nostrLinks.length > 0 ? `
<h4>Nostr Links (${markdown.result.nostrLinks.length})</h4>
${markdown.result.nostrLinks.map((link: any) => `
<div class="list-item">
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>
${link.text ? ` - ${escapeHtml(link.text)}` : ''}
</div>
`).join('')}
` : ''}
${markdown.result.wikilinks.length > 0 ? `
<h4>Wikilinks (${markdown.result.wikilinks.length})</h4>
${markdown.result.wikilinks.map((wl: any) => `
<div class="list-item">
<code>${escapeHtml(wl.original)}</code> dtag: <code>${escapeHtml(wl.dtag)}</code>
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''}
</div>
`).join('')}
` : ''}
${markdown.result.hashtags.length > 0 ? `
<h4>Hashtags (${markdown.result.hashtags.length})</h4>
${markdown.result.hashtags.map((tag: string) => `
<div class="list-item">
<code>#${escapeHtml(tag)}</code>
</div>
`).join('')}
` : ''}
${markdown.result.links.length > 0 ? `
<h4>Links (${markdown.result.links.length})</h4>
${markdown.result.links.map((link: any) => `
<div class="list-item">
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>
${link.isExternal ? '<span class="warning-badge">External</span>' : ''}
</div>
`).join('')}
` : ''}
${markdown.result.media.length > 0 ? `
<h4>Media URLs (${markdown.result.media.length})</h4>
${markdown.result.media.map((url: string) => `
<div class="list-item">
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>
</div>
`).join('')}
` : ''}
${markdown.result.tableOfContents ? `
<h4>Table of Contents</h4>
<div class="rendered-output">
${markdown.result.tableOfContents}
</div>
` : ''}
</div>
</div>
<!-- AsciiDoc Section -->
<div class="section">
<h2>AsciiDoc Document Test <span class="success-badge"> Parsed</span></h2>
<div class="tabs">
<button class="tab active" onclick="showTab('ad-overview')">Overview</button>
<button class="tab" onclick="showTab('ad-original')">Original Content</button>
<button class="tab" onclick="showTab('ad-rendered')">Rendered Output</button>
<button class="tab" onclick="showTab('ad-metadata')">Metadata</button>
</div>
<div id="ad-overview" class="tab-content active">
<div class="stats">
<div class="stat-card">
<div class="number">${asciidoc.result.nostrLinks.length}</div>
<div class="label">Nostr Links</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.wikilinks.length}</div>
<div class="label">Wikilinks</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hashtags.length}</div>
<div class="label">Hashtags</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.links.length}</div>
<div class="label">Links</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.media.length}</div>
<div class="label">Media URLs</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hasLaTeX ? 'Yes' : 'No'}</div>
<div class="label">Has LaTeX</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hasMusicalNotation ? 'Yes' : 'No'}</div>
<div class="label">Has Music</div>
</div>
</div>
<h3>Frontmatter</h3>
${asciidoc.result.frontmatter ? `
<div class="metadata-grid">
${Object.entries(asciidoc.result.frontmatter).map(([key, value]) => `
<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>
`).join('')}
</div>
` : '<p><em>No frontmatter found</em></p>'}
</div>
<div id="ad-original" class="tab-content">
<h3>Original AsciiDoc Content</h3>
<div class="code-block">
<pre>${escapeHtml(asciidoc.original)}</pre>
</div>
</div>
<div id="ad-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${cleanHtmlContent(asciidoc.result.content)}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML (Final Processed State)</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(asciidoc.result.content)}</pre>
</div>
</details>
</div>
<div id="ad-metadata" class="tab-content">
<h3>Extracted Metadata</h3>
${asciidoc.result.nostrLinks.length > 0 ? `
<h4>Nostr Links (${asciidoc.result.nostrLinks.length})</h4>
${asciidoc.result.nostrLinks.map((link: any) => `
<div class="list-item">
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>
${link.text ? ` - ${escapeHtml(link.text)}` : ''}
</div>
`).join('')}
` : ''}
${asciidoc.result.wikilinks.length > 0 ? `
<h4>Wikilinks (${asciidoc.result.wikilinks.length})</h4>
${asciidoc.result.wikilinks.map((wl: any) => `
<div class="list-item">
<code>${escapeHtml(wl.original)}</code> dtag: <code>${escapeHtml(wl.dtag)}</code>
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''}
</div>
`).join('')}
` : ''}
${asciidoc.result.hashtags.length > 0 ? `
<h4>Hashtags (${asciidoc.result.hashtags.length})</h4>
${asciidoc.result.hashtags.map((tag: string) => `
<div class="list-item">
<code>#${escapeHtml(tag)}</code>
</div>
`).join('')}
` : ''}
${asciidoc.result.links.length > 0 ? `
<h4>Links (${asciidoc.result.links.length})</h4>
${asciidoc.result.links.map((link: any) => `
<div class="list-item">
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>
${link.isExternal ? '<span class="warning-badge">External</span>' : ''}
</div>
`).join('')}
` : ''}
${asciidoc.result.media.length > 0 ? `
<h4>Media URLs (${asciidoc.result.media.length})</h4>
${asciidoc.result.media.map((url: string) => `
<div class="list-item">
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>
</div>
`).join('')}
` : ''}
${asciidoc.result.tableOfContents ? `
<h4>Table of Contents</h4>
<div class="rendered-output">
${asciidoc.result.tableOfContents}
</div>
` : ''}
</div>
</div>
</div>
<script>
function showTab(tabId) {
// Hide all tab contents
const allContents = document.querySelectorAll('.tab-content');
allContents.forEach(content => content.classList.remove('active'));
// Remove active class from all tabs
const allTabs = document.querySelectorAll('.tab');
allTabs.forEach(tab => tab.classList.remove('active'));
// Show selected tab content
const selectedContent = document.getElementById(tabId);
if (selectedContent) {
selectedContent.classList.add('active');
}
// Add active class to clicked tab
event.target.classList.add('active');
}
</script>
</body>
</html>`;
}
/**
* Clean HTML content to extract only the body content
* Removes full HTML document structure if present
* Prevents infinite loops by ensuring we only extract once and handle nested structures
* Also detects and prevents content duplication (doom loops)
*/
function cleanHtmlContent(html: string): string {
if (!html || typeof html !== 'string') {
return '';
}
let cleaned = html.trim();
// Count occurrences to detect nested structures
const htmlTagCount = (cleaned.match(/<html[^>]*>/gi) || []).length;
const bodyTagCount = (cleaned.match(/<body[^>]*>/gi) || []).length;
const bodyCloseCount = (cleaned.match(/<\/body>/gi) || []).length;
// If we have multiple body tags, there might be nested structures
// Extract only the outermost body content
if (bodyTagCount > 0 && bodyCloseCount > 0) {
// Find the first <body> tag
const firstBodyIndex = cleaned.indexOf('<body');
if (firstBodyIndex !== -1) {
// Find the opening > of the first body tag
const bodyTagEnd = cleaned.indexOf('>', firstBodyIndex);
if (bodyTagEnd !== -1) {
const bodyStart = bodyTagEnd + 1;
// Find the last </body> tag (to handle nested structures)
const bodyEnd = cleaned.lastIndexOf('</body>');
if (bodyEnd > bodyStart) {
cleaned = cleaned.substring(bodyStart, bodyEnd).trim();
// Recursively clean if there are still nested structures
// But limit recursion to prevent infinite loops
const remainingBodyTags = (cleaned.match(/<body[^>]*>/gi) || []).length;
if (remainingBodyTags > 0 && remainingBodyTags < bodyTagCount) {
// There are still nested body tags, clean again but only once more
cleaned = cleaned.replace(/<body[^>]*>/gi, '');
cleaned = cleaned.replace(/<\/body>/gi, '');
}
}
}
}
}
// Remove any remaining DOCTYPE, html, head, or body tags that might be left
// Do this in a way that doesn't create nested matches
let previousLength = 0;
let iterations = 0;
while (iterations < 10 && cleaned.length !== previousLength) {
previousLength = cleaned.length;
cleaned = cleaned.replace(/<!DOCTYPE[^>]*>/gi, '');
cleaned = cleaned.replace(/<html[^>]*>/gi, '');
cleaned = cleaned.replace(/<\/html>/gi, '');
cleaned = cleaned.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '');
cleaned = cleaned.replace(/<body[^>]*>/gi, '');
cleaned = cleaned.replace(/<\/body>/gi, '');
cleaned = cleaned.trim();
iterations++;
}
// Detect and prevent content duplication (doom loops)
// Strategy: Use a fingerprint of the first part of the content to detect repetition
// Create a fingerprint from the first meaningful chunk (skip leading whitespace/tags)
const contentStart = cleaned.search(/[^\s<]/);
if (contentStart !== -1) {
// Use first 2000 characters as fingerprint, or 1/4 of content, whichever is smaller
const fingerprintLength = Math.min(2000, Math.max(500, Math.floor(cleaned.length / 4)));
const fingerprint = cleaned.substring(contentStart, contentStart + fingerprintLength);
// Find where this fingerprint repeats
const secondOccurrence = cleaned.indexOf(fingerprint, contentStart + fingerprintLength);
if (secondOccurrence !== -1 && secondOccurrence < cleaned.length * 0.85) {
// Content is clearly duplicated - return only the first occurrence
cleaned = cleaned.substring(0, secondOccurrence).trim();
return cleaned;
}
}
// Additional check: detect repeated patterns using common document markers
const documentMarkers = [
/#\s+Markdown\s+Test\s+Document/gi,
/==\s+Bullet\s+list/gi,
/##\s+Bullet\s+list/gi,
];
for (const marker of documentMarkers) {
const matches = cleaned.match(marker);
if (matches && matches.length > 1) {
const firstMatch = cleaned.search(marker);
if (firstMatch !== -1) {
// Get a chunk starting from this marker
const chunkStart = firstMatch;
const chunkLength = Math.min(1500, Math.floor(cleaned.length / 3));
const chunk = cleaned.substring(chunkStart, chunkStart + chunkLength);
// Find where this chunk repeats
const secondChunk = cleaned.indexOf(chunk, chunkStart + chunkLength);
if (secondChunk !== -1 && secondChunk < cleaned.length * 0.9) {
// Content repeats here - truncate
cleaned = cleaned.substring(0, secondChunk).trim();
return cleaned;
}
}
}
}
// Final check: detect repeated section headers
const sectionHeaderPattern = /(?:^|\n)(?:##?|==)\s+[^\n<]+/gm;
const sectionHeaders: string[] = [];
let match;
while ((match = sectionHeaderPattern.exec(cleaned)) !== null) {
sectionHeaders.push(match[0].trim());
}
// If we have many headers, check for repetition
if (sectionHeaders.length > 8) {
const uniqueHeaders = new Set(sectionHeaders);
// If we have way more headers than unique ones, content is repeating
if (sectionHeaders.length > uniqueHeaders.size * 2.5) {
// Find the first occurrence of each unique header
const uniqueHeaderArray = Array.from(uniqueHeaders);
const firstUniqueHeader = uniqueHeaderArray[0];
const firstHeaderIndex = cleaned.indexOf(firstUniqueHeader);
if (firstHeaderIndex !== -1) {
// Find the second occurrence of the first header
const secondHeaderIndex = cleaned.indexOf(firstUniqueHeader, firstHeaderIndex + 200);
if (secondHeaderIndex !== -1 && secondHeaderIndex < cleaned.length * 0.85) {
// Content repeats here - truncate
cleaned = cleaned.substring(0, secondHeaderIndex).trim();
}
}
}
}
return cleaned;
}
/**
* Escape HTML special characters
*/
export function escapeHtml(text: string): string {
const map: Record<string, string> = {
'&': '&amp;',
'<': '&lt;',
'>': '&gt;',
'"': '&quot;',
"'": '&#039;',
};
return text.replace(/[&<>"']/g, (m) => map[m]);
}

427
test-parser-report.test.ts

@ -1,427 +0,0 @@ @@ -1,427 +0,0 @@
import { Parser } from './src/parser';
import { generateHTMLReport, escapeHtml } from './src/utils/report-generator';
import * as fs from 'fs';
import * as path from 'path';
/**
* Test that parses both markdown and asciidoc test documents
* and generates an HTML report showing the parsing results
*/
describe('Parser Test Report', () => {
const parser = new Parser({
linkBaseURL: 'https://example.com',
wikilinkUrl: '/events?d={dtag}',
hashtagUrl: '/notes?t={topic}',
});
test('Generate HTML test report for markdown and asciidoc documents', async () => {
// Read test documents
const markdownContent = fs.readFileSync(
path.join(__dirname, 'markdown_testdoc.md'),
'utf-8'
);
const asciidocContent = fs.readFileSync(
path.join(__dirname, 'asciidoc_testdoc.adoc'),
'utf-8'
);
// Parse both documents
const markdownResult = await parser.process(markdownContent);
const asciidocResult = await parser.process(asciidocContent);
// Generate HTML report
const htmlReport = generateHTMLReport({
markdown: {
original: markdownContent,
result: markdownResult,
},
asciidoc: {
original: asciidocContent,
result: asciidocResult,
},
});
// Write HTML report to file
const reportPath = path.join(__dirname, 'test-report.html');
fs.writeFileSync(reportPath, htmlReport, 'utf-8');
console.log(`\n✅ Test report generated: ${reportPath}`);
console.log(` Open this file in your browser to view the results.\n`);
// ============================================
// Basic assertions to ensure parsing worked
// ============================================
expect(markdownResult.content).toBeTruthy();
expect(asciidocResult.content).toBeTruthy();
expect(markdownResult.content.length).toBeGreaterThan(0);
expect(asciidocResult.content.length).toBeGreaterThan(0);
// ============================================
// Test HTML Report Structure
// ============================================
expect(htmlReport).toContain('GC Parser Test Report');
expect(htmlReport).toContain('Markdown Document Test');
expect(htmlReport).toContain('AsciiDoc Document Test');
expect(htmlReport).toContain('class="tabs"');
expect(htmlReport).toContain('class="tab-content"');
// ============================================
// Test Markdown Rendering
// ============================================
const markdownHtml = markdownResult.content;
// Check if AsciiDoctor successfully converted the content to HTML
// If it failed, the content will be plain text with AsciiDoc macros or just wrapped in <p>
// Real HTML will have multiple HTML elements, not just a single <p> wrapper
const isHtmlRendered = markdownHtml.includes('<a') ||
markdownHtml.includes('<img') ||
markdownHtml.includes('<div class') ||
(markdownHtml.includes('<h') && markdownHtml.includes('</h')) ||
(markdownHtml.includes('<ul') || markdownHtml.includes('<ol'));
if (isHtmlRendered) {
// Test that links are rendered as <a> tags (not escaped HTML)
expect(markdownHtml).toMatch(/<a\s+href=["']https?:\/\/[^"']+["'][^>]*>/i);
expect(markdownHtml).not.toContain('&lt;a href='); // Should not be escaped HTML
expect(markdownHtml).not.toContain('href="&quot;'); // Should not have double-escaped quotes
// Test wss:// URL rendering - should be a clickable link, not OpenGraph
expect(markdownHtml).toMatch(/<a\s+href=["']https:\/\/theforest\.nostr1\.com[^"']*["'][^>]*>wss:\/\/theforest\.nostr1\.com/i);
// Should NOT be wrapped in opengraph-link-container
const wssLinkMatch = markdownHtml.match(/<a[^>]*href=["']https:\/\/theforest\.nostr1\.com[^"']*["'][^>]*>wss:\/\/theforest\.nostr1\.com/i);
if (wssLinkMatch) {
const linkHtml = wssLinkMatch[0];
expect(linkHtml).not.toContain('opengraph-link-container');
expect(linkHtml).not.toContain('opengraph-link');
}
// Test that www.example.com is rendered as a link (not plaintext after "hyperlink:")
expect(markdownHtml).toMatch(/<a\s+href=["']https:\/\/www\.example\.com[^"']*["'][^>]*>www\.example\.com/i);
// Test images are rendered
expect(markdownHtml).toMatch(/<img[^>]+src=["']https:\/\/blog\.ronin\.cloud[^"']+["'][^>]*>/i);
// Test media embeds
expect(markdownHtml).toContain('youtube-embed');
expect(markdownHtml).toContain('spotify-embed');
expect(markdownHtml).toContain('video-embed');
expect(markdownHtml).toContain('audio-embed');
// Test nostr links are rendered
expect(markdownHtml).toMatch(/class=["'][^"']*nostr-link[^"']*["']/i);
// Test wikilinks are rendered
expect(markdownHtml).toMatch(/class=["'][^"']*wikilink[^"']*["']/i);
// Test hashtags are rendered
expect(markdownHtml).toMatch(/class=["'][^"']*hashtag-link[^"']*["']/i);
} else {
// AsciiDoctor failed - content is plain text with AsciiDoc macros
// This is expected in Jest due to Opal runtime issues
// Just verify the content exists and contains expected text
expect(markdownHtml).toContain('Markdown Test Document');
expect(markdownHtml).toContain('Media and Links');
console.warn('⚠ AsciiDoctor conversion failed in Jest - skipping HTML rendering tests');
}
// Test frontmatter is extracted
expect(markdownResult.frontmatter).toBeTruthy();
expect(markdownResult.frontmatter?.author).toBe('James Smith');
// ============================================
// Test Metadata Extraction
// ============================================
// Nostr links should be extracted
expect(markdownResult.nostrLinks.length).toBeGreaterThan(0);
const hasNaddr = markdownResult.nostrLinks.some(link => link.type === 'naddr');
const hasNpub = markdownResult.nostrLinks.some(link => link.type === 'npub');
const hasNevent = markdownResult.nostrLinks.some(link => link.type === 'nevent');
expect(hasNaddr || hasNpub || hasNevent).toBe(true);
// Wikilinks should be extracted
expect(markdownResult.wikilinks.length).toBeGreaterThan(0);
const hasWikilink = markdownResult.wikilinks.some(wl =>
wl.dtag === 'nkbip-01' || wl.dtag === 'mirepoix'
);
expect(hasWikilink).toBe(true);
// Hashtags should be extracted
expect(markdownResult.hashtags.length).toBeGreaterThan(0);
const hasTestHashtag = markdownResult.hashtags.some(tag =>
tag.toLowerCase() === 'testhashtag' || tag.toLowerCase() === 'inlinehashtag'
);
expect(hasTestHashtag).toBe(true);
// Links should be extracted
expect(markdownResult.links.length).toBeGreaterThan(0);
// Test that nested image links are handled correctly
// [![alt](image-url)](link-url) should extract the outer link with cleaned text
// The link should point to the actual destination (youtube, spotify, etc.), not the image URL
const nestedImageLink = markdownResult.links.find(link =>
(link.url.includes('youtube.com/shorts') || link.url.includes('youtu.be')) ||
link.url.includes('spotify.com') ||
link.url.includes('v.nostr.build') ||
link.url.includes('media.blubrry.com')
);
if (nestedImageLink) {
// The text should NOT contain markdown image syntax
expect(nestedImageLink.text).not.toContain('![');
expect(nestedImageLink.text).not.toContain('](');
// The text should be clean (just the alt text, e.g., "Youtube link with pic")
expect(nestedImageLink.text.length).toBeGreaterThan(0);
// The URL should be the actual destination, not the image URL
expect(nestedImageLink.url).not.toContain('upload.wikimedia.org');
expect(nestedImageLink.url).not.toMatch(/\.(png|jpg|jpeg|svg|gif|webp)$/i);
}
// Test that image URLs from nested links are NOT extracted as regular links
// The inner image URLs (like upload.wikimedia.org) should not be in the links array
// Only the outer link URLs (youtube, spotify, etc.) should be extracted
const imageUrlLinks = markdownResult.links.filter(link =>
link.url.includes('upload.wikimedia.org')
);
// These should not exist - nested image links should only extract the outer link
expect(imageUrlLinks.length).toBe(0);
// Also verify that no link text contains image markdown syntax
markdownResult.links.forEach(link => {
expect(link.text).not.toContain('![');
expect(link.text).not.toContain('](');
});
// Media should be extracted (if present in content)
// Note: Media extraction might depend on the content format and processing
if (markdownResult.media.length > 0) {
const hasYouTube = markdownResult.media.some(url => url.includes('youtube.com') || url.includes('youtu.be'));
const hasSpotify = markdownResult.media.some(url => url.includes('spotify.com'));
const hasAudio = markdownResult.media.some(url => url.includes('.mp3') || url.includes('audio'));
const hasVideo = markdownResult.media.some(url => url.includes('.mp4') || url.includes('video'));
expect(hasYouTube || hasSpotify || hasAudio || hasVideo).toBe(true);
} else {
// Media extraction might not work if AsciiDoctor failed
console.warn('⚠ No media extracted - this may be expected if AsciiDoctor conversion failed');
}
// ============================================
// Test HTML Report Content
// ============================================
// Test that metadata counts are displayed in the report
expect(htmlReport).toMatch(new RegExp(`<div class="number">${markdownResult.nostrLinks.length}</div>`));
expect(htmlReport).toMatch(new RegExp(`<div class="number">${markdownResult.wikilinks.length}</div>`));
expect(htmlReport).toMatch(new RegExp(`<div class="number">${markdownResult.hashtags.length}</div>`));
expect(htmlReport).toMatch(new RegExp(`<div class="number">${markdownResult.links.length}</div>`));
expect(htmlReport).toMatch(new RegExp(`<div class="number">${markdownResult.media.length}</div>`));
// Test that frontmatter is displayed
if (markdownResult.frontmatter) {
expect(htmlReport).toContain('James Smith');
expect(htmlReport).toContain('This is a summary');
}
// Test that rendered HTML is included (not escaped)
// Note: content may be cleaned to remove duplicates, so check for a significant portion
// The raw HTML section should contain the full content (escaped)
const cleanedMarkdown = markdownResult.content.substring(0, Math.min(1000, markdownResult.content.length));
const cleanedAsciidoc = asciidocResult.content.substring(0, Math.min(1000, asciidocResult.content.length));
expect(htmlReport).toContain(cleanedMarkdown);
expect(htmlReport).toContain(cleanedAsciidoc);
// Also verify the raw HTML section contains the full content (escaped)
expect(htmlReport).toContain(escapeHtml(markdownResult.content.substring(0, 500)));
expect(htmlReport).toContain(escapeHtml(asciidocResult.content.substring(0, 500)));
// Test that original content is displayed
expect(htmlReport).toContain('Markdown Test Document');
expect(htmlReport).toContain('Media and Links');
// ============================================
// Test AsciiDoc Rendering
// ============================================
const asciidocHtml = asciidocResult.content;
expect(asciidocHtml.length).toBeGreaterThan(0);
// AsciiDoc should have table of contents
if (asciidocResult.tableOfContents) {
expect(asciidocResult.tableOfContents.length).toBeGreaterThan(0);
}
// ============================================
// Test Specific Edge Cases
// ============================================
if (isHtmlRendered) {
// Test that URLs with query parameters are not broken
const weltUrl = 'https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html';
expect(markdownHtml).toContain(weltUrl);
// Test that code blocks are preserved (URLs in code should not be links)
// The text "this should render as plaintext: `http://www.example.com`" should have the URL in a code tag
expect(markdownHtml).toMatch(/<code[^>]*>http:\/\/www\.example\.com<\/code>/i);
} else {
// If AsciiDoctor failed, just verify the URL is in the content somewhere
const weltUrl = 'https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html';
expect(markdownHtml).toContain(weltUrl);
}
// Test that LaTeX is detected if present
if (markdownResult.hasLaTeX) {
expect(htmlReport).toMatch(/<div class="number">Yes<\/div>.*Has LaTeX/i);
}
// Test that musical notation is detected if present
if (markdownResult.hasMusicalNotation) {
expect(htmlReport).toMatch(/<div class="number">Yes<\/div>.*Has Music/i);
}
// ============================================
// Test for Content Repetition (Doom Loop Fix)
// ============================================
// Extract rendered output sections from the HTML report
// Use a function that properly handles nested divs
function extractRenderedOutputs(html: string): string[] {
const outputs: string[] = [];
const startPattern = /<div class="rendered-output">/gi;
let startMatch;
while ((startMatch = startPattern.exec(html)) !== null) {
const startIndex = startMatch.index + startMatch[0].length;
let depth = 1;
let currentIndex = startIndex;
// Find the matching closing div by counting nested divs
while (depth > 0 && currentIndex < html.length) {
const nextOpen = html.indexOf('<div', currentIndex);
const nextClose = html.indexOf('</div>', currentIndex);
if (nextClose === -1) break; // No more closing tags
if (nextOpen !== -1 && nextOpen < nextClose) {
// Found an opening div before the closing one
depth++;
currentIndex = nextOpen + 4;
} else {
// Found a closing div
depth--;
if (depth === 0) {
// Found the matching closing div
outputs.push(html.substring(startIndex, nextClose).trim());
break;
}
currentIndex = nextClose + 6;
}
}
}
return outputs;
}
const renderedOutputs = extractRenderedOutputs(htmlReport);
// Test that we have rendered output sections
expect(renderedOutputs.length).toBeGreaterThan(0);
// Test each rendered output section for content repetition
renderedOutputs.forEach((output, index) => {
// Check for specific content that should only appear once
const testPhrases = [
'# Markdown Test Document',
'## Bullet list',
'This is a test unordered list with mixed bullets:',
'## Headers',
'## Media and Links',
'### Nostr address',
'## Tables',
'## Code blocks',
'## LateX',
];
testPhrases.forEach(phrase => {
// Count occurrences of the phrase in this output section
const occurrences = (output.match(new RegExp(escapeRegex(phrase), 'gi')) || []).length;
// Each phrase should appear at most once (or a few times if it's in different contexts)
// But if it appears many times, that indicates a repetition loop
if (occurrences > 5) {
throw new Error(
`Content repetition detected in rendered output section ${index + 1}: ` +
`"${phrase}" appears ${occurrences} times (expected ≤5). ` +
`This indicates a doom-loop in content generation.`
);
}
});
// Check for duplicate document structure
// If the entire document structure repeats, we'll see multiple instances of key sections
const sectionHeaders = output.match(/##\s+[^\n]+/g) || [];
const uniqueHeaders = new Set(sectionHeaders.map(h => h.trim()));
// If we have many more headers than unique ones, content is repeating
if (sectionHeaders.length > uniqueHeaders.size * 2) {
throw new Error(
`Content repetition detected in rendered output section ${index + 1}: ` +
`Found ${sectionHeaders.length} section headers but only ${uniqueHeaders.size} unique ones. ` +
`This indicates the entire document is repeating.`
);
}
// Check for repeated code block placeholders (should only appear once per code block)
const codeBlockPlaceholders: string[] = (output.match(/__CODEBLOCK_\d+__/g) || []);
const uniquePlaceholders = new Set(codeBlockPlaceholders);
// Each placeholder should appear only once
if (codeBlockPlaceholders.length !== uniquePlaceholders.size) {
const duplicates = codeBlockPlaceholders.filter((p, i) => codeBlockPlaceholders.indexOf(p) !== i);
throw new Error(
`Content repetition detected in rendered output section ${index + 1}: ` +
`Found duplicate code block placeholders: ${Array.from(new Set(duplicates)).join(', ')}. ` +
`Each placeholder should appear only once.`
);
}
// Check overall content length - if it's unreasonably long, content might be repeating
// A typical test document should be under 50KB in the rendered output
if (output.length > 100000) {
console.warn(
` Rendered output section ${index + 1} is very long (${output.length} chars). ` +
`This might indicate content repetition.`
);
}
});
// Test that the markdown content appears only once in the markdown rendered section
const markdownRenderedMatch = htmlReport.match(
/<div id="md-rendered"[\s\S]*?<div class="rendered-output">([\s\S]*?)<\/div>/
);
if (markdownRenderedMatch) {
const markdownRendered = markdownRenderedMatch[1];
// Count how many times the document title appears
const titleCount = (markdownRendered.match(/# Markdown Test Document/gi) || []).length;
expect(titleCount).toBeLessThanOrEqual(1);
// Count how many times a unique section appears
const uniqueSection = 'Ordered list that is wrongly numbered:';
const uniqueSectionCount = (markdownRendered.match(new RegExp(escapeRegex(uniqueSection), 'gi')) || []).length;
expect(uniqueSectionCount).toBeLessThanOrEqual(1);
}
// Test that the asciidoc content appears only once in the asciidoc rendered section
const asciidocRenderedMatch = htmlReport.match(
/<div id="ad-rendered"[\s\S]*?<div class="rendered-output">([\s\S]*?)<\/div>/
);
if (asciidocRenderedMatch) {
const asciidocRendered = asciidocRenderedMatch[1];
// Count how many times the document title appears
const titleCount = (asciidocRendered.match(/== Bullet list/gi) || []).length;
expect(titleCount).toBeLessThanOrEqual(1);
}
console.log('✅ Content repetition check passed - no doom-loop detected');
});
});
/**
* Escape special regex characters in a string
*/
function escapeRegex(str: string): string {
return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

10144
test-report.html

File diff suppressed because it is too large Load Diff

4
tsconfig.json

@ -3,8 +3,8 @@ @@ -3,8 +3,8 @@
"target": "ES2020",
"module": "commonjs",
"lib": ["ES2020"],
"types": ["node", "jest"],
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
@ -14,6 +14,6 @@ @@ -14,6 +14,6 @@
"sourceMap": true,
"resolveJsonModule": true
},
"include": ["src/**/*", "generate-test-report.ts"],
"include": ["src/**/*"],
"exclude": ["node_modules", "dist", "**/*.test.ts"]
}

10
tsconfig.test.json

@ -1,10 +0,0 @@ @@ -1,10 +0,0 @@
{
"extends": "./tsconfig.json",
"compilerOptions": {
"rootDir": ".",
"types": ["node", "jest"],
"noEmit": true
},
"include": ["src/**/*", "**/*.test.ts", "generate-test-report.ts"],
"exclude": ["node_modules", "dist"]
}
Loading…
Cancel
Save