Browse Source

add unit tests

master
Silberengel 2 weeks ago
parent
commit
5fcfc11dfb
  1. 370
      asciidoc_testdoc.adoc
  2. 72
      debug-asciidoc-output.adoc
  3. 27
      debug-lists.ts
  4. 642
      generate-test-report.js
  5. 642
      generate-test-report.ts
  6. 25
      jest.config.js
  7. 337
      markdown_testdoc.md
  8. 11
      package.json
  9. 692
      src/converters/to-asciidoc.js
  10. 423
      src/converters/to-asciidoc.ts
  11. 70
      src/detector.js
  12. 160
      src/extractors/frontmatter.js
  13. 177
      src/extractors/frontmatter.ts
  14. 243
      src/extractors/metadata.js
  15. 92
      src/parser.js
  16. 25
      src/parser.ts
  17. 148
      src/processors/asciidoc.js
  18. 7
      src/processors/asciidoc.ts
  19. 594
      src/processors/html-postprocess.js
  20. 326
      src/processors/html-postprocess.ts
  21. 239
      src/processors/html-utils.js
  22. 33
      src/processors/html-utils.ts
  23. 143
      src/processors/music.js
  24. 115
      src/processors/music.ts
  25. 14
      src/types.js
  26. 16
      src/types.ts
  27. 628
      test-parser-report.test.ts
  28. 13415
      test-report.html

370
asciidoc_testdoc.adoc

@ -0,0 +1,370 @@ @@ -0,0 +1,370 @@
= AsciiDoc Test Document
Kismet Lee
2.9, October 31, 2021: Fall incarnation
:description: Test description
:author: Kismet Lee
:date: 2021-10-31
:version: 2.9
:status: Draft
:keywords: AsciiDoc, Test, Document
:category: Test
:language: English
== Bullet list
This is a test unordered list with mixed bullets:
* First item with a number 2. in it
* Second item
* Third item
** Indented item
** Indented item
* Fourth item
Another unordered list:
* 1st item
* 2nd item
* third item containing _italic_ text
** indented item
** second indented item
* fourth item
This is a test ordered list with indented items:
. First item
. Second item
. Third item
.. Indented item
.. Indented item
. Fourth item
Ordered list where everything has no number:
. First item
. Second item
. Third item
. Fourth item
This is a mixed list with indented items:
. First item
. Second item
. Third item
* Indented item
* Indented item
. Fourth item
This is another mixed list with indented items:
* First item
* Second item
* Third item
. Indented item
. Indented item
* Fourth item
== Headers
=== Third-level header
==== Fourth-level header
===== Fifth-level header
====== Sixth-level header
== Media and Links
=== Nostr address
This should be ignored and rendered as plaintext: naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
This is also plaintext:
npub1gv069u6q7zkl393ad47xutpqmyfj0rrfrlnqnlfc2ld38k8nnl4st9wa6q
These should be turned into links:
nostr:naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
nostr:npub1l5sga6xg72phsz5422ykujprejwud075ggrr3z2hwyrfgr7eylqstegx9z
nostr:nevent1qvzqqqqqqypzp382htsmu08k277ps40wqhnfm60st89h5pvjyutghq9cjasuh38qqythwumn8ghj7un9d3shjtnswf5k6ctv9ehx2ap0qqsysletg3lqnl4uy59xsj4rp9rgw67wg23l827f4uvn5ckn20fuxcq45d8pj
nostr:nprofile1qqsxhedgkuneycxpcdjlg6tgtxdy8gurdz64nq2h0flc288a0jag98qguy3nh
nostr:note1txyefcha2xt3pgungx4k6j077dsteyef6hzpyuuku00s4h0eymzq4k33yg
=== Hashtag
#testhashtag at the start of the line and #inlinehashtag in the middle
=== Wikilinks
[[NKBIP-01|Specification]] and [[mirepoix]]
=== URL
https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html
link:https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html[Welt Online link]
this should render as plaintext: `http://www.example.com`
this should be a hyperlink: www.example.com
this should be a hyperlink to the http URL with the same address, so wss://theforest.nostr1.com should render like link:wss://theforest.nostr1.com[https://theforest.nostr1.com]
=== Images
https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
image::https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png[test image, width=100%]
=== Media
==== YouTube
https://youtube.com/shorts/ZWfvChb-i0w
link:https://youtube.com/shorts/ZWfvChb-i0w[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Youtube link with pic]]
==== Spotify
https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ
link:https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Spotify link with pic]]
==== Audio
https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3
link:https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Audio link with pic]]
==== Video
https://v.nostr.build/MTjaYib4upQuf8zn.mp4
link:https://v.nostr.build/MTjaYib4upQuf8zn.mp4[image:https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png[Video link with pic]]
== Tables
=== Orderly
[cols="1,2"]
|===
|Syntax|Description
|Header|Title
|Paragraph|Text
|===
=== Unorderly
[cols="1,2"]
|===
|Syntax|Description
|Header|Title
|Paragraph|Text
|===
=== With alignment
[cols="<,^,>"]
|===
|Syntax|Description|Test Text
|Header|Title|Here's this
|Paragraph|Text|And more
|===
== Code blocks
=== json
[source,json]
----
{
"id": "<event_id>",
"pubkey": "<event_originator_pubkey>",
"created_at": 1725087283,
"kind": 30040,
"tags": [
["d", "aesop's-fables-by-aesop"],
["title", "Aesop's Fables"],
["author", "Aesop"],
],
"sig": "<event_signature>"
}
----
=== typescript
[source,typescript]
----
/**
* Get Nostr identifier type
*/
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
if (id.startsWith('npub')) return 'npub';
if (id.startsWith('nprofile')) return 'nprofile';
if (id.startsWith('nevent')) return 'nevent';
if (id.startsWith('naddr')) return 'naddr';
if (id.startsWith('note')) return 'note';
return null;
}
----
=== shell
[source,shell]
----
mkdir new_directory
cp source.txt destination.txt
----
=== LaTeX
[source,latex]
----
$$
M =
\begin{bmatrix}
\frac{5}{6} & \frac{1}{6} & 0 \\[0.3em]
\frac{5}{6} & 0 & \frac{1}{6} \\[0.3em]
0 & \frac{5}{6} & \frac{1}{6}
\end{bmatrix}
$$
----
[source,latex]
----
$$
f(x)=
\begin{cases}
1/d_{ij} & \quad \text{when $d_{ij} \leq 160$}\\
0 & \quad \text{otherwise}
\end{cases}
$$
----
=== ABC Notation
[source,abc]
----
X:1
T:Ohne Titel
C:Aufgezeichnet 1784
A:Seibis nahe Lichtenberg in Oberfranken
S:Handschrift, bezeichnet und datiert: "Heinrich Nicol Philipp zu Seibis den 30 Junius 1784"
M:4/4
L:1/4
K:D
dd d2 | ee e2 | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
|:\
fg ad | cB cA | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
----
=== PlantUML
[source,plantuml]
----
@startuml
Alice -> Bob: Authentication Request
Bob --> Alice: Authentication Response
@enduml
----
=== BPMN
[source,plantuml]
----
@startbpmn
start
:Task 1;
:Task 2;
stop
@endbpmn
----
== LaTeX
=== LaTeX in inline-code
`$[ x^n + y^n = z^n \]$` and `$[\sqrt{x^2+1}\]$` and `$\color{blue}{X \sim Normal \; (\mu,\sigma^2)}$`
== LaTeX outside of code
This is a latex code block $$\mathbb{N} = \{ a \in \mathbb{Z} : a > 0 \}$$ and another that is an inline latex $\color{green}{X \sim Normal \; (\mu,\sigma^2)}$ and should be green
== Footnotes
Here's a simple footnote,footnote:[This is the first footnote.] and here's a longer one.footnote:[Here's one with multiple paragraphs and code.]
== Anchor links
<<bullet-list,Link to bullet list section>>
== Formatting
=== Strikethrough
[line-through]#The world is flat.# We now know that the world is round. This should not be ~struck~ through.
=== Bold
This is *bold* text. So is this *bold* text.
=== Italic
This is _italic_ text. So is this _italic_ text.
=== Task List
* [x] Write the press release
* [ ] Update the website
* [ ] Contact the media
=== Emoji shortcodes
Gone camping! :tent: Be back soon.
That is so funny! :joy:
=== Marking and highlighting text
I need to highlight these [highlight]#very important words#.
=== Subscript and Superscript
H~2~O
X^2^
=== Delimiter
based upon a -
'''
based upon a *
'''
=== Quotes
[quote]
____
This is a single line blockquote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
____
[quote]
____
This is a multi line blockquote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
This is a multi line blockquote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
This is a multi line blockquote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
This is a multi line blockquote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
____

72
debug-asciidoc-output.adoc

@ -0,0 +1,72 @@ @@ -0,0 +1,72 @@
This is a test unordered list with mixed bullets:
* First item with a number 2. in it
* Second item
* Third item
* Indented item
* Indented item
* Fourth item
Another unordered list:
* 1st item
* 2nd item
* third item containing _italic_ text
* indented item
* second indented item
* fourth item
This is a test ordered list with indented items:
. First item
. Second item
. Third item
. Indented item
. Indented item
. Fourth item
Ordered list where everything has the same number:
. First item
. Second item
. Third item
. Fourth item
Ordered list that is wrongly numbered:
. First item
. Second item
. Third item
. Fourth item
This is a mixed list with indented items:
. First item
. Second item
. Third item
* Indented item
* Indented item
. Fourth item
This is another mixed list with indented items:
* First item
* Second item
* Third item
. Indented item
. Indented item
* Fourth item

27
debug-lists.ts

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
import { convertToAsciidoc } from './src/converters/to-asciidoc';
import { detectFormat } from './src/detector';
import * as fs from 'fs';
import * as path from 'path';
// Load the markdown fixture that sits next to this script
const fixturePath = path.join(__dirname, 'markdown_testdoc.md');
const markdownContent = fs.readFileSync(fixturePath, 'utf-8');

// Keep only the text between the "## Bullet list" heading and the next "##";
// fall back to the whole document when that slice is missing or empty
const afterBulletHeading = markdownContent.split('## Bullet list')[1];
const listSection = afterBulletHeading?.split('##')[0] || markdownContent;

console.log('=== ORIGINAL MARKDOWN ===');
console.log(listSection);

console.log('\n=== DETECTED FORMAT ===');
const detectedFormat = detectFormat(listSection);
console.log(detectedFormat);

console.log('\n=== CONVERTED ASCIIDOC ===');
const convertedText = convertToAsciidoc(listSection, detectedFormat, '', {});
console.log(convertedText);

// Persist the conversion so it can be inspected after the run
const outputPath = path.join(__dirname, 'debug-asciidoc-output.adoc');
fs.writeFileSync(outputPath, convertedText);
console.log('\n=== Written to debug-asciidoc-output.adoc ===');

642
generate-test-report.js

@ -0,0 +1,642 @@ @@ -0,0 +1,642 @@
"use strict";
// Compiler-emitted interop helper: exposes property `k` of module `m` on
// object `o` under the name `k2` (which defaults to `k`).
// An existing `this.__createBinding` is reused when one is present.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Use m's own descriptor only when it is safe to copy as-is. Otherwise
// (no descriptor, an accessor on a non-ES module, or a writable/configurable
// data property) substitute an enumerable getter that reads m[k] on access.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when Object.create is unavailable: copy the current value.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Compiler-emitted interop helper: stores `v` as the "default" property of `o`.
// An existing `this.__setModuleDefault` is reused when one is present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
// Fallback when Object.create is unavailable: plain assignment.
o["default"] = v;
});
// Compiler-emitted interop helper backing `import * as x from '...'`.
// An existing `this.__importStar` is reused when one is present.
var __importStar = (this && this.__importStar) || (function () {
// Lists an object's own property names. On first call it replaces itself with
// Object.getOwnPropertyNames, or with a for-in/hasOwnProperty fallback.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged __esModule are returned untouched.
if (mod && mod.__esModule) return mod;
// Otherwise build a namespace object: bind every own key except "default",
// then set "default" to the module itself.
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const parser_1 = require("./src/parser");
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
/**
 * Parses the markdown and AsciiDoc test documents located next to this script
 * and writes the combined HTML report to `test-report.html` in the same
 * directory, logging progress and a short summary to the console.
 *
 * @returns A promise that settles once the report file has been written.
 *   Errors from reading, parsing or writing are not caught here.
 */
async function main() {
    const parser = new parser_1.Parser({
        linkBaseURL: 'https://example.com',
        wikilinkUrl: '/events?d={dtag}',
        hashtagUrl: '/notes?t={topic}',
    });
    console.log('Reading test documents...');
    // Read test documents
    const markdownContent = fs.readFileSync(path.join(__dirname, 'markdown_testdoc.md'), 'utf-8');
    const asciidocContent = fs.readFileSync(path.join(__dirname, 'asciidoc_testdoc.adoc'), 'utf-8');
    console.log('Parsing markdown document...');
    const markdownResult = await parser.process(markdownContent);
    console.log('Parsing asciidoc document...');
    const asciidocResult = await parser.process(asciidocContent);
    console.log('Generating HTML report...');
    // Generate HTML report
    const htmlReport = generateHTMLReport({
        markdown: {
            original: markdownContent,
            result: markdownResult,
        },
        asciidoc: {
            original: asciidocContent,
            result: asciidocResult,
        },
    });
    // Write HTML report to file (force fresh write)
    const reportPath = path.join(__dirname, 'test-report.html');
    // Delete old report if it exists to ensure fresh generation
    if (fs.existsSync(reportPath)) {
        fs.unlinkSync(reportPath);
    }
    fs.writeFileSync(reportPath, htmlReport, 'utf-8');
    // Report the size of the bytes actually written: string length counts
    // UTF-16 code units and under-reports documents with non-ASCII text.
    const reportSizeKb = Buffer.byteLength(htmlReport, 'utf-8') / 1024;
    console.log(`\n✅ Test report generated successfully!`);
    console.log(` File: ${reportPath}`);
    console.log(` Size: ${reportSizeKb.toFixed(2)} KB`);
    console.log(` Timestamp: ${new Date().toISOString()}`);
    console.log(` Open this file in your browser to view the results.\n`);
}
/**
 * Renders one card of a section's overview grid.
 * `value` and `label` are interpolated as-is; callers pass counts or fixed text.
 */
function renderStatCard(value, label) {
    return `
<div class="stat-card">
<div class="number">${value}</div>
<div class="label">${label}</div>
</div>`;
}
/**
 * Renders a titled list of metadata entries, or an empty string when `items`
 * is empty. `renderItem` returns the inner HTML for a single entry.
 */
function renderMetadataList(title, items, renderItem) {
    if (items.length === 0) {
        return '';
    }
    const rows = items.map((item) => `
<div class="list-item">
${renderItem(item)}
</div>
`).join('');
    return `
<h4>${title} (${items.length})</h4>
${rows}
`;
}
/**
 * Renders the tabbed report section for one parsed document.
 * `idPrefix` namespaces the tab element ids ('md' / 'ad'), `formatName` is the
 * display name used in headings, and `doc` is `{ original, result }`.
 */
function renderDocumentSection(idPrefix, formatName, doc) {
    const { original, result } = doc;
    const tabs = [
        ['overview', 'Overview'],
        ['original', 'Original Content'],
        ['rendered', 'Rendered Output'],
        ['metadata', 'Metadata'],
    ];
    // data-tab lets the page script find the button that belongs to a pane
    // without relying on the implicit global `event`.
    const tabButtons = tabs.map(([suffix, label], index) => {
        const tabId = `${idPrefix}-${suffix}`;
        const activeClass = index === 0 ? ' active' : '';
        return `<button class="tab${activeClass}" data-tab="${tabId}" onclick="showTab('${tabId}')">${label}</button>`;
    }).join('\n');
    const statCards = [
        renderStatCard(result.nostrLinks.length, 'Nostr Links'),
        renderStatCard(result.wikilinks.length, 'Wikilinks'),
        renderStatCard(result.hashtags.length, 'Hashtags'),
        renderStatCard(result.links.length, 'Links'),
        renderStatCard(result.media.length, 'Media URLs'),
        renderStatCard(result.hasLaTeX ? 'Yes' : 'No', 'Has LaTeX'),
        renderStatCard(result.hasMusicalNotation ? 'Yes' : 'No', 'Has Music'),
    ].join('');
    const frontmatter = result.frontmatter ? `
<div class="metadata-grid">
${Object.entries(result.frontmatter).map(([key, value]) => `
<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>
`).join('')}
</div>
` : '<p><em>No frontmatter found</em></p>';
    const metadataLists = [
        renderMetadataList('Nostr Links', result.nostrLinks, (link) => `<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>
${link.text ? ` - ${escapeHtml(link.text)}` : ''}`),
        renderMetadataList('Wikilinks', result.wikilinks, (wl) => `<code>${escapeHtml(wl.original)}</code> dtag: <code>${escapeHtml(wl.dtag)}</code>
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''}`),
        renderMetadataList('Hashtags', result.hashtags, (tag) => `<code>#${escapeHtml(tag)}</code>`),
        renderMetadataList('Links', result.links, (link) => `<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>
${link.isExternal ? '<span class="warning-badge">External</span>' : ''}`),
        renderMetadataList('Media URLs', result.media, (url) => `<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>`),
    ].join('');
    const tableOfContents = result.tableOfContents ? `
<h4>Table of Contents</h4>
<div class="rendered-output">
${result.tableOfContents}
</div>
` : '';
    return `<!-- ${formatName} Section -->
<div class="section">
<h2>${formatName} Document Test <span class="success-badge"> Parsed</span></h2>
<div class="tabs">
${tabButtons}
</div>
<div id="${idPrefix}-overview" class="tab-content active">
<div class="stats">${statCards}
</div>
<h3>Frontmatter</h3>
${frontmatter}
</div>
<div id="${idPrefix}-original" class="tab-content">
<h3>Original ${formatName} Content</h3>
<div class="code-block">
<pre>${escapeHtml(original)}</pre>
</div>
</div>
<div id="${idPrefix}-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${result.content}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(result.content)}</pre>
</div>
</details>
</div>
<div id="${idPrefix}-metadata" class="tab-content">
<h3>Extracted Metadata</h3>
${metadataLists}
${tableOfContents}
</div>
</div>`;
}
/**
 * Builds the complete, self-contained HTML test report (inline CSS and
 * script) for the markdown and AsciiDoc parsing results.
 *
 * Each of `data.markdown` and `data.asciidoc` is `{ original, result }`. From
 * `result` this reads the arrays `nostrLinks`, `wikilinks`, `hashtags`,
 * `links` and `media`, the flags `hasLaTeX` and `hasMusicalNotation`, and the
 * optional `frontmatter`, `content` and `tableOfContents`.
 *
 * `result.content` and `result.tableOfContents` are inserted unescaped so the
 * rendered output can be inspected; all other values go through escapeHtml.
 *
 * @param data The two parsed documents to report on.
 * @returns The report as one HTML string.
 */
function generateHTMLReport(data) {
    const { markdown, asciidoc } = data;
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GC Parser Test Report</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
line-height: 1.6;
color: #333;
background: #f5f5f5;
padding: 20px;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
color: #2c3e50;
margin-bottom: 10px;
font-size: 2.5em;
}
.subtitle {
color: #7f8c8d;
margin-bottom: 30px;
font-size: 1.1em;
}
.section {
background: white;
border-radius: 8px;
padding: 30px;
margin-bottom: 30px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.section h2 {
color: #34495e;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #3498db;
font-size: 1.8em;
}
.section h3 {
color: #2c3e50;
margin-top: 25px;
margin-bottom: 15px;
font-size: 1.3em;
}
.tabs {
display: flex;
gap: 10px;
margin-bottom: 20px;
border-bottom: 2px solid #e0e0e0;
}
.tab {
padding: 12px 24px;
background: #f8f9fa;
border: none;
border-top-left-radius: 6px;
border-top-right-radius: 6px;
cursor: pointer;
font-size: 1em;
font-weight: 500;
color: #555;
transition: all 0.2s;
}
.tab:hover {
background: #e9ecef;
}
.tab.active {
background: #3498db;
color: white;
}
.tab-content {
display: none;
}
.tab-content.active {
display: block;
}
.metadata-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 15px;
margin-top: 15px;
}
.metadata-item {
background: #f8f9fa;
padding: 12px;
border-radius: 4px;
border-left: 3px solid #3498db;
}
.metadata-item strong {
color: #2c3e50;
display: block;
margin-bottom: 5px;
}
.metadata-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.9em;
}
.code-block {
background: #2d2d2d;
color: #f8f8f2;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.9em;
line-height: 1.5;
margin: 15px 0;
max-height: 400px;
overflow-y: auto;
}
.code-block pre {
margin: 0;
white-space: pre-wrap;
word-wrap: break-word;
}
.rendered-output {
background: white;
border: 1px solid #ddd;
padding: 20px;
border-radius: 6px;
margin: 15px 0;
min-height: 200px;
}
.rendered-output * {
max-width: 100%;
}
.stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 15px;
margin-top: 20px;
}
.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
.stat-card .number {
font-size: 2.5em;
font-weight: bold;
margin-bottom: 5px;
}
.stat-card .label {
font-size: 0.9em;
opacity: 0.9;
}
.list-item {
background: #f8f9fa;
padding: 8px 12px;
margin: 5px 0;
border-radius: 4px;
border-left: 3px solid #95a5a6;
}
.list-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.85em;
}
.success-badge {
display: inline-block;
background: #27ae60;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}
.warning-badge {
display: inline-block;
background: #f39c12;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}
.comparison {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 20px;
margin-top: 20px;
}
@media (max-width: 768px) {
.comparison {
grid-template-columns: 1fr;
}
}
.json-view {
background: #f8f9fa;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.85em;
max-height: 300px;
overflow-y: auto;
}
</style>
</head>
<body>
<div class="container">
<h1>GC Parser Test Report</h1>
<p class="subtitle">Generated: ${new Date().toLocaleString()}</p>
${renderDocumentSection('md', 'Markdown', markdown)}
${renderDocumentSection('ad', 'AsciiDoc', asciidoc)}
</div>
<script>
function showTab(tabId) {
const selectedContent = document.getElementById(tabId);
if (!selectedContent) {
return;
}
// Only touch the section that owns this tab so the other document's
// section keeps its currently selected tab and content.
const section = selectedContent.closest('.section') || document;
section.querySelectorAll('.tab-content').forEach(content => content.classList.remove('active'));
section.querySelectorAll('.tab').forEach(tab => tab.classList.remove('active'));
// Show selected tab content
selectedContent.classList.add('active');
// Highlight the button that targets this pane
const selectedTab = section.querySelector('.tab[data-tab="' + tabId + '"]');
if (selectedTab) {
selectedTab.classList.add('active');
}
}
</script>
</body>
</html>`;
}
/**
 * Escapes the five HTML-significant characters (& < > " ') so that `text`
 * can be placed in element content or inside a quoted attribute value.
 *
 * @param text Value to escape. Non-string values are converted with String();
 *   null and undefined yield an empty string instead of throwing, which
 *   matters for callers passing `JSON.stringify(undefined)` or optional fields.
 * @returns The escaped string.
 */
function escapeHtml(text) {
    if (text === null || text === undefined) {
        return '';
    }
    const map = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#039;',
    };
    return String(text).replace(/[&<>"']/g, (m) => map[m]);
}
// Entry point: start the report generation; any rejection is logged and the
// process terminates with exit code 1.
function reportFailure(err) {
    console.error('Error generating test report:', err);
    process.exit(1);
}
main().catch(reportFailure);

642
generate-test-report.ts

@ -0,0 +1,642 @@ @@ -0,0 +1,642 @@
import { Parser } from './src/parser';
import * as fs from 'fs';
import * as path from 'path';
/**
* Script that parses both markdown and asciidoc test documents
* and generates an HTML report showing the parsing results
*/
/** One test document together with what the parser produced for it. */
interface TestData {
/** Raw text of the test document as read from disk. */
original: string;
/** Value resolved by `parser.process(original)`; typed loosely as `any` here. */
result: any;
}
/** Input of `generateHTMLReport`: the two parsed documents shown in the report. */
interface ReportData {
/** Source and parse result of `markdown_testdoc.md`. */
markdown: TestData;
/** Source and parse result of `asciidoc_testdoc.adoc`. */
asciidoc: TestData;
}
/**
 * Parses the markdown and AsciiDoc test documents located next to this script
 * and writes the combined HTML report to `test-report.html` in the same
 * directory, logging progress and a short summary to the console.
 *
 * @returns A promise that settles once the report file has been written.
 *   Errors from reading, parsing or writing are not caught here.
 */
async function main(): Promise<void> {
  const parser = new Parser({
    linkBaseURL: 'https://example.com',
    wikilinkUrl: '/events?d={dtag}',
    hashtagUrl: '/notes?t={topic}',
  });
  console.log('Reading test documents...');
  // Read test documents
  const markdownContent = fs.readFileSync(
    path.join(__dirname, 'markdown_testdoc.md'),
    'utf-8'
  );
  const asciidocContent = fs.readFileSync(
    path.join(__dirname, 'asciidoc_testdoc.adoc'),
    'utf-8'
  );
  console.log('Parsing markdown document...');
  const markdownResult = await parser.process(markdownContent);
  console.log('Parsing asciidoc document...');
  const asciidocResult = await parser.process(asciidocContent);
  console.log('Generating HTML report...');
  // Generate HTML report
  const htmlReport = generateHTMLReport({
    markdown: {
      original: markdownContent,
      result: markdownResult,
    },
    asciidoc: {
      original: asciidocContent,
      result: asciidocResult,
    },
  });
  // Write HTML report to file (force fresh write)
  const reportPath = path.join(__dirname, 'test-report.html');
  // Delete old report if it exists to ensure fresh generation
  if (fs.existsSync(reportPath)) {
    fs.unlinkSync(reportPath);
  }
  fs.writeFileSync(reportPath, htmlReport, 'utf-8');
  // Report the size of the bytes actually written: string length counts
  // UTF-16 code units and under-reports documents with non-ASCII text.
  const reportSizeKb = Buffer.byteLength(htmlReport, 'utf-8') / 1024;
  console.log(`\n✅ Test report generated successfully!`);
  console.log(` File: ${reportPath}`);
  console.log(` Size: ${reportSizeKb.toFixed(2)} KB`);
  console.log(` Timestamp: ${new Date().toISOString()}`);
  console.log(` Open this file in your browser to view the results.\n`);
}
function generateHTMLReport(data: ReportData): string {
const { markdown, asciidoc } = data;
return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GC Parser Test Report</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
line-height: 1.6;
color: #333;
background: #f5f5f5;
padding: 20px;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
color: #2c3e50;
margin-bottom: 10px;
font-size: 2.5em;
}
.subtitle {
color: #7f8c8d;
margin-bottom: 30px;
font-size: 1.1em;
}
.section {
background: white;
border-radius: 8px;
padding: 30px;
margin-bottom: 30px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.section h2 {
color: #34495e;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #3498db;
font-size: 1.8em;
}
.section h3 {
color: #2c3e50;
margin-top: 25px;
margin-bottom: 15px;
font-size: 1.3em;
}
.tabs {
display: flex;
gap: 10px;
margin-bottom: 20px;
border-bottom: 2px solid #e0e0e0;
}
.tab {
padding: 12px 24px;
background: #f8f9fa;
border: none;
border-top-left-radius: 6px;
border-top-right-radius: 6px;
cursor: pointer;
font-size: 1em;
font-weight: 500;
color: #555;
transition: all 0.2s;
}
.tab:hover {
background: #e9ecef;
}
.tab.active {
background: #3498db;
color: white;
}
.tab-content {
display: none;
}
.tab-content.active {
display: block;
}
.metadata-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 15px;
margin-top: 15px;
}
.metadata-item {
background: #f8f9fa;
padding: 12px;
border-radius: 4px;
border-left: 3px solid #3498db;
}
.metadata-item strong {
color: #2c3e50;
display: block;
margin-bottom: 5px;
}
.metadata-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.9em;
}
.code-block {
background: #2d2d2d;
color: #f8f8f2;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.9em;
line-height: 1.5;
margin: 15px 0;
max-height: 400px;
overflow-y: auto;
}
.code-block pre {
margin: 0;
white-space: pre-wrap;
word-wrap: break-word;
}
.rendered-output {
background: white;
border: 1px solid #ddd;
padding: 20px;
border-radius: 6px;
margin: 15px 0;
min-height: 200px;
}
.rendered-output * {
max-width: 100%;
}
.stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 15px;
margin-top: 20px;
}
.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
.stat-card .number {
font-size: 2.5em;
font-weight: bold;
margin-bottom: 5px;
}
.stat-card .label {
font-size: 0.9em;
opacity: 0.9;
}
.list-item {
background: #f8f9fa;
padding: 8px 12px;
margin: 5px 0;
border-radius: 4px;
border-left: 3px solid #95a5a6;
}
.list-item code {
background: #e9ecef;
padding: 2px 6px;
border-radius: 3px;
font-size: 0.85em;
}
.success-badge {
display: inline-block;
background: #27ae60;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}
.warning-badge {
display: inline-block;
background: #f39c12;
color: white;
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 500;
margin-left: 10px;
}
.comparison {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 20px;
margin-top: 20px;
}
@media (max-width: 768px) {
.comparison {
grid-template-columns: 1fr;
}
}
.json-view {
background: #f8f9fa;
padding: 15px;
border-radius: 6px;
overflow-x: auto;
font-family: 'Courier New', monospace;
font-size: 0.85em;
max-height: 300px;
overflow-y: auto;
}
</style>
</head>
<body>
<div class="container">
<h1>GC Parser Test Report</h1>
<p class="subtitle">Generated: ${new Date().toLocaleString()}</p>
<!-- Markdown Section -->
<div class="section">
<h2>Markdown Document Test <span class="success-badge"> Parsed</span></h2>
<div class="tabs">
<button class="tab active" onclick="showTab('md-overview')">Overview</button>
<button class="tab" onclick="showTab('md-original')">Original Content</button>
<button class="tab" onclick="showTab('md-rendered')">Rendered Output</button>
<button class="tab" onclick="showTab('md-metadata')">Metadata</button>
</div>
<div id="md-overview" class="tab-content active">
<div class="stats">
<div class="stat-card">
<div class="number">${markdown.result.nostrLinks.length}</div>
<div class="label">Nostr Links</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.wikilinks.length}</div>
<div class="label">Wikilinks</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hashtags.length}</div>
<div class="label">Hashtags</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.links.length}</div>
<div class="label">Links</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.media.length}</div>
<div class="label">Media URLs</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hasLaTeX ? 'Yes' : 'No'}</div>
<div class="label">Has LaTeX</div>
</div>
<div class="stat-card">
<div class="number">${markdown.result.hasMusicalNotation ? 'Yes' : 'No'}</div>
<div class="label">Has Music</div>
</div>
</div>
<h3>Frontmatter</h3>
${markdown.result.frontmatter ? `
<div class="metadata-grid">
${Object.entries(markdown.result.frontmatter).map(([key, value]) => `
<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>
`).join('')}
</div>
` : '<p><em>No frontmatter found</em></p>'}
</div>
<div id="md-original" class="tab-content">
<h3>Original Markdown Content</h3>
<div class="code-block">
<pre>${escapeHtml(markdown.original)}</pre>
</div>
</div>
<div id="md-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${markdown.result.content}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(markdown.result.content)}</pre>
</div>
</details>
</div>
<div id="md-metadata" class="tab-content">
<h3>Extracted Metadata</h3>
${markdown.result.nostrLinks.length > 0 ? `
<h4>Nostr Links (${markdown.result.nostrLinks.length})</h4>
${markdown.result.nostrLinks.map((link: any) => `
<div class="list-item">
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>
${link.text ? ` - ${escapeHtml(link.text)}` : ''}
</div>
`).join('')}
` : ''}
${markdown.result.wikilinks.length > 0 ? `
<h4>Wikilinks (${markdown.result.wikilinks.length})</h4>
${markdown.result.wikilinks.map((wl: any) => `
<div class="list-item">
<code>${escapeHtml(wl.original)}</code> dtag: <code>${escapeHtml(wl.dtag)}</code>
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''}
</div>
`).join('')}
` : ''}
${markdown.result.hashtags.length > 0 ? `
<h4>Hashtags (${markdown.result.hashtags.length})</h4>
${markdown.result.hashtags.map((tag: string) => `
<div class="list-item">
<code>#${escapeHtml(tag)}</code>
</div>
`).join('')}
` : ''}
${markdown.result.links.length > 0 ? `
<h4>Links (${markdown.result.links.length})</h4>
${markdown.result.links.map((link: any) => `
<div class="list-item">
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>
${link.isExternal ? '<span class="warning-badge">External</span>' : ''}
</div>
`).join('')}
` : ''}
${markdown.result.media.length > 0 ? `
<h4>Media URLs (${markdown.result.media.length})</h4>
${markdown.result.media.map((url: string) => `
<div class="list-item">
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>
</div>
`).join('')}
` : ''}
${markdown.result.tableOfContents ? `
<h4>Table of Contents</h4>
<div class="rendered-output">
${markdown.result.tableOfContents}
</div>
` : ''}
</div>
</div>
<!-- AsciiDoc Section -->
<div class="section">
<h2>AsciiDoc Document Test <span class="success-badge"> Parsed</span></h2>
<div class="tabs">
<button class="tab active" onclick="showTab('ad-overview')">Overview</button>
<button class="tab" onclick="showTab('ad-original')">Original Content</button>
<button class="tab" onclick="showTab('ad-rendered')">Rendered Output</button>
<button class="tab" onclick="showTab('ad-metadata')">Metadata</button>
</div>
<div id="ad-overview" class="tab-content active">
<div class="stats">
<div class="stat-card">
<div class="number">${asciidoc.result.nostrLinks.length}</div>
<div class="label">Nostr Links</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.wikilinks.length}</div>
<div class="label">Wikilinks</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hashtags.length}</div>
<div class="label">Hashtags</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.links.length}</div>
<div class="label">Links</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.media.length}</div>
<div class="label">Media URLs</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hasLaTeX ? 'Yes' : 'No'}</div>
<div class="label">Has LaTeX</div>
</div>
<div class="stat-card">
<div class="number">${asciidoc.result.hasMusicalNotation ? 'Yes' : 'No'}</div>
<div class="label">Has Music</div>
</div>
</div>
<h3>Frontmatter</h3>
${asciidoc.result.frontmatter ? `
<div class="metadata-grid">
${Object.entries(asciidoc.result.frontmatter).map(([key, value]) => `
<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>
`).join('')}
</div>
` : '<p><em>No frontmatter found</em></p>'}
</div>
<div id="ad-original" class="tab-content">
<h3>Original AsciiDoc Content</h3>
<div class="code-block">
<pre>${escapeHtml(asciidoc.original)}</pre>
</div>
</div>
<div id="ad-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${asciidoc.result.content}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(asciidoc.result.content)}</pre>
</div>
</details>
</div>
<div id="ad-metadata" class="tab-content">
<h3>Extracted Metadata</h3>
${asciidoc.result.nostrLinks.length > 0 ? `
<h4>Nostr Links (${asciidoc.result.nostrLinks.length})</h4>
${asciidoc.result.nostrLinks.map((link: any) => `
<div class="list-item">
<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>
${link.text ? ` - ${escapeHtml(link.text)}` : ''}
</div>
`).join('')}
` : ''}
${asciidoc.result.wikilinks.length > 0 ? `
<h4>Wikilinks (${asciidoc.result.wikilinks.length})</h4>
${asciidoc.result.wikilinks.map((wl: any) => `
<div class="list-item">
<code>${escapeHtml(wl.original)}</code> dtag: <code>${escapeHtml(wl.dtag)}</code>
${wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''}
</div>
`).join('')}
` : ''}
${asciidoc.result.hashtags.length > 0 ? `
<h4>Hashtags (${asciidoc.result.hashtags.length})</h4>
${asciidoc.result.hashtags.map((tag: string) => `
<div class="list-item">
<code>#${escapeHtml(tag)}</code>
</div>
`).join('')}
` : ''}
${asciidoc.result.links.length > 0 ? `
<h4>Links (${asciidoc.result.links.length})</h4>
${asciidoc.result.links.map((link: any) => `
<div class="list-item">
<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>
${link.isExternal ? '<span class="warning-badge">External</span>' : ''}
</div>
`).join('')}
` : ''}
${asciidoc.result.media.length > 0 ? `
<h4>Media URLs (${asciidoc.result.media.length})</h4>
${asciidoc.result.media.map((url: string) => `
<div class="list-item">
<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>
</div>
`).join('')}
` : ''}
${asciidoc.result.tableOfContents ? `
<h4>Table of Contents</h4>
<div class="rendered-output">
${asciidoc.result.tableOfContents}
</div>
` : ''}
</div>
</div>
</div>
<script>
function showTab(tabId) {
// Hide all tab contents
const allContents = document.querySelectorAll('.tab-content');
allContents.forEach(content => content.classList.remove('active'));
// Remove active class from all tabs
const allTabs = document.querySelectorAll('.tab');
allTabs.forEach(tab => tab.classList.remove('active'));
// Show selected tab content
const selectedContent = document.getElementById(tabId);
if (selectedContent) {
selectedContent.classList.add('active');
}
// Add active class to clicked tab
event.target.classList.add('active');
}
</script>
</body>
</html>`;
}
/**
 * Escapes the characters that are significant in HTML (`&`, `<`, `>`, `"`, `'`)
 * so that arbitrary text can be embedded in element content or in a quoted
 * attribute value of the generated report.
 *
 * @param text - Raw text to escape.
 * @returns The text with each special character replaced by its entity.
 */
function escapeHtml(text: string): string {
  // Single pass over the string: every match is replaced independently, so an
  // ampersand introduced by one replacement is never escaped a second time.
  return text.replace(/[&<>"']/g, (ch) => {
    switch (ch) {
      case '&':
        return '&amp;';
      case '<':
        return '&lt;';
      case '>':
        return '&gt;';
      case '"':
        return '&quot;';
      case "'":
        return '&#039;';
      default:
        // Unreachable: the character class above only matches the five cases.
        return ch;
    }
  });
}
// Script entry point: start report generation. A rejected promise is logged
// and converted into a non-zero exit code so callers (npm scripts, CI) see the failure.
main().catch((reportError) => {
  console.error('Error generating test report:', reportError);
  process.exit(1);
});

25
jest.config.js

@ -0,0 +1,25 @@ @@ -0,0 +1,25 @@
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
roots: ['<rootDir>'],
testMatch: ['**/*.test.ts'],
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
collectCoverageFrom: [
'src/**/*.ts',
'!src/**/*.d.ts',
],
globals: {
'ts-jest': {
tsconfig: {
target: 'ES2020',
module: 'commonjs',
lib: ['ES2020'],
types: ['node'],
strict: true,
esModuleInterop: true,
skipLibCheck: true,
forceConsistentCasingInFileNames: true,
},
},
},
};

337
markdown_testdoc.md

@ -0,0 +1,337 @@ @@ -0,0 +1,337 @@
---
# this is YAML front matter
author: James Smith
summary: This is a summary
topics: list, of, topics
variable: one
array:
- one thing
- two things
- several things
# all of this data is available to our layout
---
# Markdown Test Document
## Bullet list
This is a test unordered list with mixed bullets:
* First item with a number 2. in it
* Second item
* Third item
- Indented item
- Indented item
* Fourth item
Another unordered list:
- 1st item
- 2nd item
- third item containing _italic_ text
- indented item
- second indented item
- fourth item
This is a test ordered list with indented items:
1. First item
2. Second item
3. Third item
1. Indented item
2. Indented item
4. Fourth item
Ordered list where everything has the same number:
1. First item
1. Second item
1. Third item
1. Fourth item
Ordered list that is wrongly numbered:
1. First item
8. Second item
3. Third item
5. Fourth item
This is a mixed list with indented items:
1. First item
2. Second item
3. Third item
* Indented item
* Indented item
4. Fourth item
This is another mixed list with indented items:
- First item
- Second item
- Third item
1. Indented item
2. Indented item
- Fourth item
## Headers
### Third-level header
#### Fourth-level header
##### Fifth-level header
###### Sixth-level header
## Media and Links
### Nostr address
This should be ignored and rendered as plaintext: naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
This is also plaintext:
npub1gv069u6q7zkl393ad47xutpqmyfj0rrfrlnqnlfc2ld38k8nnl4st9wa6q
These should be turned into links:
nostr:naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
nostr:npub1l5sga6xg72phsz5422ykujprejwud075ggrr3z2hwyrfgr7eylqstegx9z
nostr:nevent1qvzqqqqqqypzp382htsmu08k277ps40wqhnfm60st89h5pvjyutghq9cjasuh38qqythwumn8ghj7un9d3shjtnswf5k6ctv9ehx2ap0qqsysletg3lqnl4uy59xsj4rp9rgw67wg23l827f4uvn5ckn20fuxcq45d8pj
nostr:nprofile1qqsxhedgkuneycxpcdjlg6tgtxdy8gurdz64nq2h0flc288a0jag98qguy3nh
nostr:note1txyefcha2xt3pgungx4k6j077dsteyef6hzpyuuku00s4h0eymzq4k33yg
### Hashtag
#testhashtag at the start of the line and #inlinehashtag in the middle
### Wikilinks
[[NKBIP-01|Specification]] and [[mirepoix]]
### URL
https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html
[Welt Online link](https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html)
this should render as plaintext: `http://www.example.com`
this should be a hyperlink: www.example.com
this should be a hyperlink to the http URL with the same address, so wss://theforest.nostr1.com should render like [wss://theforest.nostr1.com](https://theforest.nostr1.com)
### Images
Image: https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
![test image](https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png)
### Media
#### YouTube
https://youtube.com/shorts/ZWfvChb-i0w
[![Youtube link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://youtube.com/shorts/ZWfvChb-i0w)
#### Spotify
https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ
[![Spotify link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ)
#### Audio
https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3
[![Audio link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3)
#### Video
https://v.nostr.build/MTjaYib4upQuf8zn.mp4
[![Video link with pic](https://upload.wikimedia.org/wikipedia/commons/thumb/6/62/YouTube_social_white_square_%282024%29.svg/960px-YouTube_social_white_square_%282024%29.svg.png)](https://v.nostr.build/MTjaYib4upQuf8zn.mp4)
## Tables
### Orderly
| Syntax | Description |
| ----------- | ----------- |
| Header | Title |
| Paragraph | Text |
### Unorderly
| Syntax | Description |
| --- | ----------- |
| Header | Title |
| Paragraph | Text |
### With alignment
| Syntax | Description | Test Text |
| :--- | :----: | ---: |
| Header | Title | Here's this |
| Paragraph | Text | And more |
## Code blocks
### json
```json
{
"id": "<event_id>",
"pubkey": "<event_originator_pubkey>",
"created_at": 1725087283,
"kind": 30040,
"tags": [
["d", "aesop's-fables-by-aesop"],
["title", "Aesop's Fables"],
["author", "Aesop"],
],
"sig": "<event_signature>"
}
```
### typescript
```typescript
/**
* Get Nostr identifier type
*/
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
if (id.startsWith('npub')) return 'npub';
if (id.startsWith('nprofile')) return 'nprofile';
if (id.startsWith('nevent')) return 'nevent';
if (id.startsWith('naddr')) return 'naddr';
if (id.startsWith('note')) return 'note';
return null;
}
```
### shell
```shell
mkdir new_directory
cp source.txt destination.txt
```
### LaTeX
```latex
$$
M =
\begin{bmatrix}
\frac{5}{6} & \frac{1}{6} & 0 \\[0.3em]
\frac{5}{6} & 0 & \frac{1}{6} \\[0.3em]
0 & \frac{5}{6} & \frac{1}{6}
\end{bmatrix}
$$
```
```latex
$$
f(x)=
\begin{cases}
1/d_{ij} & \quad \text{when $d_{ij} \leq 160$}\\
0 & \quad \text{otherwise}
\end{cases}
$$
```
### ABC Notation
```abc
X:1
T:Ohne Titel
C:Aufgezeichnet 1784
A:Seibis nahe Lichtenberg in Oberfranken
S:Handschrift, bezeichnet und datiert: "Heinrich Nicol Philipp zu Seibis den 30 Junius 1784"
M:4/4
L:1/4
K:D
dd d2 | ee e2 | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
|:\
fg ad | cB cA | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
```
## LateX
### LaTex in inline-code
`$[ x^n + y^n = z^n \]$` and `$[\sqrt{x^2+1}\]$` and `$\color{blue}{X \sim Normal \; (\mu,\sigma^2)}$`
## LaTex outside of code
This is a latex code block $$\mathbb{N} = \{ a \in \mathbb{Z} : a > 0 \}$$ and another that is an inline latex $\color{green}{X \sim Normal \; (\mu,\sigma^2)}$ and should be green
## Footnotes
Here's a simple footnote,[^1] and here's a longer one.[^bignote]
[^1]: This is the first footnote.
[^bignote]: Here's one with multiple paragraphs and code.
## Anchor links
[Link to bullet list section](#bullet-lists)
## Formatting
### Strikethrough
~~The world is flat.~~ We now know that the world is round. This should not be ~struck~ through.
### Bold
This is *bold* text. So is this **bold** text.
### Italic
This is _italic_ text. So is this __italic__ text.
### Task List
- [x] Write the press release
- [ ] Update the website
- [ ] Contact the media
### Emoji shortcodes
Gone camping! :tent: Be back soon.
That is so funny! :joy:
### Marking and highlighting text
I need to highlight these ==very important words==.
### Subscript and Superscript
H~2~O
X^2^
### Delimiter
based upon a -
---
based upon a *
***
### Quotes
> This is a single line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
> This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
> This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
> This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj

11
package.json

@ -7,6 +7,7 @@ @@ -7,6 +7,7 @@
"scripts": {
"build": "tsc",
"test": "jest",
"test:report": "ts-node generate-test-report.ts",
"prepublishOnly": "npm run build"
},
"keywords": [
@ -21,13 +22,15 @@ @@ -21,13 +22,15 @@
"author": "",
"license": "MIT",
"dependencies": {
"@asciidoctor/core": "^3.0.4"
"@asciidoctor/core": "^3.0.4",
"node-emoji": "^2.2.0"
},
"devDependencies": {
"@types/highlight.js": "^10.1.0",
"@types/jest": "^29.5.11",
"@types/node": "^20.11.0",
"typescript": "^5.3.3",
"jest": "^29.7.0",
"@types/jest": "^29.5.11",
"@types/highlight.js": "^10.1.0"
"ts-jest": "^29.4.6",
"typescript": "^5.3.3"
}
}

692
src/converters/to-asciidoc.js

@ -0,0 +1,692 @@ @@ -0,0 +1,692 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.convertToAsciidoc = convertToAsciidoc;
const types_1 = require("../types");
// node-emoji is loaded defensively: when the package cannot be required,
// `emoji` stays null and emoji shortcode conversion is skipped.
let emoji = null;
try {
    emoji = require('node-emoji');
}
catch (_loadError) {
    // Package not available; keep the null default.
}
/**
 * Strips tracking/analytics query parameters from a URL.
 * Based on jumble's cleanUrl function.
 *
 * A parameter is removed when its name is in the list below, or when it
 * starts with `utm_` or `_`. All other parameters keep their relative order.
 *
 * @param url URL string to clean.
 * @returns The URL as re-serialized by the `URL` class after removal, or the
 *          input unchanged when it cannot be parsed as an absolute URL.
 */
function cleanUrl(url) {
    // Exact parameter names considered tracking noise, grouped by origin.
    const trackingNames = new Set([
        // Google Analytics & Ads
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
        'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
        'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
        // Facebook
        'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
        // Twitter/X
        'twclid', 'twsrc',
        // Microsoft/Bing
        'msclkid',
        // Mailchimp
        'mc_cid', 'mc_eid',
        // Adobe
        'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
        // HubSpot
        'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src',
        'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
        // Marketo
        'mkt_tok',
        // YouTube
        'si', 'feature', 'kw', 'pp',
        // Other common tracking
        'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
        'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
        // Mobile app tracking
        'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
        // Amazon
        'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
        // Affiliate tracking
        'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
        // Social media share tracking
        'share', 'shared', 'sharesource'
    ]);
    const isTracking = (name) => trackingNames.has(name) || name.startsWith('utm_') || name.startsWith('_');
    try {
        const target = new URL(url);
        // Snapshot the distinct names first: deleting while iterating the live
        // searchParams view would skip entries.
        const presentNames = [...new Set(target.searchParams.keys())];
        for (const name of presentNames) {
            if (isTracking(name)) {
                // delete() drops every occurrence of the name.
                target.searchParams.delete(name);
            }
        }
        return target.toString();
    }
    catch {
        // Not parseable as an absolute URL (e.g. relative path): leave untouched.
        return url;
    }
}
/**
 * Unified entry point: turns content of any supported format into AsciiDoc
 * and then applies the format-independent post-processing steps.
 *
 * @param content     Source text in the given format.
 * @param format      A `ContentFormat` value. AsciiDoc, Wikipedia and Markdown
 *                    have dedicated handling; anything else is treated as plain text.
 * @param linkBaseURL Forwarded to wikilink and nostr-address processing.
 * @param options     `enableNostrAddresses: false` skips nostr: address
 *                    processing; any other value (or omission) enables it.
 * @returns The resulting AsciiDoc text.
 */
function convertToAsciidoc(content, format, linkBaseURL, options = {}) {
    let converted;
    if (format === types_1.ContentFormat.AsciiDoc) {
        // Already AsciiDoc: expand literal "\n" sequences, then make sure a
        // heading line is separated from preceding text by a blank line.
        converted = content
            .replace(/\\n/g, '\n')
            .replace(/(\S[^\n]*)\n(={1,6}\s+[^\n]+)/g, (_match, before, header) => `${before}\n\n${header}`);
    }
    else if (format === types_1.ContentFormat.Wikipedia) {
        converted = convertWikipediaToAsciidoc(content);
    }
    else if (format === types_1.ContentFormat.Markdown) {
        converted = convertMarkdownToAsciidoc(content);
    }
    else {
        // ContentFormat.Plain and any unrecognised format.
        converted = convertPlainTextToAsciidoc(content);
    }
    // Special elements are processed for every content type, in a fixed order.
    const withWikilinks = processWikilinks(converted, linkBaseURL);
    const withNostr = options.enableNostrAddresses !== false
        ? processNostrAddresses(withWikilinks, linkBaseURL)
        : withWikilinks;
    // Order matters: media inside markdown links/images, then bare media URLs
    // (YouTube, Spotify, video, audio), then remaining bare URLs, and hashtags
    // last so they do not conflict with URLs.
    // NOTE(review): the markdown-media step runs after the Markdown conversion
    // above has already rewritten [text](url) / ![alt](url) — confirm it still
    // sees the patterns it expects.
    const postProcessors = [
        processMediaUrlsInMarkdown,
        processMediaUrls,
        processBareUrls,
        processHashtags
    ];
    return postProcessors.reduce((text, step) => step(text), withNostr);
}
/**
 * Converts Wikipedia (MediaWiki) markup to AsciiDoc.
 *
 * Handled constructs:
 * - headings `== Title ==` / `==Title==`  -> `== Title` (same number of `=`)
 * - `'''''text'''''` -> `*_text_*` (bold italic)
 * - `'''text'''`     -> `*text*`   (bold)
 * - `''text''`       -> `_text_`   (italic)
 * - external links `[URL label]` / `[URL]` -> `link:URL[label]`
 * - horizontal rules `----` -> `'''`
 *
 * `[[Page]]` / `[[Page|Display]]` links are left untouched; they already use
 * the wikilink syntax. Lists are not rewritten here.
 *
 * @param content Wikipedia-markup text; literal `\n` sequences are expanded to newlines.
 * @returns The converted AsciiDoc text.
 */
function convertWikipediaToAsciidoc(content) {
    let asciidoc = content.replace(/\\n/g, '\n');
    // Headings: the closing run must mirror the opening run on the same line.
    // Spaces around the title are optional in MediaWiki. Only spaces/tabs are
    // allowed as padding so a match can never span several lines, and the
    // title must not start or end with '=' so mismatched runs are left alone.
    asciidoc = asciidoc.replace(/^(=+)[ \t]*([^=\s](?:.*?[^=\s])?)[ \t]*\1[ \t]*$/gm, (_match, equals, heading) => `${equals} ${heading}`);
    // Emphasis uses runs of apostrophes. Convert the longest run first so the
    // shorter patterns do not consume part of a longer one. A single
    // apostrophe is not markup and is left untouched (don't, it's, ...).
    asciidoc = asciidoc.replace(/'''''(.+?)'''''/g, '*_$1_*'); // bold italic
    asciidoc = asciidoc.replace(/'''(.+?)'''/g, '*$1*'); // bold
    asciidoc = asciidoc.replace(/''(.+?)''/g, '_$1_'); // italic
    // External links: [URL text] or [URL] -> link:URL[text]
    asciidoc = asciidoc.replace(/\[(https?:\/\/[^\s\]]+)\s+([^\]]+)\]/g, 'link:$1[$2]');
    asciidoc = asciidoc.replace(/\[(https?:\/\/[^\s\]]+)\]/g, 'link:$1[$1]');
    // Horizontal rules. Done after the emphasis passes so the generated '''
    // lines are not mistaken for bold markers.
    asciidoc = asciidoc.replace(/^----+$/gm, "'''");
    return asciidoc;
}
/**
* Converts Markdown to AsciiDoc format
* Based on jumble's conversion patterns
*/
function convertMarkdownToAsciidoc(content) {
let asciidoc = content.replace(/\\n/g, '\n');
// Fix spacing issues (but be careful not to break links and images)
// Process these BEFORE converting links/images to avoid conflicts
asciidoc = asciidoc.replace(/`([^`\n]+)`\s*\(([^)]+)\)/g, '`$1` ($2)');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`([a-zA-Z0-9])/g, '$1 `$2` $3');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`\s*\(/g, '$1 `$2` (');
asciidoc = asciidoc.replace(/\)`([^`\n]+)`([a-zA-Z0-9])/g, ') `$1` $2');
asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2');
// Add space before == but not if it's part of a markdown link pattern
// Check that == is not immediately after ]( which would be a link
asciidoc = asciidoc.replace(/([a-zA-Z0-9])(?<!\]\()==/g, '$1 ==');
// Note: nostr: addresses are processed later in processNostrAddresses
// Convert headers
asciidoc = asciidoc.replace(/^#{6}\s+(.+)$/gm, '====== $1 ======');
asciidoc = asciidoc.replace(/^#{5}\s+(.+)$/gm, '===== $1 =====');
asciidoc = asciidoc.replace(/^#{4}\s+(.+)$/gm, '==== $1 ====');
asciidoc = asciidoc.replace(/^#{3}\s+(.+)$/gm, '=== $1 ===');
asciidoc = asciidoc.replace(/^#{2}\s+(.+)$/gm, '== $1 ==');
asciidoc = asciidoc.replace(/^#{1}\s+(.+)$/gm, '= $1 =');
asciidoc = asciidoc.replace(/^==\s+(.+?)\s+==$/gm, '== $1 ==');
asciidoc = asciidoc.replace(/\s==\s+([^=]+?)\s+==\s/g, ' == $1 == ');
// Convert emphasis
asciidoc = asciidoc.replace(/\*\*(.+?)\*\*/g, '*$1*'); // Bold
asciidoc = asciidoc.replace(/__(.+?)__/g, '*$1*'); // Bold
asciidoc = asciidoc.replace(/\*(.+?)\*/g, '_$1_'); // Italic
asciidoc = asciidoc.replace(/_(.+?)_/g, '_$1_'); // Italic
asciidoc = asciidoc.replace(/~~(.+?)~~/g, '[line-through]#$1#'); // Strikethrough
asciidoc = asciidoc.replace(/==(.+?)==/g, '[highlight]#$1#'); // Text highlighting (GFM)
asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript
asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript
// Convert emoji shortcodes to Unicode (e.g., :tent: -> 🏕)
// Only convert if node-emoji is available
if (emoji && emoji.emojify) {
asciidoc = emoji.emojify(asciidoc);
}
// Convert code blocks (handle both \n and \r\n line endings)
// Special handling for diagram languages: latex, plantuml, puml, bpmn
asciidoc = asciidoc.replace(/```(\w+)?\r?\n([\s\S]*?)\r?\n```/g, (_match, lang, code) => {
const trimmedCode = code.trim();
if (trimmedCode.length === 0)
return '';
const langLower = lang ? lang.toLowerCase() : '';
// If it's a latex code block, always treat as code (not math)
if (langLower === 'latex') {
return `[source,latex]\n----\n${trimmedCode}\n----`;
}
// Handle PlantUML diagrams
if (langLower === 'plantuml' || langLower === 'puml') {
// Check if it already has @startuml/@enduml or @startbpmn/@endbpmn
if (trimmedCode.includes('@start') || trimmedCode.includes('@end')) {
return `[plantuml]\n----\n${trimmedCode}\n----`;
}
// If not, wrap it in @startuml/@enduml
return `[plantuml]\n----\n@startuml\n${trimmedCode}\n@enduml\n----`;
}
// Handle BPMN diagrams (using PlantUML BPMN syntax)
if (langLower === 'bpmn') {
// Check if it already has @startbpmn/@endbpmn
if (trimmedCode.includes('@startbpmn') && trimmedCode.includes('@endbpmn')) {
return `[plantuml]\n----\n${trimmedCode}\n----`;
}
// If not, wrap it in @startbpmn/@endbpmn
return `[plantuml]\n----\n@startbpmn\n${trimmedCode}\n@endbpmn\n----`;
}
// Check if it's ABC notation (starts with X:)
if (!lang && /^X:\s*\d+/m.test(trimmedCode)) {
// ABC notation - keep as plain text block, will be processed by music processor
return `----\n${trimmedCode}\n----`;
}
const hasCodePatterns = /[{}();=<>]|function|class|import|export|def |if |for |while |return |const |let |var |public |private |static |console\.log/.test(trimmedCode);
const isLikelyText = /^[A-Za-z\s.,!?\-'"]+$/.test(trimmedCode) && trimmedCode.length > 50;
const hasTooManySpaces = (trimmedCode.match(/\s{3,}/g) || []).length > 3;
const hasMarkdownPatterns = /^#{1,6}\s|^\*\s|^\d+\.\s|^\>\s|^\|.*\|/.test(trimmedCode);
if ((!hasCodePatterns && trimmedCode.length > 100) || isLikelyText || hasTooManySpaces || hasMarkdownPatterns) {
return _match;
}
return `[source${lang ? ',' + lang : ''}]\n----\n${trimmedCode}\n----`;
});
// Handle inline code: LaTeX formulas in inline code should be rendered as math
// Pattern: `$formula$` should become $formula$ (math), not code
// Handle escaped brackets: `$[ ... \]$` and `$[\sqrt{...}\]$`
asciidoc = asciidoc.replace(/`(\$[^`]+\$)`/g, (match, formula) => {
// Extract the formula (remove the $ signs)
const mathContent = formula.slice(1, -1);
return `$${mathContent}$`; // Return as math, not code
});
asciidoc = asciidoc.replace(/`([^`]+)`/g, '`$1`'); // Regular inline code
// Convert nested image links first: [![alt](img)](url) - image wrapped in link
// This must come before regular image processing
asciidoc = asciidoc.replace(/\[!\[([^\]]*)\]\(([^)]+?)\)\]\(([^)]+?)\)/g, (match, alt, imgUrl, linkUrl) => {
const cleanImgUrl = imgUrl.trim();
const cleanLinkUrl = linkUrl.trim();
const cleanAlt = alt.trim();
// Check if linkUrl is a media URL
if (cleanLinkUrl.startsWith('MEDIA:')) {
return cleanLinkUrl; // Return the placeholder as-is
}
// Create a link with an image inside - don't escape brackets in URLs
// AsciiDoc can handle URLs with brackets if they're in the URL part
return `link:${cleanLinkUrl}[image:${cleanImgUrl}[${cleanAlt ? cleanAlt : 'link'}]]`;
});
// Convert images (but not nested ones, which we already processed)
// Match: ![alt text](url) or ![](url) - handle empty alt text
// Use negative lookbehind to avoid matching nested image links
// Format: image::url[alt,width=100%] - matching jumble's format
asciidoc = asciidoc.replace(/(?<!\[)!\[([^\]]*)\]\(([^)]+?)\)/g, (match, alt, url) => {
let processedUrl = url.trim();
const cleanAlt = alt.trim();
// Check if it's already a MEDIA: placeholder (processed by processMediaUrlsInMarkdown)
if (processedUrl.startsWith('MEDIA:')) {
return processedUrl; // Return the placeholder as-is
}
// Clean URL (remove tracking parameters)
processedUrl = cleanUrl(processedUrl);
// Regular image - match jumble's format: image::url[alt,width=100%]
// Don't escape brackets - AsciiDoc handles URLs properly
return `image::${processedUrl}[${cleanAlt ? cleanAlt + ',' : ''}width=100%]`;
});
// Convert anchor links: [text](#section-id) - these are internal links
asciidoc = asciidoc.replace(/(?<!!)\[([^\]]+)\]\(#([^)]+)\)/g, (match, text, anchor) => {
const cleanText = text.trim();
const cleanAnchor = anchor.trim();
// AsciiDoc uses # for anchor links, but we need to normalize the anchor ID
// Convert to lowercase and replace spaces/special chars with hyphens
const normalizedAnchor = cleanAnchor.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '');
const escapedText = cleanText.replace(/([\[\]])/g, '\\$1');
return `<<${normalizedAnchor},${escapedText}>>`;
});
// Convert links (but not images or anchor links, which we already processed)
// Match: [text](url) - use negative lookbehind to avoid matching images
// Use non-greedy matching for URL to stop at first closing paren
// This ensures we don't capture trailing punctuation
asciidoc = asciidoc.replace(/(?<!!)\[([^\]]+)\]\(([^)]+?)\)/g, (match, text, url) => {
let processedUrl = url.trim();
const cleanText = text.trim();
// Check if it's already a MEDIA: placeholder (processed by processMediaUrlsInMarkdown)
if (processedUrl.startsWith('MEDIA:')) {
return processedUrl; // Return the placeholder as-is
}
// Clean URL (remove tracking parameters)
processedUrl = cleanUrl(processedUrl);
// Handle WSS URLs: convert wss:// to https:// for display
if (processedUrl.startsWith('wss://')) {
processedUrl = processedUrl.replace(/^wss:\/\//, 'https://');
}
// Regular link - don't escape brackets in URLs (AsciiDoc handles them)
// Only escape brackets in the link text if needed
const escapedText = cleanText.replace(/([\[\]])/g, '\\$1');
return `link:${processedUrl}[${escapedText}]`;
});
// Convert horizontal rules
asciidoc = asciidoc.replace(/^---$/gm, '\'\'\'');
asciidoc = asciidoc.replace(/^\*\*\*$/gm, '\'\'\''); // Also handle ***
// Convert lists - need to process them as blocks to preserve structure
// First, convert task lists (before regular lists)
// Task lists: - [x] or - [ ] or * [x] or * [ ]
asciidoc = asciidoc.replace(/^(\s*)([-*])\s+\[([ x])\]\s+(.+)$/gm, (_match, indent, bullet, checked, text) => {
// Use AsciiDoc checkbox syntax: * [x] Task text
// The checkbox will be rendered by AsciiDoctor
return `${indent}* [${checked === 'x' ? 'x' : ' '}] ${text}`;
});
// Convert lists - process entire list blocks to ensure proper AsciiDoc formatting
// AsciiDoc lists need to be on their own lines with proper spacing
// Process lists in blocks to handle nested lists correctly
const lines = asciidoc.split('\n');
const processedLines = [];
let inList = false;
let listType = null;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const isEmpty = line.trim() === '';
const prevLine = i > 0 ? processedLines[processedLines.length - 1] : '';
const prevLineIsEmpty = prevLine.trim() === '';
// Check if this line is a list item (but not a task list, which we already processed)
const unorderedMatch = line.match(/^(\s*)([-*+])\s+(.+)$/);
const orderedMatch = line.match(/^(\s*)(\d+)\.\s+(.+)$/);
const isTaskList = line.match(/^(\s*)([-*])\s+\[([ x])\]\s+(.+)$/);
if (unorderedMatch && !isTaskList) {
const [, indent, , text] = unorderedMatch;
const indentLevel = indent.length;
// AsciiDoc uses 4 spaces per indentation level
// Markdown typically uses 2 or 4 spaces per level
// 2 spaces = 1 level (4 spaces), 4 spaces = 1 level (4 spaces)
const asciidocIndent = ' '.repeat(Math.ceil(indentLevel / 4));
// Add blank line before list if not already in a list
// But don't add blank line if we're switching list types within the same list context
if (!inList) {
// Starting a new list - add blank line if previous line has content
if (processedLines.length > 0 && !prevLineIsEmpty) {
processedLines.push('');
}
inList = true;
listType = 'unordered';
}
else if (listType !== 'unordered') {
// Switching list types - don't add blank line, just change type
listType = 'unordered';
}
processedLines.push(`${asciidocIndent}* ${text}`);
}
else if (orderedMatch) {
const [, indent, , text] = orderedMatch;
const indentLevel = indent.length;
// AsciiDoc uses 4 spaces per indentation level
// Markdown typically uses 2 or 4 spaces per level
// 2 spaces = 1 level (4 spaces), 4 spaces = 1 level (4 spaces)
const asciidocIndent = ' '.repeat(Math.ceil(indentLevel / 4));
// Add blank line before list if not already in a list
// But don't add blank line if we're switching list types within the same list context
if (!inList) {
// Starting a new list - add blank line if previous line has content
if (processedLines.length > 0 && !prevLineIsEmpty) {
processedLines.push('');
}
inList = true;
listType = 'ordered';
}
else if (listType !== 'ordered') {
// Switching list types - don't add blank line, just change type
listType = 'ordered';
}
processedLines.push(`${asciidocIndent}. ${text}`);
}
else {
// Not a list item
if (inList && !isEmpty) {
// End of list - add blank line after if the next line is not empty
if (i < lines.length - 1 && lines[i + 1].trim() !== '') {
processedLines.push('');
}
inList = false;
listType = null;
}
processedLines.push(line);
}
}
asciidoc = processedLines.join('\n');
// Convert blockquotes with attribution
asciidoc = asciidoc.replace(/^(>\s+.+(?:\n>\s+.+)*)/gm, (match) => {
const lines = match.split('\n').map(line => line.replace(/^>\s*/, ''));
let quoteBodyLines = [];
let attributionLine;
for (let i = lines.length - 1; i >= 0; i--) {
const line = lines[i].trim();
if (line.startsWith('—') || line.startsWith('--')) {
attributionLine = line;
quoteBodyLines = lines.slice(0, i);
break;
}
}
const quoteContent = quoteBodyLines.filter(l => l.trim() !== '').join('\n').trim();
if (attributionLine) {
let cleanedAttribution = attributionLine.replace(/^[—-]+/, '').trim();
let author = '';
let source = '';
const linkMatch = cleanedAttribution.match(/^(.*?),?\s*link:([^[\\]]+)\[([^\\]]+)\]$/);
if (linkMatch) {
author = linkMatch[1].trim();
source = `link:${linkMatch[2].trim()}[${linkMatch[3].trim()}]`;
}
else {
const parts = cleanedAttribution.split(',').map(p => p.trim());
author = parts[0];
if (parts.length > 1) {
source = parts.slice(1).join(', ').trim();
}
}
return `[quote, ${author}, ${source}]\n____\n${quoteContent}\n____`;
}
else {
return `____\n${quoteContent}\n____`;
}
});
// Convert tables with alignment support
asciidoc = asciidoc.replace(/(\|.*\|[\r\n]+\|[\s\-\|:]*[\r\n]+(\|.*\|[\r\n]+)*)/g, (match) => {
const lines = match.trim().split('\n').filter(line => line.trim());
if (lines.length < 2)
return match;
const headerRow = lines[0];
const separatorRow = lines[1];
const dataRows = lines.slice(2);
if (!separatorRow.includes('-'))
return match;
// Parse alignment from separator row
// :--- = left, :----: = center, ---: = right, --- = default
const cells = separatorRow.split('|').filter(c => c.trim());
const alignments = [];
cells.forEach((cell, index) => {
const trimmed = cell.trim();
if (trimmed.startsWith(':') && trimmed.endsWith(':')) {
alignments[index] = '^'; // center (AsciiDoc uses ^ for center)
}
else if (trimmed.endsWith(':')) {
alignments[index] = '>'; // right
}
else if (trimmed.startsWith(':')) {
alignments[index] = '<'; // left (explicit)
}
else {
alignments[index] = '<'; // default left
}
});
// Build cols attribute with alignments
const colsAttr = alignments.length > 0
? `[cols="${alignments.join(',')}"]`
: '';
let tableAsciidoc = colsAttr ? `${colsAttr}\n` : '';
tableAsciidoc += '|===\n';
tableAsciidoc += headerRow + '\n';
dataRows.forEach(row => {
tableAsciidoc += row + '\n';
});
tableAsciidoc += '|===';
return tableAsciidoc;
});
// Convert footnotes
const footnoteDefinitions = {};
let tempAsciidoc = asciidoc;
tempAsciidoc = tempAsciidoc.replace(/^\[\^([^\]]+)\]:\s*([\s\S]*?)(?=\n\[\^|\n---|\n##|\n###|\n####|\n#####|\n######|$)/gm, (_, id, text) => {
footnoteDefinitions[id] = text.trim();
return '';
});
asciidoc = tempAsciidoc.replace(/\[\^([^\]]+)\]/g, (match, id) => {
if (footnoteDefinitions[id]) {
return `footnote:[${footnoteDefinitions[id]}]`;
}
return match;
});
return asciidoc;
}
/**
 * Converts plain text to AsciiDoc while keeping the author's line breaks.
 *
 * - CRLF line endings are normalized to LF.
 * - Runs of two or more newlines collapse to exactly one blank line
 *   (a paragraph break).
 * - Every remaining single newline that sits between two non-newline
 *   characters becomes an AsciiDoc hard line break (" +" before the newline).
 *
 * @param content Plain-text input.
 * @returns The text with hard line breaks marked for AsciiDoc.
 */
function convertPlainTextToAsciidoc(content) {
    return content
        .replace(/\r\n/g, '\n') // Normalize line endings
        .replace(/\n\n+/g, '\n\n') // Normalize multiple newlines to double
        // Lookarounds keep the neighbouring characters out of the match. A
        // consuming pattern such as ([^\n])\n([^\n]) swallows the character after
        // the newline, so consecutive one-character lines ("a\nb\nc") would lose
        // every second line break.
        .replace(/(?<=[^\n])\n(?=[^\n])/g, ' +\n');
}
/**
 * Builds a d-tag style slug from arbitrary text.
 *
 * The text is lower-cased, every run of characters outside a-z / 0-9 is
 * treated as a separator, and the remaining pieces are joined with single
 * hyphens. The result therefore never starts or ends with a hyphen and is
 * empty when the input contains no a-z / 0-9 characters.
 */
function normalizeDtag(text) {
    const pieces = text.toLowerCase().split(/[^a-z0-9]+/);
    // Leading/trailing separators leave empty pieces at the edges; drop them.
    const nonEmpty = pieces.filter((piece) => piece !== '');
    return nonEmpty.join('-');
}
/**
 * Replaces wikilinks with bracket-free placeholders.
 *
 * - [[book::content]]   becomes BOOKSTR:content (content trimmed)
 * - [[Target]]          becomes WIKILINK:<d-tag>|Target
 * - [[Target|Label]]    becomes WIKILINK:<d-tag>|Label
 *
 * The d-tag is produced by normalizeDtag from the trimmed target. Book macros
 * are handled first so the generic wikilink pattern never sees them.
 *
 * @param content Text that may contain wikilinks.
 * @param linkBaseURL Not read by this function; kept for the existing signature.
 * @returns The text with every wikilink replaced by its placeholder.
 */
function processWikilinks(content, linkBaseURL) {
    const bookMacro = /\[\[book::([^\]]+)\]\]/g;
    const withBooks = content.replace(bookMacro, (_whole, inner) => 'BOOKSTR:' + inner.trim());
    const wikilink = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g;
    return withBooks.replace(wikilink, (_whole, rawTarget, rawLabel) => {
        const target = rawTarget.trim();
        // Without an explicit label the trimmed target doubles as display text.
        let label = target;
        if (rawLabel) {
            label = rawLabel.trim();
        }
        return 'WIKILINK:' + normalizeDtag(target) + '|' + label;
    });
}
/**
 * Wraps "nostr:"-prefixed bech32 identifiers in an AsciiDoc link macro.
 *
 * An identifier is one of npub / nprofile / nevent / naddr / note, followed by
 * the separator "1" and at least six letters or digits (matched
 * case-insensitively). Identifiers without the "nostr:" prefix are not touched.
 * Output form: link:nostr:<id>[<id>]
 *
 * @param content Text that may contain nostr: addresses.
 * @param linkBaseURL Not read by this function; kept for the existing signature.
 * @returns The text with every prefixed address turned into a link macro.
 */
function processNostrAddresses(content, linkBaseURL) {
    const prefixedAddress = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
    // $1 is the identifier without the "nostr:" prefix; it is alphanumeric only,
    // so it is safe to use inside a replacement pattern.
    return content.replace(prefixedAddress, 'link:nostr:$1[$1]');
}
/**
 * Replaces markdown links/images that point at media with MEDIA: placeholders.
 *
 * Applied in this order (a later rule only sees what earlier rules left):
 *   1. [text](youtube-url)                    -> MEDIA:youtube:<11-char id>
 *   2. [text](spotify-url)                    -> MEDIA:spotify:<type>:<id>
 *   3. [text](video-url) / ![alt](video-url)  -> MEDIA:video:<url>
 *   4. [text](audio-url) / ![alt](audio-url)  -> MEDIA:audio:<url>
 *
 * For video and audio files everything from the first "?" on is dropped from
 * the URL. Because the video rule runs first, ".ogg" targets are reported as
 * video. The link text / alt text is discarded in every case.
 */
function processMediaUrlsInMarkdown(content) {
    const youtubeLink = /\[([^\]]+)\]\((?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:watch\?v=|embed\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?\)/gi;
    const spotifyLink = /\[([^\]]+)\]\((?:https?:\/\/)?(?:open\.)?spotify\.com\/(track|album|playlist|artist|episode|show)\/([a-zA-Z0-9]+)(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?\)/gi;
    const videoFile = /[!]?\[([^\]]*)\]\((https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv))(?:\?[^\s<>"{}|\\^`\[\]()]*)?\)/gi;
    const audioFile = /[!]?\[([^\]]*)\]\((https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(mp3|m4a|ogg|wav|flac|aac|opus|wma))(?:\?[^\s<>"{}|\\^`\[\]()]*)?\)/gi;
    // Cuts a URL at its first "?" so the placeholder carries no query string.
    const dropQuery = (url) => url.replace(/\?.*$/, '');
    return content
        // Group 2 is the video id; group 1 (link text) is intentionally unused.
        .replace(youtubeLink, 'MEDIA:youtube:$2')
        // Groups 2 and 3 are the Spotify resource type and id.
        .replace(spotifyLink, 'MEDIA:spotify:$2:$3')
        .replace(videoFile, (_whole, _label, url) => 'MEDIA:video:' + dropQuery(url))
        .replace(audioFile, (_whole, _label, url) => 'MEDIA:audio:' + dropQuery(url));
}
/**
 * Replaces bare media URLs with MEDIA: placeholders.
 *
 * Applied in this order (a later rule only sees what earlier rules left):
 *   1. YouTube (youtube.com/watch?v=, /embed/, /v/, youtu.be/), scheme and
 *      "www." optional                        -> MEDIA:youtube:<11-char id>
 *   2. Spotify track/album/playlist/artist/episode/show, scheme and "open."
 *      optional                               -> MEDIA:spotify:<type>:<id>
 *   3. http(s) URLs with a video extension    -> MEDIA:video:<url>
 *   4. http(s) URLs with an audio extension   -> MEDIA:audio:<url>
 *
 * For video and audio files everything from the first "?" on is dropped.
 * Because the video rule runs first, ".ogg" URLs are reported as video.
 */
function processMediaUrls(content) {
    const youtubeUrl = /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:watch\?v=|embed\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?/gi;
    const spotifyUrl = /(?:https?:\/\/)?(?:open\.)?spotify\.com\/(track|album|playlist|artist|episode|show)\/([a-zA-Z0-9]+)(?:[?&][^?\s<>"{}|\\^`\[\]()]*)?/gi;
    const videoUrl = /(?:https?:\/\/[^\s<>"{}|\\^`\[\]()]+)\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv)(?:\?[^\s<>"{}|\\^`\[\]()]*)?/gi;
    const audioUrl = /(?:https?:\/\/[^\s<>"{}|\\^`\[\]()]+)\.(mp3|m4a|ogg|wav|flac|aac|opus|wma)(?:\?[^\s<>"{}|\\^`\[\]()]*)?/gi;
    // Cuts the matched URL at its first "?" for a cleaner placeholder.
    const dropQuery = (whole) => whole.replace(/\?.*$/, '');
    return content
        .replace(youtubeUrl, 'MEDIA:youtube:$1')
        .replace(spotifyUrl, 'MEDIA:spotify:$1:$2')
        // The file rules use the whole match, not a capture group, as the URL.
        .replace(videoUrl, (whole) => 'MEDIA:video:' + dropQuery(whole))
        .replace(audioUrl, (whole) => 'MEDIA:audio:' + dropQuery(whole));
}
/**
 * Turns bare URLs into AsciiDoc macros.
 *
 * - http(s) URLs ending in an image extension (jpg/jpeg, png, gif, webp, svg,
 *   bmp, ico) become block images: image::url[width=100%]. A query string
 *   after the extension is matched but not carried into the output.
 * - Other http://, https://, wss:// and www. URLs become link:url[text]. The
 *   target gets "https://" prepended for www. and wss:// rewritten to
 *   https://; the visible text keeps the URL exactly as written.
 * - URLs directly preceded by "](" (a markdown link/image target) are skipped;
 *   plain URLs directly preceded by a colon plus whitespace are skipped too.
 * - URLs that already follow "image::", "image:" or "link:" are left untouched,
 *   so images produced by the first pass are not wrapped in a link again.
 * - "----" delimited blocks (with or without a [source...] line) and backtick
 *   inline code are swapped for placeholders first and restored verbatim.
 *
 * Targets are passed through cleanUrl before they are emitted.
 *
 * @param content AsciiDoc/markdown text that may contain bare URLs.
 * @returns The text with bare URLs converted to image/link macros.
 */
function processBareUrls(content) {
    const codeBlockPlaceholders = [];
    const inlineCodePlaceholders = [];
    // Stores a protected snippet and returns the token that stands in for it.
    const stashCodeBlock = (snippet) => {
        const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
        codeBlockPlaceholders.push(snippet);
        return placeholder;
    };
    // True when the text right before `offset` is an AsciiDoc macro prefix,
    // i.e. the URL starting at `offset` is already a macro target. Only the
    // last 7 characters are inspected ("image::" is the longest prefix).
    const followsMacroPrefix = (text, offset) => /(?:image::?|link:)$/.test(text.slice(Math.max(0, offset - 7), offset));
    // Replace code blocks with placeholders
    content = content.replace(/\[source[^\]]*\]\n----\n([\s\S]*?)\n----/g, (match) => stashCodeBlock(match));
    // Also handle plain code blocks (without [source])
    content = content.replace(/----\n([\s\S]*?)\n----/g, (match) => {
        // A span that contains an earlier placeholder is left as it is.
        if (match.includes('__CODEBLOCK_')) {
            return match;
        }
        return stashCodeBlock(match);
    });
    // Replace inline code with placeholders
    content = content.replace(/`([^`]+)`/g, (match) => {
        const placeholder = `__INLINECODE_${inlineCodePlaceholders.length}__`;
        inlineCodePlaceholders.push(match);
        return placeholder;
    });
    // First, handle bare image URLs (before regular URLs)
    // Format: image::url[width=100%] - matching jumble's format
    const imageUrlPattern = /(?<!\]\()\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(jpe?g|png|gif|webp|svg|bmp|ico))(?:\?[^\s<>"{}|\\^`\[\]()]*)?/gi;
    content = content.replace(imageUrlPattern, (match, url, _ext, offset, whole) => {
        // Already the target of an image/link macro: keep it verbatim.
        if (followsMacroPrefix(whole, offset)) {
            return match;
        }
        // Clean URL (remove tracking parameters)
        const cleanedUrl = cleanUrl(url);
        // Don't escape brackets - AsciiDoc handles URLs properly
        return `image::${cleanedUrl}[width=100%]`;
    });
    // Match URLs that aren't already in markdown link format
    // Don't match if immediately after colon-space (like "hyperlink: www.example.com")
    const urlPattern = /(?<!\]\()(?<!:\s)\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+|wss:\/\/[^\s<>"{}|\\^`\[\]()]+|www\.[^\s<>"{}|\\^`\[\]()]+)/gi;
    content = content.replace(urlPattern, (match, url, offset, whole) => {
        // Skip URLs that are already macro targets, including the images created
        // just above. The match itself never contains the "image::" prefix, so the
        // check has to look at the text in front of it. Returning the whole match
        // also consumes it, which keeps the "www." alternative from matching
        // inside the skipped URL.
        if (followsMacroPrefix(whole, offset)) {
            return match;
        }
        // Ensure URL starts with http:// or https://
        let fullUrl = url;
        if (url.startsWith('www.')) {
            fullUrl = 'https://' + url;
        }
        else if (url.startsWith('wss://')) {
            // Convert wss:// to https:// for display
            fullUrl = url.replace(/^wss:\/\//, 'https://');
        }
        // Clean URL (remove tracking parameters)
        fullUrl = cleanUrl(fullUrl);
        // Use proper AsciiDoc link syntax: link:url[text]
        return `link:${fullUrl}[${url}]`;
    });
    // Restore inline code, then code blocks. A replacer function is used so that
    // "$" sequences inside the protected code ($$, $&, $1, ...) are inserted
    // literally instead of being interpreted as replacement patterns.
    inlineCodePlaceholders.forEach((code, index) => {
        content = content.replace(`__INLINECODE_${index}__`, () => code);
    });
    codeBlockPlaceholders.forEach((code, index) => {
        content = content.replace(`__CODEBLOCK_${index}__`, () => code);
    });
    return content;
}
/**
 * Rewrites hashtags as hashtag:<lowercased tag>[#<tag as written>].
 *
 * A hashtag is "#" followed by letters, digits or underscores. It is only
 * recognised at the very start of the input, after any whitespace character
 * (which includes line breaks), or after ">". The character in front of the
 * "#" is kept in the output, so surrounding spacing is unchanged.
 */
function processHashtags(content) {
    const hashtagPattern = /(^|\s|>)#([a-zA-Z0-9_]+)(?![a-zA-Z0-9_])/g;
    return content.replace(hashtagPattern, (_whole, lead, tag) => {
        // `lead` is '' at the start of the input, otherwise the single
        // whitespace or ">" character that preceded the "#".
        return lead + 'hashtag:' + tag.toLowerCase() + '[#' + tag + ']';
    });
}

423
src/converters/to-asciidoc.ts

@ -1,5 +1,89 @@ @@ -1,5 +1,89 @@
import { ContentFormat } from '../types';
// Optional dependency: node-emoji is loaded with require() inside try/catch so
// that a missing package does not break module loading.
// `emoji` holds the loaded module, or null when it could not be loaded; code
// that converts emoji shortcodes must check it before calling into it.
let emoji: any;
try {
emoji = require('node-emoji');
} catch (e) {
// node-emoji not available, emoji conversion will be skipped
emoji = null;
}
/**
 * Strips tracking parameters from a URL (modelled on jumble's cleanUrl).
 *
 * Removes every query parameter whose name is in the list below, plus any
 * parameter whose name starts with "utm_" or "_". All other parameters stay.
 *
 * @param url The URL to clean.
 * @returns The cleaned URL as serialized by the URL object (so it can differ
 *   textually from the input, e.g. a bare origin gains a trailing "/"), or the
 *   input unchanged when it cannot be parsed as an absolute URL.
 */
function cleanUrl(url: string): string {
  // Exact parameter names that are always removed, grouped by origin.
  const exactNames = [
    // Google Analytics & Ads
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
    'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
    // Facebook
    'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
    // Twitter/X
    'twclid', 'twsrc',
    // Microsoft/Bing
    'msclkid', 'mc_cid', 'mc_eid',
    // Adobe
    'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
    // Mailchimp
    'mc_cid', 'mc_eid',
    // HubSpot
    'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
    // Marketo
    'mkt_tok',
    // YouTube
    'si', 'feature', 'kw', 'pp',
    // Other common tracking
    'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
    'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
    // Mobile app tracking
    'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
    // Amazon
    'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
    // Affiliate tracking
    'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
    // Social media share tracking
    'share', 'shared', 'sharesource'
  ];
  const hasTrackingPrefix = (key: string): boolean =>
    key.startsWith('utm_') || key.startsWith('_');
  try {
    const target = new URL(url);
    const query = target.searchParams;
    // Pass 1: drop the exact names (delete removes every occurrence).
    for (const name of exactNames) {
      query.delete(name);
    }
    // Pass 2: drop prefix matches. The keys are copied into an array first
    // because entries are deleted while walking them.
    for (const key of Array.from(query.keys())) {
      if (hasTrackingPrefix(key)) {
        query.delete(key);
      }
    }
    return target.toString();
  } catch {
    // Not parseable as a URL: hand the input back untouched.
    return url;
  }
}
/**
 * Options bag exported by the converter module.
 * NOTE(review): the code that reads these options is outside this excerpt —
 * confirm the exact semantics against the caller.
 */
export interface ConvertOptions {
/** Optional flag; presumably toggles processing of nostr: addresses — TODO confirm. */
enableNostrAddresses?: boolean;
}
@ -146,14 +230,55 @@ function convertMarkdownToAsciidoc(content: string): string { @@ -146,14 +230,55 @@ function convertMarkdownToAsciidoc(content: string): string {
asciidoc = asciidoc.replace(/\*(.+?)\*/g, '_$1_'); // Italic
asciidoc = asciidoc.replace(/_(.+?)_/g, '_$1_'); // Italic
asciidoc = asciidoc.replace(/~~(.+?)~~/g, '[line-through]#$1#'); // Strikethrough
asciidoc = asciidoc.replace(/==(.+?)==/g, '[highlight]#$1#'); // Text highlighting (GFM)
asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript
asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript
// Convert emoji shortcodes to Unicode (e.g., :tent: -> 🏕)
// Only convert if node-emoji is available
if (emoji && emoji.emojify) {
asciidoc = emoji.emojify(asciidoc);
}
// Convert code blocks (handle both \n and \r\n line endings)
// Special handling for diagram languages: latex, plantuml, puml, bpmn
asciidoc = asciidoc.replace(/```(\w+)?\r?\n([\s\S]*?)\r?\n```/g, (_match, lang, code) => {
const trimmedCode = code.trim();
if (trimmedCode.length === 0) return '';
const langLower = lang ? lang.toLowerCase() : '';
// If it's a latex code block, always treat as code (not math)
if (langLower === 'latex') {
return `[source,latex]\n----\n${trimmedCode}\n----`;
}
// Handle PlantUML diagrams
if (langLower === 'plantuml' || langLower === 'puml') {
// Check if it already has @startuml/@enduml or @startbpmn/@endbpmn
if (trimmedCode.includes('@start') || trimmedCode.includes('@end')) {
return `[plantuml]\n----\n${trimmedCode}\n----`;
}
// If not, wrap it in @startuml/@enduml
return `[plantuml]\n----\n@startuml\n${trimmedCode}\n@enduml\n----`;
}
// Handle BPMN diagrams (using PlantUML BPMN syntax)
if (langLower === 'bpmn') {
// Check if it already has @startbpmn/@endbpmn
if (trimmedCode.includes('@startbpmn') && trimmedCode.includes('@endbpmn')) {
return `[plantuml]\n----\n${trimmedCode}\n----`;
}
// If not, wrap it in @startbpmn/@endbpmn
return `[plantuml]\n----\n@startbpmn\n${trimmedCode}\n@endbpmn\n----`;
}
// Check if it's ABC notation (starts with X:)
if (!lang && /^X:\s*\d+/m.test(trimmedCode)) {
// ABC notation - keep as plain text block, will be processed by music processor
return `----\n${trimmedCode}\n----`;
}
const hasCodePatterns = /[{}();=<>]|function|class|import|export|def |if |for |while |return |const |let |var |public |private |static |console\.log/.test(trimmedCode);
const isLikelyText = /^[A-Za-z\s.,!?\-'"]+$/.test(trimmedCode) && trimmedCode.length > 50;
const hasTooManySpaces = (trimmedCode.match(/\s{3,}/g) || []).length > 3;
@ -165,55 +290,186 @@ function convertMarkdownToAsciidoc(content: string): string { @@ -165,55 +290,186 @@ function convertMarkdownToAsciidoc(content: string): string {
return `[source${lang ? ',' + lang : ''}]\n----\n${trimmedCode}\n----`;
});
asciidoc = asciidoc.replace(/`([^`]+)`/g, '`$1`'); // Inline code
asciidoc = asciidoc.replace(/`\$([^$]+)\$`/g, '`$\\$1\\$$`'); // Preserve LaTeX in code
// Handle inline code: LaTeX formulas in inline code should be rendered as math
// Pattern: `$formula$` should become $formula$ (math), not code
// Handle escaped brackets: `$[ ... \]$` and `$[\sqrt{...}\]$`
asciidoc = asciidoc.replace(/`(\$[^`]+\$)`/g, (match, formula) => {
// Extract the formula (remove the $ signs)
const mathContent = formula.slice(1, -1);
return `$${mathContent}$`; // Return as math, not code
});
asciidoc = asciidoc.replace(/`([^`]+)`/g, '`$1`'); // Regular inline code
// Convert nested image links first: [![alt](img)](url) - image wrapped in link
// This must come before regular image processing
asciidoc = asciidoc.replace(/\[!\[([^\]]*)\]\(([^)]+?)\)\]\(([^)]+?)\)/g, (match, alt, imgUrl, linkUrl) => {
const cleanImgUrl = imgUrl.trim();
const cleanLinkUrl = linkUrl.trim();
const cleanAlt = alt.trim();
// Check if linkUrl is a media URL
if (cleanLinkUrl.startsWith('MEDIA:')) {
return cleanLinkUrl; // Return the placeholder as-is
}
// Create a link with an image inside - don't escape brackets in URLs
// AsciiDoc can handle URLs with brackets if they're in the URL part
return `link:${cleanLinkUrl}[image:${cleanImgUrl}[${cleanAlt ? cleanAlt : 'link'}]]`;
});
// Convert images first (before links, since images are links with ! prefix)
// Convert images (but not nested ones, which we already processed)
// Match: ![alt text](url) or ![](url) - handle empty alt text
// Use non-greedy matching to stop at first closing paren
asciidoc = asciidoc.replace(/!\[([^\]]*)\]\(([^)]+?)\)/g, (match, alt, url) => {
const cleanUrl = url.trim();
// Use negative lookbehind to avoid matching nested image links
// Format: image::url[alt,width=100%] - matching jumble's format
asciidoc = asciidoc.replace(/(?<!\[)!\[([^\]]*)\]\(([^)]+?)\)/g, (match, alt, url) => {
let processedUrl = url.trim();
const cleanAlt = alt.trim();
// Check if it's already a MEDIA: placeholder (processed by processMediaUrlsInMarkdown)
if (cleanUrl.startsWith('MEDIA:')) {
return cleanUrl; // Return the placeholder as-is
if (processedUrl.startsWith('MEDIA:')) {
return processedUrl; // Return the placeholder as-is
}
// Regular image - escape special characters in URL for AsciiDoc
const escapedUrl = cleanUrl.replace(/([\[\]])/g, '\\$1');
return `image::${escapedUrl}[${cleanAlt ? cleanAlt + ', ' : ''}width=100%]`;
// Clean URL (remove tracking parameters)
processedUrl = cleanUrl(processedUrl);
// Regular image - match jumble's format: image::url[alt,width=100%]
// Don't escape brackets - AsciiDoc handles URLs properly
return `image::${processedUrl}[${cleanAlt ? cleanAlt + ',' : ''}width=100%]`;
});
// Convert anchor links: [text](#section-id) - these are internal links
asciidoc = asciidoc.replace(/(?<!!)\[([^\]]+)\]\(#([^)]+)\)/g, (match, text, anchor) => {
const cleanText = text.trim();
const cleanAnchor = anchor.trim();
// AsciiDoc uses # for anchor links, but we need to normalize the anchor ID
// Convert to lowercase and replace spaces/special chars with hyphens
const normalizedAnchor = cleanAnchor.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '');
const escapedText = cleanText.replace(/([\[\]])/g, '\\$1');
return `<<${normalizedAnchor},${escapedText}>>`;
});
// Convert links (but not images, which we already processed)
// Convert links (but not images or anchor links, which we already processed)
// Match: [text](url) - use negative lookbehind to avoid matching images
// Use non-greedy matching for URL to stop at first closing paren
// This ensures we don't capture trailing punctuation
asciidoc = asciidoc.replace(/(?<!!)\[([^\]]+)\]\(([^)]+?)\)/g, (match, text, url) => {
const cleanUrl = url.trim();
let processedUrl = url.trim();
const cleanText = text.trim();
// Check if it's already a MEDIA: placeholder (processed by processMediaUrlsInMarkdown)
if (cleanUrl.startsWith('MEDIA:')) {
return cleanUrl; // Return the placeholder as-is
if (processedUrl.startsWith('MEDIA:')) {
return processedUrl; // Return the placeholder as-is
}
// Regular link - escape special AsciiDoc characters in both URL and text
const escapedUrl = cleanUrl.replace(/([\[\]])/g, '\\$1');
// Clean URL (remove tracking parameters)
processedUrl = cleanUrl(processedUrl);
// Handle WSS URLs: convert wss:// to https:// for display
if (processedUrl.startsWith('wss://')) {
processedUrl = processedUrl.replace(/^wss:\/\//, 'https://');
}
// Regular link - don't escape brackets in URLs (AsciiDoc handles them)
// Only escape brackets in the link text if needed
const escapedText = cleanText.replace(/([\[\]])/g, '\\$1');
return `link:${escapedUrl}[${escapedText}]`;
return `link:${processedUrl}[${escapedText}]`;
});
// Convert horizontal rules
asciidoc = asciidoc.replace(/^---$/gm, '\'\'\'');
asciidoc = asciidoc.replace(/^\*\*\*$/gm, '\'\'\''); // Also handle ***
// Convert lists - need to process them as blocks to preserve structure
// First, convert task lists (before regular lists)
// Task lists: - [x] or - [ ] or * [x] or * [ ]
asciidoc = asciidoc.replace(/^(\s*)([-*])\s+\[([ x])\]\s+(.+)$/gm, (_match, indent, bullet, checked, text) => {
// Use AsciiDoc checkbox syntax: * [x] Task text
// The checkbox will be rendered by AsciiDoctor
return `${indent}* [${checked === 'x' ? 'x' : ' '}] ${text}`;
});
// Convert unordered lists
asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2');
asciidoc = asciidoc.replace(/^(\s*)-\s+(.+)$/gm, '$1* $2');
asciidoc = asciidoc.replace(/^(\s*)\+\s+(.+)$/gm, '$1* $2');
// Convert ordered lists
asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2');
// Convert lists - process entire list blocks to ensure proper AsciiDoc formatting
// AsciiDoc lists need to be on their own lines with proper spacing
// Process lists in blocks to handle nested lists correctly
const lines = asciidoc.split('\n');
const processedLines: string[] = [];
let inList = false;
let listType: 'unordered' | 'ordered' | null = null;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const isEmpty = line.trim() === '';
const prevLine = i > 0 ? processedLines[processedLines.length - 1] : '';
const prevLineIsEmpty = prevLine.trim() === '';
// Check if this line is a list item (but not a task list, which we already processed)
const unorderedMatch = line.match(/^(\s*)([-*+])\s+(.+)$/);
const orderedMatch = line.match(/^(\s*)(\d+)\.\s+(.+)$/);
const isTaskList = line.match(/^(\s*)([-*])\s+\[([ x])\]\s+(.+)$/);
if (unorderedMatch && !isTaskList) {
const [, indent, , text] = unorderedMatch;
const indentLevel = indent.length;
// AsciiDoc uses 4 spaces per indentation level
// Markdown typically uses 2 or 4 spaces per level
// 2 spaces = 1 level (4 spaces), 4 spaces = 1 level (4 spaces)
const asciidocIndent = ' '.repeat(Math.ceil(indentLevel / 4));
// Add blank line before list if not already in a list
// But don't add blank line if we're switching list types within the same list context
if (!inList) {
// Starting a new list - add blank line if previous line has content
if (processedLines.length > 0 && !prevLineIsEmpty) {
processedLines.push('');
}
inList = true;
listType = 'unordered';
} else if (listType !== 'unordered') {
// Switching list types - don't add blank line, just change type
listType = 'unordered';
}
processedLines.push(`${asciidocIndent}* ${text}`);
} else if (orderedMatch) {
const [, indent, , text] = orderedMatch;
const indentLevel = indent.length;
// AsciiDoc uses 4 spaces per indentation level
// Markdown typically uses 2 or 4 spaces per level
// 2 spaces = 1 level (4 spaces), 4 spaces = 1 level (4 spaces)
const asciidocIndent = ' '.repeat(Math.ceil(indentLevel / 4));
// Add blank line before list if not already in a list
// But don't add blank line if we're switching list types within the same list context
if (!inList) {
// Starting a new list - add blank line if previous line has content
if (processedLines.length > 0 && !prevLineIsEmpty) {
processedLines.push('');
}
inList = true;
listType = 'ordered';
} else if (listType !== 'ordered') {
// Switching list types - don't add blank line, just change type
listType = 'ordered';
}
processedLines.push(`${asciidocIndent}. ${text}`);
} else {
// Not a list item
if (inList && !isEmpty) {
// End of list - add blank line after if the next line is not empty
if (i < lines.length - 1 && lines[i + 1].trim() !== '') {
processedLines.push('');
}
inList = false;
listType = null;
}
processedLines.push(line);
}
}
asciidoc = processedLines.join('\n');
// Convert blockquotes with attribution
asciidoc = asciidoc.replace(/^(>\s+.+(?:\n>\s+.+)*)/gm, (match) => {
@ -258,8 +514,8 @@ function convertMarkdownToAsciidoc(content: string): string { @@ -258,8 +514,8 @@ function convertMarkdownToAsciidoc(content: string): string {
}
});
// Convert tables
asciidoc = asciidoc.replace(/(\|.*\|[\r\n]+\|[\s\-\|]*[\r\n]+(\|.*\|[\r\n]+)*)/g, (match) => {
// Convert tables with alignment support
asciidoc = asciidoc.replace(/(\|.*\|[\r\n]+\|[\s\-\|:]*[\r\n]+(\|.*\|[\r\n]+)*)/g, (match) => {
const lines = match.trim().split('\n').filter(line => line.trim());
if (lines.length < 2) return match;
@ -269,7 +525,31 @@ function convertMarkdownToAsciidoc(content: string): string { @@ -269,7 +525,31 @@ function convertMarkdownToAsciidoc(content: string): string {
if (!separatorRow.includes('-')) return match;
let tableAsciidoc = '[cols="1,1"]\n|===\n';
// Parse alignment from separator row
// :--- = left, :----: = center, ---: = right, --- = default
const cells = separatorRow.split('|').filter(c => c.trim());
const alignments: string[] = [];
cells.forEach((cell, index) => {
const trimmed = cell.trim();
if (trimmed.startsWith(':') && trimmed.endsWith(':')) {
alignments[index] = '^'; // center (AsciiDoc uses ^ for center)
} else if (trimmed.endsWith(':')) {
alignments[index] = '>'; // right
} else if (trimmed.startsWith(':')) {
alignments[index] = '<'; // left (explicit)
} else {
alignments[index] = '<'; // default left
}
});
// Build cols attribute with alignments
const colsAttr = alignments.length > 0
? `[cols="${alignments.join(',')}"]`
: '';
let tableAsciidoc = colsAttr ? `${colsAttr}\n` : '';
tableAsciidoc += '|===\n';
tableAsciidoc += headerRow + '\n';
dataRows.forEach(row => {
tableAsciidoc += row + '\n';
@ -349,12 +629,14 @@ function processWikilinks(content: string, linkBaseURL: string): string { @@ -349,12 +629,14 @@ function processWikilinks(content: string, linkBaseURL: string): string {
/**
* Processes nostr: addresses
* Only processes addresses with "nostr:" prefix - bare addresses are left as plaintext
* Converts to link:nostr:...[...] format
* Valid bech32 prefixes: npub, nprofile, nevent, naddr, note
*/
function processNostrAddresses(content: string, linkBaseURL: string): string {
// Match nostr: followed by valid bech32 prefix and identifier
// Bech32 format: prefix + separator (1) + data (at least 6 chars for valid identifiers)
// Only match if it has "nostr:" prefix - bare addresses should remain as plaintext
const nostrPattern = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
return content.replace(nostrPattern, (_match, bech32Id) => {
return `link:nostr:${bech32Id}[${bech32Id}]`;
@ -427,26 +709,93 @@ function processMediaUrls(content: string): string { @@ -427,26 +709,93 @@ function processMediaUrls(content: string): string {
/**
* Processes bare URLs and converts them to AsciiDoc links
* Matches http://, https://, and www. URLs that aren't already in markdown links
* Matches http://, https://, wss://, and www. URLs that aren't already in markdown links
* Also handles bare image URLs (converts to images)
* Skips URLs inside code blocks (---- blocks) and inline code (backticks)
*/
function processBareUrls(content: string): string {
// Protect code blocks and inline code from URL processing
// We'll process URLs, then restore code blocks
const codeBlockPlaceholders: string[] = [];
const inlineCodePlaceholders: string[] = [];
// Replace code blocks with placeholders
content = content.replace(/\[source[^\]]*\]\n----\n([\s\S]*?)\n----/g, (match, code) => {
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
codeBlockPlaceholders.push(match);
return placeholder;
});
// Also handle plain code blocks (without [source])
content = content.replace(/----\n([\s\S]*?)\n----/g, (match, code) => {
// Check if this is already a placeholder
if (match.includes('__CODEBLOCK_')) {
return match;
}
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
codeBlockPlaceholders.push(match);
return placeholder;
});
// Replace inline code with placeholders
content = content.replace(/`([^`]+)`/g, (match, code) => {
const placeholder = `__INLINECODE_${inlineCodePlaceholders.length}__`;
inlineCodePlaceholders.push(match);
return placeholder;
});
// First, handle bare image URLs (before regular URLs)
// Match image URLs: .jpg, .png, .gif, .webp, .svg, etc.
// Format: image::url[width=100%] - matching jumble's format
const imageUrlPattern = /(?<!\]\()\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+\.(jpe?g|png|gif|webp|svg|bmp|ico))(?:\?[^\s<>"{}|\\^`\[\]()]*)?/gi;
content = content.replace(imageUrlPattern, (match, url) => {
// Clean URL (remove tracking parameters)
const cleanedUrl = cleanUrl(url);
// Don't escape brackets - AsciiDoc handles URLs properly
return `image::${cleanedUrl}[width=100%]`;
});
// Match URLs that aren't already in markdown link format
// Pattern: http://, https://, or www. followed by valid URL characters
// Use negative lookbehind to avoid matching URLs inside parentheses (markdown links)
// Match URLs that are not preceded by ]( (which would be a markdown link)
const urlPattern = /(?<!\]\()\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+|www\.[^\s<>"{}|\\^`\[\]()]+)/gi;
// Pattern: http://, https://, wss://, or www. followed by valid URL characters
// Use word boundary to avoid matching URLs that are part of other text
// Don't match if immediately after colon-space (like "hyperlink: www.example.com")
const urlPattern = /(?<!\]\()(?<!:\s)\b(https?:\/\/[^\s<>"{}|\\^`\[\]()]+|wss:\/\/[^\s<>"{}|\\^`\[\]()]+|www\.[^\s<>"{}|\\^`\[\]()]+)/gi;
return content.replace(urlPattern, (match, url) => {
content = content.replace(urlPattern, (match, url) => {
// Skip if this URL was already converted to an image
if (match.includes('image::')) {
return match;
}
// Ensure URL starts with http:// or https://
let fullUrl = url;
if (url.startsWith('www.')) {
fullUrl = 'https://' + url;
} else if (url.startsWith('wss://')) {
// Convert wss:// to https:// for display
fullUrl = url.replace(/^wss:\/\//, 'https://');
}
// Escape special AsciiDoc characters
const escapedUrl = fullUrl.replace(/([\[\]])/g, '\\$1');
return `link:${escapedUrl}[${url}]`;
// Clean URL (remove tracking parameters)
fullUrl = cleanUrl(fullUrl);
// Don't escape brackets in URLs - AsciiDoc handles them properly
// The URL is in the link: part, brackets in URLs are valid
// Use proper AsciiDoc link syntax: link:url[text]
return `link:${fullUrl}[${url}]`;
});
// Restore inline code
inlineCodePlaceholders.forEach((code, index) => {
content = content.replace(`__INLINECODE_${index}__`, code);
});
// Restore code blocks
codeBlockPlaceholders.forEach((code, index) => {
content = content.replace(`__CODEBLOCK_${index}__`, code);
});
return content;
}
/**

70
src/detector.js

@ -0,0 +1,70 @@ @@ -0,0 +1,70 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.detectFormat = detectFormat;
const types_1 = require("./types");
/**
 * Detects the content format by scoring format-specific patterns.
 *
 * Three independent scores are computed (AsciiDoc, Wikipedia markup, Markdown)
 * and compared:
 *   1. Wikipedia wins when at least two of its indicators are present.
 *   2. AsciiDoc wins when it has at least two hits and more hits than Markdown.
 *   3. Markdown wins when it has at least one hit.
 *   4. Otherwise the content is treated as plain text.
 *
 * @param content Raw content to classify.
 * @returns One of the `ContentFormat` values.
 */
function detectFormat(content) {
    // AsciiDoc indicators are plain substrings; each one present adds a point.
    const asciidocIndicators = [
        '= ', // Title
        '== ', // Section
        '=== ', // Subsection
        'include::', // Include directive
        'image::', // Image block
        '[source', // Source block
        '----', // Listing block
        '....', // Literal block
        '|===', // Table
        'link:', // AsciiDoc link format
        'wikilink:', // Wikilink macro
        'hashtag:', // Hashtag macro
    ];
    let asciidocScore = 0;
    for (const indicator of asciidocIndicators) {
        if (content.includes(indicator)) {
            asciidocScore++;
        }
    }
    // Wikipedia markup indicators (== Heading == format)
    const wikipediaIndicators = [
        /^==+\s+.+?\s+==+$/m, // Wikipedia headings: == Heading ==
        /\[\[[^\]]+\]\]/, // Wikipedia links: [[Page]]
        /''[^']+''/, // Wikipedia bold: ''text''
        // Wikipedia italic: 'text'. The quotes must not touch a word character
        // and must sit on one line, so apostrophes in ordinary prose
        // ("It's ... that's") are not counted as markup.
        /(?<!\w)'[^'\n]+'(?!\w)/,
    ];
    let wikipediaScore = 0;
    for (const indicator of wikipediaIndicators) {
        if (indicator.test(content)) {
            wikipediaScore++;
        }
    }
    // Markdown indicators (specific patterns to avoid false positives)
    const markdownIndicators = [
        /^#{1,6}\s+/m, // Heading at start of line
        /```[\s\S]*?```/, // Code block
        /\*\*[^*]+\*\*/, // Bold text
        /^[-*+]\s+/m, // List item at start of line
        /!\[[^\]]*\]\([^)]+\)/, // Image syntax
        /\[[^\]]+\]\([^)]+\)/, // Link syntax
    ];
    let markdownScore = 0;
    for (const indicator of markdownIndicators) {
        if (indicator.test(content)) {
            markdownScore++;
        }
    }
    // Wikipedia takes precedence, but only with two independent indicators.
    if (wikipediaScore >= 2) {
        return types_1.ContentFormat.Wikipedia;
    }
    if (asciidocScore > markdownScore && asciidocScore >= 2) {
        return types_1.ContentFormat.AsciiDoc;
    }
    if (markdownScore > 0) {
        return types_1.ContentFormat.Markdown;
    }
    return types_1.ContentFormat.Plain;
}

160
src/extractors/frontmatter.js

@ -0,0 +1,160 @@ @@ -0,0 +1,160 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractFrontmatter = extractFrontmatter;
/**
 * Extracts front matter from content.
 *
 * Two header styles are recognised:
 *  - YAML front matter delimited by `---` lines (simple `key: value` pairs and
 *    `- item` lists only; this is not a full YAML parser).
 *  - An AsciiDoc document header: a level-0 title (`= Title`), optional
 *    author and revision lines, and `:key: value` attribute entries.
 *
 * In both cases the header is removed from the returned content so it does not
 * appear in rendered output.
 *
 * @param content Raw document text.
 * @returns `content` without the header, plus `frontmatter` when at least one
 *          field was extracted (otherwise `frontmatter` is undefined).
 */
function extractFrontmatter(content) {
    // Strips one pair of matching surrounding quotes, if present.
    const unquote = (value) => {
        const doubleQuoted = value.startsWith('"') && value.endsWith('"');
        const singleQuoted = value.startsWith("'") && value.endsWith("'");
        return doubleQuoted || singleQuoted ? value.slice(1, -1) : value;
    };
    // An empty field map is reported as `undefined`.
    const finish = (fields, body) => ({
        frontmatter: Object.keys(fields).length > 0 ? fields : undefined,
        content: body,
    });
    // --- YAML front matter: ---\n...\n---
    const yamlMatch = content.match(/^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/);
    if (yamlMatch) {
        const frontmatter = {};
        // Most recently declared key; list items ("- x") attach to it.
        let currentKey = null;
        for (const rawLine of yamlMatch[1].split('\n')) {
            const trimmed = rawLine.trim();
            // Skip empty lines and comments
            if (!trimmed || trimmed.startsWith('#')) {
                continue;
            }
            // List item
            if (trimmed.startsWith('- ')) {
                const item = trimmed.substring(2).trim().replace(/^["']|["']$/g, '');
                if (currentKey !== null) {
                    const existing = frontmatter[currentKey];
                    if (Array.isArray(existing)) {
                        existing.push(item);
                    }
                    else if (existing === undefined) {
                        frontmatter[currentKey] = [item];
                    }
                    // A key that already holds a scalar is left untouched rather
                    // than being overwritten by a stray list item.
                }
                continue;
            }
            // Key-value pair. The value may be empty ("tags:"), which declares a
            // key whose list items follow on the next lines.
            const keyValueMatch = trimmed.match(/^(\w+):\s*(.*)$/);
            if (keyValueMatch) {
                currentKey = keyValueMatch[1];
                const rawValue = keyValueMatch[2].trim();
                if (rawValue !== '') {
                    frontmatter[currentKey] = unquote(rawValue);
                }
            }
        }
        return finish(frontmatter, yamlMatch[2]);
    }
    // --- AsciiDoc document header: = Title\nAuthor\nRevision\n:attribute: value\n...
    const lines = content.split('\n');
    // Only a level-0 title ("= Title") opens a document header. A leading
    // "== Section" is body content and must stay in the document.
    if (!lines[0] || !/^=\s+/.test(lines[0])) {
        return { content };
    }
    let headerEnd = 1;
    // Author and revision lines: non-empty lines that are not attribute entries.
    while (headerEnd < lines.length && lines[headerEnd].trim() && !lines[headerEnd].trim().startsWith(':')) {
        headerEnd++;
    }
    // Attribute entries (:key: value)
    while (headerEnd < lines.length && lines[headerEnd].trim().startsWith(':')) {
        headerEnd++;
    }
    // The blank line separating header from body belongs to the header.
    if (headerEnd < lines.length && lines[headerEnd].trim() === '') {
        headerEnd++;
    }
    const headerLines = lines.slice(0, headerEnd);
    const body = lines.slice(headerEnd).join('\n');
    const frontmatter = {};
    // Title (trimEnd tolerates a trailing "\r" from CRLF input)
    const titleMatch = headerLines[0].trimEnd().match(/^=\s+(.+)$/);
    if (titleMatch) {
        frontmatter['title'] = titleMatch[1].trim();
    }
    // Author: the line right after the title, unless it is an attribute entry
    // or consists only of digits and punctuation (then it is a revision line).
    let authorIndex = -1;
    if (headerLines.length > 1) {
        const authorLine = headerLines[1].trim();
        if (authorLine && !authorLine.startsWith(':') && !/^[\d.,\s:]+$/.test(authorLine)) {
            frontmatter['author'] = authorLine;
            authorIndex = 1;
        }
    }
    // Revision line: "<version>, <date>[: <remark>]",
    // e.g. "2.9, October 31, 2021: Fall incarnation". The date may contain
    // letters and commas; the remark starts at the first colon.
    const revisionPattern = /^v?([\d.][^,]*),\s*([^:]+)(?::\s*(.+))?$/;
    for (let i = 1; i < headerLines.length; i++) {
        const line = headerLines[i].trim();
        if (i === authorIndex || line.startsWith(':')) {
            continue;
        }
        const revisionMatch = line.match(revisionPattern);
        if (revisionMatch) {
            frontmatter['version'] = revisionMatch[1].trim();
            frontmatter['date'] = revisionMatch[2].trim();
            if (revisionMatch[3]) {
                frontmatter['revision'] = revisionMatch[3].trim();
            }
            break;
        }
    }
    // Attribute entries; these run last, so an explicit :version: or :date:
    // overrides the value taken from the revision line.
    for (const headerLine of headerLines) {
        const attrMatch = headerLine.trim().match(/^:([^:]+):\s*(.+)$/);
        if (!attrMatch) {
            continue;
        }
        const key = attrMatch[1].trim();
        const value = unquote(attrMatch[2].trim());
        // Comma-separated values without spaces (e.g. "a,b,c") become arrays.
        if (value.includes(',') && !value.includes(' ')) {
            frontmatter[key] = value.split(',').map((v) => v.trim());
        }
        else {
            frontmatter[key] = value;
        }
    }
    return finish(frontmatter, body);
}

177
src/extractors/frontmatter.ts

@ -0,0 +1,177 @@ @@ -0,0 +1,177 @@
/**
 * Extracts front matter from content.
 *
 * Two header styles are recognised:
 *  - YAML front matter delimited by `---` lines (simple `key: value` pairs and
 *    `- item` lists only; this is not a full YAML parser).
 *  - An AsciiDoc document header: a level-0 title (`= Title`), optional
 *    author and revision lines, and `:key: value` attribute entries.
 *
 * In both cases the header is removed from the returned content so it does not
 * appear in rendered output.
 *
 * @param content Raw document text.
 * @returns `content` without the header, plus `frontmatter` when at least one
 *          field was extracted (otherwise `frontmatter` is undefined).
 */
export function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } {
  // Strips one pair of matching surrounding quotes, if present.
  const unquote = (value: string): string => {
    const doubleQuoted = value.startsWith('"') && value.endsWith('"');
    const singleQuoted = value.startsWith("'") && value.endsWith("'");
    return doubleQuoted || singleQuoted ? value.slice(1, -1) : value;
  };
  // An empty field map is reported as `undefined`.
  const finish = (fields: Record<string, any>, body: string) => ({
    frontmatter: Object.keys(fields).length > 0 ? fields : undefined,
    content: body,
  });

  // --- YAML front matter: ---\n...\n---
  const yamlMatch = content.match(/^---\s*\n([\s\S]*?)\n---\s*\n?([\s\S]*)$/);
  if (yamlMatch) {
    const frontmatter: Record<string, any> = {};
    // Most recently declared key; list items ("- x") attach to it.
    let currentKey: string | null = null;
    for (const rawLine of yamlMatch[1].split('\n')) {
      const trimmed = rawLine.trim();
      // Skip empty lines and comments
      if (!trimmed || trimmed.startsWith('#')) {
        continue;
      }
      // List item
      if (trimmed.startsWith('- ')) {
        const item = trimmed.substring(2).trim().replace(/^["']|["']$/g, '');
        if (currentKey !== null) {
          const existing = frontmatter[currentKey];
          if (Array.isArray(existing)) {
            existing.push(item);
          } else if (existing === undefined) {
            frontmatter[currentKey] = [item];
          }
          // A key that already holds a scalar is left untouched rather than
          // being overwritten by a stray list item.
        }
        continue;
      }
      // Key-value pair. The value may be empty ("tags:"), which declares a key
      // whose list items follow on the next lines.
      const keyValueMatch = trimmed.match(/^(\w+):\s*(.*)$/);
      if (keyValueMatch) {
        currentKey = keyValueMatch[1];
        const rawValue = keyValueMatch[2].trim();
        if (rawValue !== '') {
          frontmatter[currentKey] = unquote(rawValue);
        }
      }
    }
    return finish(frontmatter, yamlMatch[2]);
  }

  // --- AsciiDoc document header: = Title\nAuthor\nRevision\n:attribute: value\n...
  const lines = content.split('\n');
  // Only a level-0 title ("= Title") opens a document header. A leading
  // "== Section" is body content and must stay in the document.
  if (!lines[0] || !/^=\s+/.test(lines[0])) {
    return { content };
  }
  let headerEnd = 1;
  // Author and revision lines: non-empty lines that are not attribute entries.
  while (headerEnd < lines.length && lines[headerEnd].trim() && !lines[headerEnd].trim().startsWith(':')) {
    headerEnd++;
  }
  // Attribute entries (:key: value)
  while (headerEnd < lines.length && lines[headerEnd].trim().startsWith(':')) {
    headerEnd++;
  }
  // The blank line separating header from body belongs to the header.
  if (headerEnd < lines.length && lines[headerEnd].trim() === '') {
    headerEnd++;
  }
  const headerLines = lines.slice(0, headerEnd);
  const body = lines.slice(headerEnd).join('\n');
  const frontmatter: Record<string, any> = {};

  // Title (trimEnd tolerates a trailing "\r" from CRLF input)
  const titleMatch = headerLines[0].trimEnd().match(/^=\s+(.+)$/);
  if (titleMatch) {
    frontmatter.title = titleMatch[1].trim();
  }

  // Author: the line right after the title, unless it is an attribute entry
  // or consists only of digits and punctuation (then it is a revision line).
  let authorIndex = -1;
  if (headerLines.length > 1) {
    const authorLine = headerLines[1].trim();
    if (authorLine && !authorLine.startsWith(':') && !/^[\d.,\s:]+$/.test(authorLine)) {
      frontmatter.author = authorLine;
      authorIndex = 1;
    }
  }

  // Revision line: "<version>, <date>[: <remark>]",
  // e.g. "2.9, October 31, 2021: Fall incarnation". The date may contain
  // letters and commas; the remark starts at the first colon.
  const revisionPattern = /^v?([\d.][^,]*),\s*([^:]+)(?::\s*(.+))?$/;
  for (let i = 1; i < headerLines.length; i++) {
    const line = headerLines[i].trim();
    if (i === authorIndex || line.startsWith(':')) {
      continue;
    }
    const revisionMatch = line.match(revisionPattern);
    if (revisionMatch) {
      frontmatter.version = revisionMatch[1].trim();
      frontmatter.date = revisionMatch[2].trim();
      if (revisionMatch[3]) {
        frontmatter.revision = revisionMatch[3].trim();
      }
      break;
    }
  }

  // Attribute entries; these run last, so an explicit :version: or :date:
  // overrides the value taken from the revision line.
  for (const headerLine of headerLines) {
    const attrMatch = headerLine.trim().match(/^:([^:]+):\s*(.+)$/);
    if (!attrMatch) {
      continue;
    }
    const key = attrMatch[1].trim();
    const value = unquote(attrMatch[2].trim());
    // Comma-separated values without spaces (e.g. "a,b,c") become arrays.
    if (value.includes(',') && !value.includes(' ')) {
      frontmatter[key] = value.split(',').map((v: string) => v.trim());
    } else {
      frontmatter[key] = value;
    }
  }

  return finish(frontmatter, body);
}

243
src/extractors/metadata.js

@ -0,0 +1,243 @@ @@ -0,0 +1,243 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractMetadata = extractMetadata;
/**
 * Collects every kind of metadata from the content in one pass over the
 * individual extractors, before any conversion takes place.
 *
 * @param content Content to scan.
 * @param linkBaseURL Base URL handed to the link extractor.
 * @returns Object with `nostrLinks`, `wikilinks`, `hashtags`, `links` and `media`.
 */
function extractMetadata(content, linkBaseURL) {
    const nostrLinks = extractNostrLinks(content);
    const wikilinks = extractWikilinks(content);
    const hashtags = extractHashtags(content);
    const links = extractLinks(content, linkBaseURL);
    const media = extractMedia(content);
    return { nostrLinks, wikilinks, hashtags, links, media };
}
/**
 * Finds `nostr:`-prefixed bech32 identifiers (npub, nprofile, nevent, naddr,
 * note) in the content. Each identifier is reported once, in order of first
 * appearance.
 *
 * @param content Content to scan.
 * @returns Entries of the form `{ type, id, text, bech32 }`, where `text` is
 *          the full match including the `nostr:` prefix.
 */
function extractNostrLinks(content) {
    const pattern = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
    const seen = new Set();
    const nostrLinks = [];
    for (const [text, id] of content.matchAll(pattern)) {
        const type = getNostrType(id);
        // Skip identifiers whose type cannot be determined, and repeats.
        if (!type || seen.has(id)) {
            continue;
        }
        seen.add(id);
        nostrLinks.push({ type, id, text, bech32: id });
    }
    return nostrLinks;
}
/**
 * Finds wikilinks written as `[[target]]` or `[[target|display]]`.
 * Links are de-duplicated on the pair (normalized d-tag, display text).
 *
 * @param content Content to scan.
 * @returns Entries of the form `{ dtag, display, original }`.
 */
function extractWikilinks(content) {
    const found = [];
    const seenKeys = new Set();
    for (const hit of content.matchAll(/\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g)) {
        const [original, rawTarget, rawDisplay] = hit;
        const target = rawTarget.trim();
        // Without an explicit display part the target doubles as display text.
        const display = rawDisplay ? rawDisplay.trim() : target;
        const dtag = normalizeDtag(target);
        const key = `${dtag}|${display}`;
        if (seenKeys.has(key)) {
            continue;
        }
        seenKeys.add(key);
        found.push({ dtag, display, original });
    }
    return found;
}
/**
 * Collects hashtags (`#tag`) from the content, lower-cased and de-duplicated
 * in order of first appearance.
 * Fenced code blocks, inline code and http(s) URLs are removed before
 * scanning so that `#` characters inside them are not reported.
 *
 * @param content Content to scan.
 * @returns Lower-cased tag names without the leading `#`.
 */
function extractHashtags(content) {
    // Order matters: fenced blocks first, then inline code, then URLs.
    const ignoredSpans = [
        /```[\s\S]*?```/g,
        /`[^`]+`/g,
        /https?:\/\/[^\s<>"']+/g,
    ];
    const scannable = ignoredSpans.reduce((text, pattern) => text.replace(pattern, ''), content);
    // \B keeps "word#tag" from matching: '#' must not directly follow a word character.
    const tags = Array.from(scannable.matchAll(/\B#([a-zA-Z0-9_]+)/g), (hit) => hit[1].toLowerCase());
    // A Set keeps first-insertion order, which gives first-appearance de-duplication.
    return [...new Set(tags)];
}
/**
 * Extracts regular (non-Nostr) links from content.
 *
 * Three syntaxes are recognised, in this order:
 *   1. Markdown links: `[text](url)`
 *   2. AsciiDoc links: `link:url[text]`
 *   3. Raw `http://` / `https://` URLs
 *
 * Each URL is reported once. The raw-URL pass only looks at text that was not
 * already consumed by a Markdown or AsciiDoc link; otherwise the same target
 * would be reported a second time with trailing syntax attached
 * (e.g. `https://example.com)` or `https://example.com[text]`).
 *
 * @param content Content to scan.
 * @param linkBaseURL Base URL used to decide whether a link is external.
 * @returns Entries of the form `{ url, text, isExternal }`.
 */
function extractLinks(content, linkBaseURL) {
    const links = [];
    const seen = new Set();
    const addLink = (url, text) => {
        if (seen.has(url) || isNostrUrl(url)) {
            return;
        }
        seen.add(url);
        links.push({ url, text, isExternal: isExternalUrl(url, linkBaseURL) });
    };
    // Copy of the content in which every matched link is blanked out, so the
    // raw-URL scan below cannot see it again. Indices are UTF-16 code units,
    // the same units used by `match.index`.
    const masked = content.split('');
    const consume = (match) => {
        masked.fill(' ', match.index, match.index + match[0].length);
    };
    // Markdown links: [text](url)
    for (const match of content.matchAll(/\[([^\]]+)\]\(([^)]+)\)/g)) {
        addLink(match[2], match[1]);
        consume(match);
    }
    // AsciiDoc links: link:url[text]
    for (const match of content.matchAll(/link:([^\[]+)\[([^\]]+)\]/g)) {
        addLink(match[1], match[2]);
        consume(match);
    }
    // Raw URLs in the remaining text
    const rawUrls = masked.join('').match(/https?:\/\/[^\s<>"']+/g) || [];
    for (const url of rawUrls) {
        addLink(url, url);
    }
    return links;
}
/**
 * Collects image and video URLs from the content.
 *
 * Sources are scanned in this order: Markdown images (`![alt](url)`),
 * AsciiDoc block images (`image::url[alt]`), then raw http(s) URLs.
 * A URL is kept only when it looks like an image or video, and only once.
 *
 * @param content Content to scan.
 * @returns Media URLs in order of first appearance.
 */
function extractMedia(content) {
    const media = [];
    const seen = new Set();
    // Records the URL when it is new and points at an image or a video.
    const consider = (url) => {
        if (!url || seen.has(url)) {
            return;
        }
        if (isImageUrl(url) || isVideoUrl(url)) {
            media.push(url);
            seen.add(url);
        }
    };
    for (const hit of content.matchAll(/!\[[^\]]*\]\(([^)]+)\)/g)) {
        consider(hit[1]);
    }
    for (const hit of content.matchAll(/image::([^\[]+)\[/g)) {
        consider(hit[1]);
    }
    for (const url of content.match(/https?:\/\/[^\s<>"']+/g) || []) {
        consider(url);
    }
    return media;
}
/**
 * Determines the kind of a Nostr identifier from its prefix.
 *
 * @param id Identifier without the `nostr:` scheme.
 * @returns 'npub', 'nprofile', 'nevent', 'naddr' or 'note', or null when the
 *          identifier starts with none of them.
 */
function getNostrType(id) {
    const knownPrefixes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'];
    const matched = knownPrefixes.find((prefix) => id.startsWith(prefix));
    return matched === undefined ? null : matched;
}
/**
 * Turns arbitrary text into d-tag form: lower-case, with every run of
 * characters outside a-z / 0-9 collapsed to a single hyphen and no hyphens at
 * either end.
 *
 * @param text Text to normalize.
 * @returns The normalized d-tag (may be empty).
 */
function normalizeDtag(text) {
    const lowered = text.toLowerCase();
    const hyphenated = lowered.replace(/[^a-z0-9]+/g, '-');
    const trimmed = hyphenated.replace(/^-+|-+$/g, '');
    return trimmed;
}
/**
 * Decides whether a URL points outside the site identified by `linkBaseURL`.
 *
 * Hosts are compared as plain strings taken from the `http(s)://host` part.
 * The URL counts as external when no base URL is given, when either host
 * cannot be extracted, or when the extraction itself fails.
 *
 * @param url URL to classify.
 * @param linkBaseURL Base URL of the current site; may be empty.
 * @returns true when the URL is considered external.
 */
function isExternalUrl(url, linkBaseURL) {
    if (!linkBaseURL) {
        return true;
    }
    const hostPattern = /^https?:\/\/([^\/]+)/;
    try {
        const urlHost = url.match(hostPattern);
        const baseHost = linkBaseURL.match(hostPattern);
        // Only two extractable, identical hosts make the link internal.
        return !(urlHost && baseHost) || urlHost[1] !== baseHost[1];
    }
    catch {
        return true;
    }
}
/**
 * Tells whether a link target is a Nostr reference: either carrying the
 * `nostr:` scheme or starting with a known Nostr identifier prefix.
 *
 * @param url Link target to test.
 * @returns true for Nostr references.
 */
function isNostrUrl(url) {
    if (url.startsWith('nostr:')) {
        return true;
    }
    return getNostrType(url) !== null;
}
/**
 * Checks whether a URL points at an image, judged by its file extension
 * (jpeg, jpg, png, gif, webp, svg; case-insensitive).
 * A query string or fragment after the extension is allowed, so
 * `photo.png?width=100` is still recognised.
 *
 * @param url URL to test.
 * @returns true when the URL ends in an image extension.
 */
function isImageUrl(url) {
    return /\.(jpeg|jpg|png|gif|webp|svg)(?:[?#].*)?$/i.test(url);
}
/**
 * Checks whether a URL points at a video, judged by its file extension
 * (mp4, webm, ogg; case-insensitive).
 * A query string or fragment after the extension is allowed, so
 * `clip.mp4?t=10` is still recognised.
 *
 * @param url URL to test.
 * @returns true when the URL ends in a video extension.
 */
function isVideoUrl(url) {
    return /\.(mp4|webm|ogg)(?:[?#].*)?$/i.test(url);
}

92
src/parser.js

@ -0,0 +1,92 @@ @@ -0,0 +1,92 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Parser = void 0;
exports.defaultOptions = defaultOptions;
exports.process = process;
const detector_1 = require("./detector");
const to_asciidoc_1 = require("./converters/to-asciidoc");
const asciidoc_1 = require("./processors/asciidoc");
const metadata_1 = require("./extractors/metadata");
const frontmatter_1 = require("./extractors/frontmatter");
/**
 * Builds the default parser options: an empty link base URL and every
 * processing feature switched on.
 *
 * @returns A fresh options object on each call.
 */
function defaultOptions() {
    const featureFlags = [
        'enableAsciiDoc',
        'enableMarkdown',
        'enableCodeHighlighting',
        'enableLaTeX',
        'enableMusicalNotation',
        'enableNostrAddresses',
    ];
    const options = { linkBaseURL: '' };
    for (const flag of featureFlags) {
        options[flag] = true;
    }
    return options;
}
/**
 * Main parser for Nostr event content.
 *
 * Supports AsciiDoc, Markdown, code syntax, LaTeX, musical notation and
 * `nostr:`-prefixed addresses. All input is first converted to AsciiDoc and
 * then rendered through AsciiDoctor.
 */
class Parser {
    /**
     * @param options Partial options; anything left out (or nullish) falls
     *                back to the value from `defaultOptions()`.
     */
    constructor(options = {}) {
        const defaults = defaultOptions();
        // Caller value first, then the default, then a hard-coded fallback.
        const resolve = (key, fallback) => options[key] ?? defaults[key] ?? fallback;
        this.options = {
            linkBaseURL: resolve('linkBaseURL', ''),
            enableAsciiDoc: resolve('enableAsciiDoc', true),
            enableMarkdown: resolve('enableMarkdown', true),
            enableCodeHighlighting: resolve('enableCodeHighlighting', true),
            enableLaTeX: resolve('enableLaTeX', true),
            enableMusicalNotation: resolve('enableMusicalNotation', true),
            enableNostrAddresses: resolve('enableNostrAddresses', true),
            // The URL templates have no hard-coded fallback.
            wikilinkUrl: options.wikilinkUrl ?? defaults.wikilinkUrl,
            hashtagUrl: options.hashtagUrl ?? defaults.hashtagUrl,
        };
    }
    /**
     * Processes Nostr event content and returns the rendered result.
     *
     * Steps: strip front matter, extract metadata, detect the format, convert
     * to AsciiDoc, render through AsciiDoctor, then merge the render result
     * with the front matter and metadata.
     *
     * @param content Raw event content.
     * @returns The AsciiDoctor result extended with `frontmatter`,
     *          `nostrLinks`, `wikilinks`, `hashtags`, `links` and `media`.
     */
    async process(content) {
        const { linkBaseURL, enableCodeHighlighting, enableLaTeX, enableMusicalNotation, enableNostrAddresses, wikilinkUrl, hashtagUrl, } = this.options;
        // Front matter is removed first; every later step works on the body only.
        const extracted = (0, frontmatter_1.extractFrontmatter)(content);
        const body = extracted.content;
        const metadata = (0, metadata_1.extractMetadata)(body, linkBaseURL);
        const format = (0, detector_1.detectFormat)(body);
        const asciidocContent = (0, to_asciidoc_1.convertToAsciidoc)(body, format, linkBaseURL, {
            enableNostrAddresses: enableNostrAddresses,
        });
        const rendered = await (0, asciidoc_1.processAsciidoc)(asciidocContent, {
            enableCodeHighlighting: enableCodeHighlighting,
            enableLaTeX: enableLaTeX,
            enableMusicalNotation: enableMusicalNotation,
            originalContent: body, // pre-conversion text, used for LaTeX detection
            linkBaseURL: linkBaseURL,
            wikilinkUrl: wikilinkUrl,
            hashtagUrl: hashtagUrl,
        });
        return {
            ...rendered,
            frontmatter: extracted.frontmatter,
            nostrLinks: metadata.nostrLinks,
            wikilinks: metadata.wikilinks,
            hashtags: metadata.hashtags,
            links: metadata.links,
            media: metadata.media,
        };
    }
}
exports.Parser = Parser;
/**
 * Convenience wrapper: builds a one-off Parser from the given options and
 * processes the content with it.
 *
 * @param content Raw event content.
 * @param options Optional parser options; defaults apply when omitted.
 * @returns Promise of the processing result.
 */
async function process(content, options) {
    return new Parser(options).process(content);
}

25
src/parser.ts

@ -3,6 +3,7 @@ import { detectFormat } from './detector'; @@ -3,6 +3,7 @@ import { detectFormat } from './detector';
import { convertToAsciidoc } from './converters/to-asciidoc';
import { processAsciidoc } from './processors/asciidoc';
import { extractMetadata } from './extractors/metadata';
import { extractFrontmatter } from './extractors/frontmatter';
/**
* Default parser options
@ -27,7 +28,7 @@ export function defaultOptions(): ParserOptions { @@ -27,7 +28,7 @@ export function defaultOptions(): ParserOptions {
* Everything is converted to AsciiDoc first, then processed through AsciiDoctor
*/
export class Parser {
private options: Required<ParserOptions>;
private options: Required<Omit<ParserOptions, 'wikilinkUrl' | 'hashtagUrl'>> & Pick<ParserOptions, 'wikilinkUrl' | 'hashtagUrl'>;
constructor(options: ParserOptions = {}) {
const defaults = defaultOptions();
@ -39,6 +40,8 @@ export class Parser { @@ -39,6 +40,8 @@ export class Parser {
enableLaTeX: options.enableLaTeX ?? defaults.enableLaTeX ?? true,
enableMusicalNotation: options.enableMusicalNotation ?? defaults.enableMusicalNotation ?? true,
enableNostrAddresses: options.enableNostrAddresses ?? defaults.enableNostrAddresses ?? true,
wikilinkUrl: options.wikilinkUrl ?? defaults.wikilinkUrl,
hashtagUrl: options.hashtagUrl ?? defaults.hashtagUrl,
};
}
@ -48,15 +51,18 @@ export class Parser { @@ -48,15 +51,18 @@ export class Parser {
* Everything is converted to AsciiDoc first, then processed through AsciiDoctor
*/
async process(content: string): Promise<ProcessResult> {
// Extract metadata from original content (before conversion)
const metadata = extractMetadata(content, this.options.linkBaseURL);
// Extract frontmatter first (before any other processing)
const { frontmatter, content: contentWithoutFrontmatter } = extractFrontmatter(content);
// Extract metadata from content (after removing frontmatter)
const metadata = extractMetadata(contentWithoutFrontmatter, this.options.linkBaseURL);
// Detect content format
const format = detectFormat(content);
// Detect content format (on content without frontmatter)
const format = detectFormat(contentWithoutFrontmatter);
// Convert everything to AsciiDoc format first
const asciidocContent = convertToAsciidoc(
content,
contentWithoutFrontmatter,
format,
this.options.linkBaseURL,
{
@ -71,14 +77,17 @@ export class Parser { @@ -71,14 +77,17 @@ export class Parser {
enableCodeHighlighting: this.options.enableCodeHighlighting,
enableLaTeX: this.options.enableLaTeX,
enableMusicalNotation: this.options.enableMusicalNotation,
originalContent: content, // Pass original for LaTeX detection
originalContent: contentWithoutFrontmatter, // Pass original for LaTeX detection
linkBaseURL: this.options.linkBaseURL, // Pass linkBaseURL for link processing
wikilinkUrl: this.options.wikilinkUrl, // Pass wikilink URL format
hashtagUrl: this.options.hashtagUrl, // Pass hashtag URL format
}
);
// Combine with extracted metadata
// Combine with extracted metadata and frontmatter
return {
...result,
frontmatter,
nostrLinks: metadata.nostrLinks,
wikilinks: metadata.wikilinks,
hashtags: metadata.hashtags,

148
src/processors/asciidoc.js

@ -0,0 +1,148 @@ @@ -0,0 +1,148 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.processAsciidoc = processAsciidoc;
const core_1 = __importDefault(require("@asciidoctor/core"));
const html_utils_1 = require("./html-utils");
const html_postprocess_1 = require("./html-postprocess");
const asciidoctorInstance = (0, core_1.default)();
/**
 * Converts AsciiDoc source to HTML with the shared Asciidoctor instance, then
 * extracts the table of contents, sanitizes, post-processes and link-processes
 * both the body and the TOC.
 *
 * @param content AsciiDoc source text.
 * @param options Feature toggles (`enableCodeHighlighting`, `enableLaTeX`,
 *   `enableMusicalNotation`, all defaulting to true) plus `originalContent`,
 *   `linkBaseURL`, `wikilinkUrl` and `hashtagUrl`.
 * @returns A result object with the rendered `content`, `tableOfContents`,
 *   the `hasLaTeX` / `hasMusicalNotation` flags and empty link/media arrays.
 *   If conversion throws, the error is written to stderr (when available) and
 *   a fallback result wrapping the sanitized input in a `<p>` is returned.
 */
async function processAsciidoc(content, options = {}) {
    const { enableCodeHighlighting = true, enableLaTeX = true, enableMusicalNotation = true, } = options;
    // The article doctype needs a level 1 (=) or level 2 (==) heading before a
    // level 3 (===) one, so switch to the book doctype when the first heading
    // line found in the content is level 3 or deeper.
    const leadingHeading = content.match(/^(={1,6})\s+/m);
    const doctype = leadingHeading && leadingHeading[1].length >= 3 ? 'book' : 'article';
    const attributes = {
        'showtitle': true,
        'sectanchors': true,
        'sectlinks': true,
        'toc': 'left',
        'toclevels': 6,
        'toc-title': 'Table of Contents',
        'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none',
        'stem': enableLaTeX ? 'latexmath' : 'none',
        'plantuml': 'plantuml', // Enable PlantUML diagram support
        'data-uri': true,
        'imagesdir': '',
        'linkcss': false,
        'stylesheet': '',
        'stylesdir': '',
        'prewrap': true,
        'sectnums': false,
        'sectnumlevels': 6,
        'experimental': true,
        'compat-mode': false,
        'attribute-missing': 'warn',
        'attribute-undefined': 'warn',
        'skip-front-matter': true,
        'source-indent': 0,
        'indent': 0,
        'tabsize': 2,
        'tabwidth': 2,
        'hardbreaks': false,
        'paragraph-rewrite': 'normal',
        'sectids': true,
        'idprefix': '',
        'idseparator': '-',
        'sectidprefix': '',
        'sectidseparator': '-'
    };
    try {
        const rendered = asciidoctorInstance.convert(content, {
            safe: 'safe',
            backend: 'html5',
            doctype: doctype,
            attributes: attributes
        });
        const htmlString = typeof rendered === 'string' ? rendered : rendered.toString();
        // Split the generated TOC away from the document body.
        const { toc, contentWithoutTOC } = (0, html_utils_1.extractTOC)(htmlString);
        // Body: sanitize, post-process, then (only with a base URL) process links.
        const sanitized = (0, html_utils_1.sanitizeHTML)(contentWithoutTOC);
        const processed = (0, html_postprocess_1.postProcessHtml)(sanitized, {
            enableMusicalNotation,
            linkBaseURL: options.linkBaseURL,
            wikilinkUrl: options.wikilinkUrl,
            hashtagUrl: options.hashtagUrl,
        });
        let bodyHtml = processed;
        if (options.linkBaseURL) {
            bodyHtml = (0, html_utils_1.processLinks)(processed, options.linkBaseURL);
        }
        // TOC: same pipeline, but musical notation is never processed there.
        const tocSanitized = (0, html_utils_1.sanitizeHTML)(toc);
        const tocProcessed = (0, html_postprocess_1.postProcessHtml)(tocSanitized, {
            enableMusicalNotation: false,
            linkBaseURL: options.linkBaseURL,
            wikilinkUrl: options.wikilinkUrl,
            hashtagUrl: options.hashtagUrl,
        });
        let tocHtml = tocProcessed;
        if (options.linkBaseURL) {
            tocHtml = (0, html_utils_1.processLinks)(tocProcessed, options.linkBaseURL);
        }
        // LaTeX is detected on the caller-supplied original text when present,
        // otherwise on the AsciiDoc that was converted.
        const mathSource = options.originalContent || content;
        const hasLaTeX = enableLaTeX && hasMathContent(mathSource);
        // Musical notation is detected from marker classes in the post-processed body.
        const notationMarkers = /class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/;
        const hasMusicalNotation = enableMusicalNotation && (notationMarkers.test(processed));
        return {
            content: bodyHtml,
            tableOfContents: tocHtml,
            hasLaTeX,
            hasMusicalNotation,
            nostrLinks: [], // Will be populated by metadata extraction
            wikilinks: [],
            hashtags: [],
            links: [],
            media: [],
        };
    }
    catch (error) {
        // Report the failure on stderr when running under Node.js, then fall
        // back to showing the sanitized input as a single paragraph.
        const errorMessage = error instanceof Error ? error.message : String(error);
        const nodeProcess = globalThis.process;
        if (nodeProcess?.stderr) {
            nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
        }
        const fallbackBody = (0, html_utils_1.sanitizeHTML)(content);
        return {
            content: `<p>${fallbackBody}</p>`,
            tableOfContents: '',
            hasLaTeX: false,
            hasMusicalNotation: false,
            nostrLinks: [],
            wikilinks: [],
            hashtags: [],
            links: [],
            media: [],
        };
    }
}
/**
 * Check if content has LaTeX math.
 *
 * Recognized delimiters:
 *  - inline: `$...$` and `\(...\)`
 *  - block:  `$$...$$` and `\[...\]`
 *
 * The `\(...\)` body is matched lazily up to the first closing `\)`, so
 * expressions that contain parentheses themselves (e.g. `\(f(x)\)`) are
 * detected.
 *
 * @param content Text to scan.
 * @returns true when at least one math delimiter pair is found.
 */
function hasMathContent(content) {
    if (!content) {
        return false;
    }
    const mathPatterns = [
        /\$[^$]+\$/, // inline: $...$
        /\\\([\s\S]+?\\\)/, // inline: \(...\) - body may contain ( and )
        /\$\$[\s\S]*?\$\$/, // block: $$...$$
        /\\\[[\s\S]*?\\\]/, // block: \[...\]
    ];
    return mathPatterns.some((pattern) => pattern.test(content));
}

7
src/processors/asciidoc.ts

@ -11,6 +11,8 @@ export interface ProcessOptions { @@ -11,6 +11,8 @@ export interface ProcessOptions {
enableMusicalNotation?: boolean;
originalContent?: string; // Original content for LaTeX detection
linkBaseURL?: string; // Base URL for link processing
wikilinkUrl?: string | ((dtag: string) => string); // Custom URL format for wikilinks
hashtagUrl?: string | ((topic: string) => string); // Custom URL format for hashtags
}
/**
@ -54,6 +56,7 @@ export async function processAsciidoc( @@ -54,6 +56,7 @@ export async function processAsciidoc(
'toc-title': 'Table of Contents',
'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none',
'stem': enableLaTeX ? 'latexmath' : 'none',
'plantuml': 'plantuml', // Enable PlantUML diagram support
'data-uri': true,
'imagesdir': '',
'linkcss': false,
@ -93,6 +96,8 @@ export async function processAsciidoc( @@ -93,6 +96,8 @@ export async function processAsciidoc(
const processed = postProcessHtml(sanitized, {
enableMusicalNotation,
linkBaseURL: options.linkBaseURL,
wikilinkUrl: options.wikilinkUrl,
hashtagUrl: options.hashtagUrl,
});
// Process links: add target="_blank" to external links
@ -105,6 +110,8 @@ export async function processAsciidoc( @@ -105,6 +110,8 @@ export async function processAsciidoc(
const tocProcessed = postProcessHtml(tocSanitized, {
enableMusicalNotation: false, // Don't process music in TOC
linkBaseURL: options.linkBaseURL,
wikilinkUrl: options.wikilinkUrl,
hashtagUrl: options.hashtagUrl,
});
// Process links in TOC as well

594
src/processors/html-postprocess.js

@ -0,0 +1,594 @@ @@ -0,0 +1,594 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.postProcessHtml = postProcessHtml;
const music_1 = require("./music");
/**
 * Post-processes HTML output from AsciiDoctor.
 *
 * Converts the placeholder/macro text left in the HTML (BOOKSTR:, hashtag:,
 * WIKILINK:, link:nostr:, leftover link:http(s)) into HTML elements with data
 * attributes and CSS classes, then runs the media, OpenGraph, image, musical
 * notation (optional), markdown-cleanup, styling and raw-TOC passes in that
 * order.
 *
 * @param html HTML to post-process.
 * @param options `enableMusicalNotation`, `linkBaseURL`, and the optional
 *   `wikilinkUrl` / `hashtagUrl` formats (a function, or a string template
 *   with a `{dtag}` / `{topic}` placeholder).
 * @returns The post-processed HTML.
 */
function postProcessHtml(html, options = {}) {
    const escapeDoubleQuotes = (value) => value.replace(/"/g, '&quot;');
    const escapeQuotes = (value) => value.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
    const escapeText = (value) => value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    const externalIcon = '<svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg>';
    let processed = html;
    // Convert bookstr markers to HTML placeholders
    processed = processed.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => {
        return `<span data-bookstr="${escapeQuotes(bookContent)}" class="bookstr-placeholder"></span>`;
    });
    // Convert hashtag links to HTML
    processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
        const escapedDisplay = escapeText(displayText);
        if (!options.hashtagUrl) {
            // Default: a span styled like a link, but not clickable
            return `<span class="hashtag-link">${escapedDisplay}</span>`;
        }
        // hashtagUrl is either a function or a string template with {topic}.
        // A replacer function is used for the template so that "$" sequences
        // in the hashtag are inserted literally instead of being interpreted
        // as replacement patterns.
        const url = typeof options.hashtagUrl === 'function'
            ? options.hashtagUrl(normalizedHashtag)
            : options.hashtagUrl.replace(/{topic}/g, () => normalizedHashtag);
        const escapedUrl = escapeQuotes(url);
        return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${escapeDoubleQuotes(normalizedHashtag)}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
    });
    // Convert WIKILINK:dtag|display placeholder format to HTML
    // (the pattern excludes < and > so it never matches across HTML tags)
    processed = processed.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => {
        const trimmedDtag = dTag.trim();
        const escapedDtag = escapeDoubleQuotes(trimmedDtag);
        const escapedDisplay = escapeText(displayText.trim());
        let url;
        if (!options.wikilinkUrl) {
            // Default format
            url = `/events?d=${escapedDtag}`;
        }
        else if (typeof options.wikilinkUrl === 'function') {
            url = options.wikilinkUrl(trimmedDtag);
        }
        else {
            // String template with {dtag} placeholder; replacer function keeps
            // "$" sequences in the d-tag literal.
            url = options.wikilinkUrl.replace(/{dtag}/g, () => trimmedDtag);
        }
        const escapedUrl = escapeQuotes(url);
        return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
    });
    // Convert nostr: links to HTML
    processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
        const nostrType = getNostrType(bech32Id);
        const escaped = escapeDoubleQuotes(bech32Id);
        if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
            // Render as embedded event placeholder
            return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`;
        }
        if (nostrType === 'npub' || nostrType === 'nprofile') {
            // Render as user handle
            return `<span class="user-handle" data-pubkey="${escaped}">@${displayText}</span>`;
        }
        // Fallback to regular link. The href uses the escaped identifier, like
        // data-bech32 does, so a double quote in the identifier cannot close
        // the attribute early.
        return `<a href="nostr:${escaped}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${displayText}</a>`;
    });
    // Convert any leftover link:url[text] macros (http/https only) that
    // AsciiDoctor didn't convert, e.g. because the link text has special chars.
    // Relay-style link texts (ws:// / wss://) produce the same markup here;
    // they are excluded from OpenGraph wrapping later by processOpenGraphLinks.
    processed = processed.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => {
        const escapedUrl = escapeQuotes(url);
        const escapedText = escapeText(text);
        return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} ${externalIcon}</a>`;
    });
    // Process media URLs (YouTube, Spotify, video, audio)
    processed = processMedia(processed);
    // Process OpenGraph links (external links that should have rich previews)
    processed = processOpenGraphLinks(processed, options.linkBaseURL);
    // Process images: add max-width styling and data attributes
    processed = processImages(processed);
    // Process musical notation if enabled
    if (options.enableMusicalNotation) {
        processed = (0, music_1.processMusicalNotation)(processed);
    }
    // Clean up any leftover markdown syntax
    processed = cleanupMarkdown(processed);
    // Add styling classes
    processed = addStylingClasses(processed);
    // Hide raw ToC text
    processed = hideRawTocText(processed);
    return processed;
}
/**
 * Classify a Nostr identifier by its prefix.
 *
 * @param id Identifier string to inspect.
 * @returns 'npub', 'nprofile', 'nevent', 'naddr' or 'note' when the
 *   identifier starts with that prefix, otherwise null.
 */
function getNostrType(id) {
    const knownPrefixes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'];
    const found = knownPrefixes.find((prefix) => id.startsWith(prefix));
    return found === undefined ? null : found;
}
/**
 * Replace MEDIA: placeholders with embed/player markup.
 *
 * Handles, in this order:
 *  - MEDIA:youtube:<id>            -> YouTube iframe embed
 *  - MEDIA:spotify:<type>:<id>     -> Spotify iframe embed
 *  - MEDIA:video:<http(s) url>     -> <video> player
 *  - MEDIA:audio:<http(s) url>     -> <audio> player
 *
 * @param html HTML containing MEDIA: placeholders.
 * @returns HTML with the placeholders replaced.
 */
function processMedia(html) {
    const quoteSafe = (value) => value.replace(/"/g, '&quot;');
    const attributeSafe = (value) => value
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    const youtubeEmbed = (_match, videoId) => [
        '<div class="media-embed youtube-embed" style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; max-width: 100%; margin: 1rem 0;">',
        '<iframe',
        'style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"',
        `src="https://www.youtube.com/embed/${quoteSafe(videoId)}"`,
        'frameborder="0"',
        'allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"',
        'allowfullscreen',
        'loading="lazy">',
        '</iframe>',
        '</div>',
    ].join('\n');
    const spotifyEmbed = (_match, type, id) => [
        '<div class="media-embed spotify-embed" style="margin: 1rem 0;">',
        '<iframe',
        'style="border-radius: 12px; width: 100%; max-width: 100%;"',
        `src="https://open.spotify.com/embed/${quoteSafe(type)}/${quoteSafe(id)}?utm_source=generator"`,
        'width="100%"',
        'height="352"',
        'frameborder="0"',
        'allowfullscreen=""',
        'allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"',
        'loading="lazy">',
        '</iframe>',
        '</div>',
    ].join('\n');
    const videoPlayer = (_match, url) => [
        '<div class="media-embed video-embed" style="margin: 1rem 0;">',
        '<video',
        'controls',
        'preload="metadata"',
        'style="width: 100%; max-width: 100%; height: auto; border-radius: 8px;"',
        'class="media-player">',
        `<source src="${attributeSafe(url)}" type="video/mp4">`,
        'Your browser does not support the video tag.',
        '</video>',
        '</div>',
    ].join('\n');
    const audioPlayer = (_match, url) => [
        '<div class="media-embed audio-embed" style="margin: 1rem 0;">',
        '<audio',
        'controls',
        'preload="metadata"',
        'style="width: 100%; max-width: 100%;"',
        'class="media-player">',
        `<source src="${attributeSafe(url)}">`,
        'Your browser does not support the audio tag.',
        '</audio>',
        '</div>',
    ].join('\n');
    return html
        .replace(/MEDIA:youtube:([a-zA-Z0-9_-]+)/g, youtubeEmbed)
        .replace(/MEDIA:spotify:(track|album|playlist|artist|episode|show):([a-zA-Z0-9]+)/g, spotifyEmbed)
        .replace(/MEDIA:video:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, videoPlayer)
        .replace(/MEDIA:audio:(https?:\/\/[^\s<>"{}|\\^`\[\]()]+)/g, audioPlayer);
}
/**
 * Process OpenGraph links - wrap external http(s) anchors in a container that
 * carries the URL in `data-og-url`, so client-side code can fetch and show an
 * OpenGraph preview.
 *
 * Steps:
 *  1. Remove stray AsciiDoc "link:" prefixes and repair href attributes that
 *     contain HTML fragments.
 *  2. Turn anchors inside <code>/<pre> back into plain text (the opening tag
 *     and its attributes are kept), then swap those blocks for placeholders so
 *     the link pass cannot touch them.
 *  3. Rebuild existing spans carrying a data-og-url attribute from the URL
 *     found in that attribute.
 *  4. Wrap each remaining valid external anchor, skipping special links,
 *     media files, YouTube/Spotify, relay (ws/wss) link texts and links whose
 *     host equals the host of `linkBaseURL`.
 *  5. Restore the protected code/pre blocks verbatim.
 *
 * @param html HTML to process.
 * @param linkBaseURL Optional base URL; links to its host are left untouched.
 * @returns HTML with external links wrapped in OpenGraph containers.
 */
function processOpenGraphLinks(html, linkBaseURL) {
    const externalIcon = '<svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg>';
    // Builds the preview container; escapedUrl must already be attribute-safe.
    const buildContainer = (escapedUrl, linkText) => [
        `<span class="opengraph-link-container" data-og-url="${escapedUrl}">`,
        `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} ${externalIcon}</a>`,
        '<div class="opengraph-preview" data-og-loading="true" style="display: none;">',
        '<div class="opengraph-card">',
        '<div class="opengraph-image-container">',
        '<img class="opengraph-image" src="" alt="" style="display: none;" />',
        '</div>',
        '<div class="opengraph-content">',
        '<div class="opengraph-site"></div>',
        '<div class="opengraph-title"></div>',
        '<div class="opengraph-description"></div>',
        '</div>',
        '</div>',
        '</div>',
        '</span>',
    ].join('\n');
    // Replaces every anchor with its inner text.
    const stripAnchors = (inner) => inner.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
    // Matches anchors whose class list contains one of the special link
    // classes. The class attribute holds several classes, so an exact
    // class="wikilink" comparison would never match.
    const specialLinkClass = /class=["'][^"']*\b(?:wikilink|nostr-link|opengraph-link)\b/;
    let processed = html;
    // Remove "link:" immediately before an anchor tag
    processed = processed.replace(/link:\s*<a/gi, '<a');
    // Remove "link:" that appears as plain text glued to surrounding text
    processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2');
    // Remove "link:" preceded by whitespace before an anchor tag
    processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' ');
    // Repair href attributes that contain HTML fragments: keep only the URL
    processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => {
        const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i);
        // If no URL can be extracted, leave it; the validation below skips it
        return urlMatch ? `href="${urlMatch[1]}"` : match;
    });
    // Links inside code and pre blocks should be plain text. The opening tag
    // is preserved so attributes such as highlighting classes survive.
    processed = processed.replace(/(<code[^>]*>)([\s\S]*?)<\/code>/gi, (_match, openTag, inner) => {
        return `${openTag}${stripAnchors(inner)}</code>`;
    });
    processed = processed.replace(/(<pre[^>]*>)([\s\S]*?)<\/pre>/gi, (_match, openTag, inner) => {
        return `${openTag}${stripAnchors(inner)}</pre>`;
    });
    // Protect pre blocks first (they can contain code blocks), then code blocks
    const preBlockPlaceholders = [];
    const codeBlockPlaceholders = [];
    processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => {
        preBlockPlaceholders.push(match);
        return `__PREBLOCK_${preBlockPlaceholders.length - 1}__`;
    });
    processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => {
        codeBlockPlaceholders.push(match);
        return `__CODEBLOCK_${codeBlockPlaceholders.length - 1}__`;
    });
    // Host of linkBaseURL, used to tell internal links from external ones
    let baseDomain = null;
    if (typeof linkBaseURL === 'string' && linkBaseURL) {
        const baseMatch = linkBaseURL.match(/^https?:\/\/([^\/]+)/);
        if (baseMatch) {
            baseDomain = baseMatch[1];
        }
    }
    // Rebuild existing spans that carry a data-og-url attribute
    processed = processed.replace(/<span[^>]*data-og-url=["']([^"']+)["'][^>]*>[\s\S]*?<\/span>/gi, (match) => {
        const dataOgUrlMatch = match.match(/data-og-url=["']([^"']+)["']/i);
        if (!dataOgUrlMatch || !dataOgUrlMatch[1]) {
            return match;
        }
        const urlMatch = dataOgUrlMatch[1].match(/(https?:\/\/[^\s<>"']+)/i);
        if (urlMatch) {
            const containerUrl = urlMatch[1];
            // Reuse the text of the anchor inside the span, if there is one
            const linkMatch = match.match(/<a[^>]*>(.*?)<\/a>/i);
            const linkText = linkMatch ? linkMatch[1] : containerUrl;
            const escapedUrl = containerUrl.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
            return buildContainer(escapedUrl, linkText);
        }
        // No usable URL: drop the span and keep its first text run, if any
        const textMatch = match.match(/>([^<]+)</);
        return textMatch ? textMatch[1] : '';
    });
    // Wrap external http/https anchors. Only complete, well-formed anchor tags
    // are matched; anything that looks corrupted is returned unchanged.
    processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, _before, href, _after, linkText) => {
        if (!href) {
            return match;
        }
        // href must not contain HTML fragments, quotes or a nested href=
        if (['<', '>', 'href=', '"', "'"].some((fragment) => href.includes(fragment))) {
            return match;
        }
        // href must be a clean URL without whitespace
        if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) {
            return match;
        }
        // Exactly one opening and one closing anchor tag
        const openATags = (match.match(/<a\s/g) || []).length;
        const closeATags = (match.match(/<\/a>/g) || []).length;
        if (openATags !== closeATags || openATags !== 1) {
            return match;
        }
        // More than one href attribute means the match is corrupted
        if (match.includes('href="') && match.split('href="').length > 2) {
            return match;
        }
        // Skip wikilinks, nostr links, existing OpenGraph links and embeds
        if (specialLinkClass.test(match) ||
            match.includes('data-embedded-note') ||
            match.includes('youtube-embed') ||
            match.includes('spotify-embed') ||
            match.includes('media-embed') ||
            match.includes('opengraph-link-container')) {
            return match;
        }
        // Skip media file URLs
        if (/\.(mp4|webm|ogg|m4v|mov|avi|mkv|flv|wmv|mp3|m4a|wav|flac|aac|opus|wma|jpeg|jpg|png|gif|webp|svg)$/i.test(href)) {
            return match;
        }
        // Skip YouTube and Spotify (handled as media)
        if (/youtube\.com|youtu\.be|spotify\.com/i.test(href)) {
            return match;
        }
        // Skip relay URLs shown as link text (ws:// or wss://)
        if (/wss?:\/\//i.test(linkText)) {
            return match;
        }
        // Skip links whose host equals the base domain
        if (baseDomain) {
            const hrefHost = href.match(/^https?:\/\/([^\/]+)/);
            if (hrefHost && hrefHost[1] === baseDomain) {
                return match;
            }
        }
        // Escape for use in an attribute. href was captured from serialized
        // HTML, so "&" that already starts a character reference (&amp;,
        // &#38;, ...) is left alone to avoid producing "&amp;amp;".
        const escapedUrl = href
            .replace(/&(?!(?:[a-z][a-z0-9]*|#\d+|#x[0-9a-f]+);)/gi, '&amp;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
        // The actual OpenGraph fetching is done client-side
        return buildContainer(escapedUrl, linkText);
    });
    // Restore protected blocks. A replacer function is used so that "$"
    // sequences inside the code ($&, $$, $' ...) are inserted literally.
    codeBlockPlaceholders.forEach((codeBlock, index) => {
        processed = processed.replace(`__CODEBLOCK_${index}__`, () => codeBlock);
    });
    preBlockPlaceholders.forEach((preBlock, index) => {
        processed = processed.replace(`__PREBLOCK_${index}__`, () => preBlock);
    });
    return processed;
}
/**
 * Process images: add max-width styling and data attributes.
 *
 * Every <img> with a quoted src gets:
 *  - the classes `max-w-[400px] object-contain cursor-zoom-in` (any existing
 *    max-w-* class is dropped; images without a class attribute also get
 *    `h-auto`),
 *  - `data-asciidoc-image="true"`,
 *  - `data-image-index`, the position of its src among the distinct image
 *    URLs of the document,
 *  - `data-image-src`, the src value.
 *
 * A trailing "/" of a self-closing tag (`<img ... />`) is kept at the end of
 * the tag; the added attributes go before it.
 *
 * @param html HTML to process.
 * @returns HTML with the image tags rewritten.
 */
function processImages(html) {
    const imgTagPattern = /<img([^>]+)>/gi;
    const srcPattern = /src=["']([^"']+)["']/i;
    // Collect distinct image URLs with the same src extraction the rewrite
    // below uses, so every rewritten image is found in this list.
    const imageUrls = [];
    for (const tag of html.matchAll(imgTagPattern)) {
        const found = tag[1].match(srcPattern);
        if (found && !imageUrls.includes(found[1])) {
            imageUrls.push(found[1]);
        }
    }
    return html.replace(imgTagPattern, (imgTag, attributes) => {
        const srcMatch = attributes.match(srcPattern);
        if (!srcMatch)
            return imgTag;
        const src = srcMatch[1];
        const currentIndex = imageUrls.indexOf(src);
        // Set a self-closing slash aside so attributes are not appended after it
        const selfClosing = /\/\s*$/.test(attributes);
        let updatedAttributes = selfClosing ? attributes.replace(/\s*\/\s*$/, '') : attributes;
        if (updatedAttributes.match(/class=["']/i)) {
            updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match, classes) => {
                // Drop any existing max-w-* class before adding the fixed one
                const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
                const newClasses = cleanedClasses
                    ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
                    : 'max-w-[400px] object-contain cursor-zoom-in';
                return `class="${newClasses}"`;
            });
        }
        else {
            updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`;
        }
        updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '&quot;')}"`;
        return `<img${updatedAttributes}${selfClosing ? ' /' : ''}>`;
    });
}
/**
 * Strip tracking query parameters from a URL.
 *
 * Removes a fixed list of known tracking parameters and, in addition, every
 * parameter whose name starts with `utm_` or `_`.
 *
 * @param url URL string to clean.
 * @returns The serialized URL without tracking parameters, or the input
 *   unchanged when it cannot be parsed as a URL.
 */
function cleanUrl(url) {
    let parsedUrl;
    try {
        parsedUrl = new URL(url);
    }
    catch {
        // Not a parseable URL: hand it back untouched
        return url;
    }
    const trackingParams = [
        // Google Analytics & Ads
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
        'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
        'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
        // Facebook
        'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
        // Twitter/X
        'twclid', 'twsrc',
        // Microsoft/Bing
        'msclkid', 'mc_cid', 'mc_eid',
        // Adobe
        'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
        // Mailchimp
        'mc_cid', 'mc_eid',
        // HubSpot
        'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src', 'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
        // Marketo
        'mkt_tok',
        // YouTube
        'si', 'feature', 'kw', 'pp',
        // Other common tracking
        'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
        'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
        // Mobile app tracking
        'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
        // Amazon
        'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
        // Affiliate tracking
        'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
        // Social media share tracking
        'share', 'shared', 'sharesource'
    ];
    const query = parsedUrl.searchParams;
    // Exact-name removals
    for (const name of trackingParams) {
        query.delete(name);
    }
    // Prefix-based removals; iterate over a snapshot because keys are deleted
    for (const key of [...query.keys()]) {
        if (key.startsWith('utm_') || key.startsWith('_')) {
            query.delete(key);
        }
    }
    return parsedUrl.toString();
}
/**
 * Clean up leftover markdown syntax.
 *
 * Converts markdown images `![alt](url)` and links `[text](url)` that are
 * still present as text in the HTML into <img> / <a> elements. Tracking
 * parameters are removed from the URLs via cleanUrl.
 *
 * The values interpolated here come from document text, so they are treated
 * as untrusted:
 *  - alt text is escaped before it is placed in the alt attribute;
 *  - a link is only converted when its target is relative or uses one of the
 *    allowed schemes; any other target (e.g. `javascript:`) is left as the
 *    original markdown text.
 *
 * @param html HTML that may contain leftover markdown.
 * @returns HTML with the leftover markdown converted.
 */
function cleanupMarkdown(html) {
    const escapeQuotes = (value) => value.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
    // Decides whether a markdown link target may be used as an href.
    const isSafeLinkTarget = (target) => {
        // Control characters and spaces are removed before checking the
        // scheme - NOTE(review): browsers are understood to ignore some of
        // these inside a scheme; confirm against the URL parsing rules.
        const candidate = target.replace(/[\u0000-\u0020]/g, '');
        const beforePath = candidate.split(/[\/?#]/, 1)[0];
        // No ":" and no "&" (possible character reference for ":") before the
        // first path/query/fragment delimiter: a relative reference
        if (!/[:&]/.test(beforePath)) {
            return true;
        }
        return /^(?:https?|mailto|nostr|wss?|ftp|tel|lightning|bitcoin|magnet):/i.test(candidate);
    };
    // Clean up markdown image syntax
    let cleaned = html.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
        // "&" is left as-is: the text comes from HTML and may already contain
        // character references
        const escapedAlt = escapeQuotes((alt || '').replace(/</g, '&lt;').replace(/>/g, '&gt;'));
        const escapedUrl = escapeQuotes(cleanUrl(url));
        return `<img src="${escapedUrl}" alt="${escapedAlt}" class="max-w-[400px] object-contain my-0" />`;
    });
    // Clean up markdown link syntax
    cleaned = cleaned.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
        // The URL is already used as an href somewhere in the document
        // (`cleaned` still holds the text from before this replace call)
        if (cleaned.includes(`href="${url}"`)) {
            return _match;
        }
        if (!isSafeLinkTarget(url)) {
            return _match;
        }
        const escapedUrl = escapeQuotes(cleanUrl(url));
        const escapedText = text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
        return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
    });
    return cleaned;
}
/**
 * Add the CSS classes used for styling.
 *
 * Extends the class lists of strikethrough, subscript and superscript spans
 * (only spans whose content has no nested tags) and normalizes the class
 * attribute of highlight.js <pre> / <code> elements to `highlightjs hljs`.
 *
 * @param html HTML to style.
 * @returns HTML with the styling classes applied.
 */
function addStylingClasses(html) {
    const rules = [
        // Strikethrough
        [/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>'],
        // Subscript
        [/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>'],
        // Superscript
        [/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>'],
        // Code highlighting
        [/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">'],
        [/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">'],
    ];
    return rules.reduce((markup, [pattern, replacement]) => markup.replace(pattern, replacement), html);
}
/**
 * Remove raw AsciiDoc ToC text that ended up in the HTML.
 *
 * Deletes headings and paragraphs containing "Table of Contents" followed by
 * a number in parentheses, and paragraphs containing "Assumptions" followed
 * by "[n=0]".
 *
 * @param html HTML to clean.
 * @returns HTML without those elements.
 */
function hideRawTocText(html) {
    const rawTocPatterns = [
        /<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi,
        /<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi,
        /<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi,
    ];
    return rawTocPatterns.reduce((markup, pattern) => markup.replace(pattern, ''), html);
}

326
src/processors/html-postprocess.ts

@ -3,6 +3,10 @@ import { processMusicalNotation } from './music'; @@ -3,6 +3,10 @@ import { processMusicalNotation } from './music';
export interface PostProcessOptions {
enableMusicalNotation?: boolean;
linkBaseURL?: string;
/** Custom URL format for wikilinks */
wikilinkUrl?: string | ((dtag: string) => string);
/** Custom URL format for hashtags */
hashtagUrl?: string | ((topic: string) => string);
}
/**
@ -18,7 +22,7 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}): @@ -18,7 +22,7 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}):
return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`;
});
// Convert hashtag links to HTML (styled like links but not clickable)
// Convert hashtag links to HTML
processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
// HTML escape the display text
const escapedDisplay = displayText
@ -27,8 +31,25 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}): @@ -27,8 +31,25 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}):
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Use span instead of <a> tag - same color as links but no underline and not clickable
return `<span class="hashtag-link">${escapedDisplay}</span>`;
// If hashtagUrl is configured, make it a clickable link
if (options.hashtagUrl) {
let url: string;
if (typeof options.hashtagUrl === 'function') {
url = options.hashtagUrl(normalizedHashtag);
} else {
// String template with {topic} placeholder
url = options.hashtagUrl.replace(/{topic}/g, normalizedHashtag);
}
// Escape URL for HTML attribute
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<a class="hashtag-link text-primary-600 dark:text-primary-500 hover:underline" data-topic="${normalizedHashtag.replace(/"/g, '&quot;')}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
} else {
// Default: Use span instead of <a> tag - same color as links but no underline and not clickable
return `<span class="hashtag-link">${escapedDisplay}</span>`;
}
});
// Convert WIKILINK:dtag|display placeholder format to HTML
@ -42,10 +63,24 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}): @@ -42,10 +63,24 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}):
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Always use relative URL for wikilinks (works on any domain)
const url = `/events?d=${escapedDtag}`;
// Generate URL using custom format or default
let url: string;
if (options.wikilinkUrl) {
if (typeof options.wikilinkUrl === 'function') {
url = options.wikilinkUrl(dTag.trim());
} else {
// String template with {dtag} placeholder
url = options.wikilinkUrl.replace(/{dtag}/g, dTag.trim());
}
} else {
// Default format
url = `/events?d=${escapedDtag}`;
}
// Escape URL for HTML attribute
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${url}" href="${url}">${escapedDisplay}</a>`;
return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${escapedUrl}" href="${escapedUrl}">${escapedDisplay}</a>`;
});
// Convert nostr: links to HTML
@ -67,6 +102,31 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}): @@ -67,6 +102,31 @@ export function postProcessHtml(html: string, options: PostProcessOptions = {}):
}
});
// Convert any leftover link: macros that AsciiDoctor didn't convert
// This handles cases where AsciiDoctor couldn't parse the link (e.g., link text with special chars)
// Pattern: link:url[text] where url is http/https and text can contain any characters
processed = processed.replace(/link:(https?:\/\/[^\[]+)\[([^\]]+)\]/g, (_match, url, text) => {
// Escape URL and text for HTML attributes
const escapedUrl = url.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
const escapedText = text
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
// Check if link text contains wss:// or ws:// - these are relay URLs, don't add OpenGraph
const isRelayUrl = /wss?:\/\//i.test(text);
if (isRelayUrl) {
// Simple link without OpenGraph wrapper
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
} else {
// Regular link - will be processed by OpenGraph handler if external
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
}
});
// Process media URLs (YouTube, Spotify, video, audio)
processed = processMedia(processed);
@ -192,7 +252,71 @@ function processMedia(html: string): string { @@ -192,7 +252,71 @@ function processMedia(html: string): string {
* Process OpenGraph links - mark external links for OpenGraph preview fetching
*/
function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
// First, clean up any corrupted HTML fragments that might interfere
// Remove "link:" prefixes that appear before links (AsciiDoc syntax that shouldn't be in HTML)
// This happens when AsciiDoctor doesn't fully convert link:url[text] syntax or when
// there's literal text like "should render like link:" before an anchor tag
let processed = html;
// Remove "link:" that appears immediately before anchor tags (most common case)
// Match "link:" followed by optional whitespace and then <a
processed = processed.replace(/link:\s*<a/gi, '<a');
// Remove "link:" that appears as plain text in HTML (shouldn't be there)
// Be careful not to match "link:" inside HTML attributes or tags
// Match "link:" that's not inside quotes or tags
processed = processed.replace(/([^"'>\s])link:([a-zA-Z0-9])/gi, '$1$2');
// Also handle cases where "link:" appears with whitespace before anchor tags
processed = processed.replace(/\s+link:\s*(?=<a\s+href)/gi, ' ');
// Clean up any corrupted href attributes that contain HTML fragments
processed = processed.replace(/href\s*=\s*["']([^"']*<[^"']*)["']/gi, (match, corruptedHref) => {
// If href contains HTML tags, extract just the URL part
const urlMatch = corruptedHref.match(/(https?:\/\/[^\s<>"']+)/i);
if (urlMatch) {
return `href="${urlMatch[1]}"`;
}
return match; // If we can't fix it, leave it (will be skipped by validation)
});
// Clean up any malformed anchor tag fragments that might cause issues
processed = processed.replace(/<a\s+href=["']([^"'>]*<[^"'>]*)["']/gi, (match, corruptedHref) => {
// Skip corrupted anchor tags - they'll be handled by the main regex with validation
return match;
});
// Clean up links inside code blocks - AsciiDoctor creates them but they should be plain text
// Remove <a> tags inside <code> blocks, keeping only the link text
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match, content) => {
// Remove any <a> tags inside code blocks, keeping only the text content
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
return `<code>${cleaned}</code>`;
});
// Also clean up links inside pre blocks
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match, content) => {
const cleaned = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
return `<pre>${cleaned}</pre>`;
});
// Now protect code blocks and pre blocks by replacing them with placeholders
const codeBlockPlaceholders: string[] = [];
const preBlockPlaceholders: string[] = [];
// Replace pre blocks first (they can contain code blocks)
processed = processed.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (match) => {
const placeholder = `__PREBLOCK_${preBlockPlaceholders.length}__`;
preBlockPlaceholders.push(match);
return placeholder;
});
// Replace code blocks
processed = processed.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, (match) => {
const placeholder = `__CODEBLOCK_${codeBlockPlaceholders.length}__`;
codeBlockPlaceholders.push(match);
return placeholder;
});
// Extract base domain from linkBaseURL if provided
let baseDomain: string | null = null;
@ -206,11 +330,88 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string { @@ -206,11 +330,88 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
// Ignore parsing errors
}
}
// Before processing, remove any corrupted opengraph containers that might have been created
// These have malformed data-og-url attributes containing HTML fragments
// Match all spans with data-og-url and check if they're corrupted
// Use a pattern that matches spans with data-og-url, then check the attribute value
processed = processed.replace(/<span[^>]*data-og-url=["']([^"']+)["'][^>]*>[\s\S]*?<\/span>/gi, (match) => {
// This span has a corrupted data-og-url (contains <)
// Extract the clean URL from the beginning of the attribute value
const dataOgUrlMatch = match.match(/data-og-url=["']([^"']+)["']/i);
if (dataOgUrlMatch && dataOgUrlMatch[1]) {
// Extract just the URL part (everything before the first <)
const urlMatch = dataOgUrlMatch[1].match(/(https?:\/\/[^\s<>"']+)/i);
if (urlMatch) {
const cleanUrl = urlMatch[1];
// Extract the link text from inside the span
const linkMatch = match.match(/<a[^>]*>(.*?)<\/a>/i);
const linkText = linkMatch ? linkMatch[1] : cleanUrl;
// Return a clean opengraph container with the fixed URL
const escapedUrl = cleanUrl.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
<div class="opengraph-preview" data-og-loading="true" style="display: none;">
<div class="opengraph-card">
<div class="opengraph-image-container">
<img class="opengraph-image" src="" alt="" style="display: none;" />
</div>
<div class="opengraph-content">
<div class="opengraph-site"></div>
<div class="opengraph-title"></div>
<div class="opengraph-description"></div>
</div>
</div>
</div>
</span>`;
}
// If we can't extract a clean URL, just remove the corrupted span and keep any text
const textMatch = match.match(/>([^<]+)</);
return textMatch ? textMatch[1] : '';
}
return match; // Keep valid spans
});
// Match external links (http/https) that aren't media, nostr, or wikilinks
// Skip links that are already in media embeds or special containers
// Use a more flexible regex that handles attributes in any order
processed = processed.replace(/<a\s+([^>]*?)href\s*=\s*["'](https?:\/\/[^"']+)["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => {
// Use a stricter regex that only matches valid, complete anchor tags
// The regex must match a complete <a> tag with proper structure
processed = processed.replace(/<a\s+([^>]*\s+)?href\s*=\s*["'](https?:\/\/[^"']{1,2048})["']([^>]*?)>(.*?)<\/a>/gis, (match, before, href, after, linkText) => {
// CRITICAL: Validate href FIRST - if it contains ANY HTML tags or fragments, skip immediately
// This prevents corrupted HTML from being created
if (!href) {
return match; // Skip if no href
}
// Skip if href contains HTML tags or looks corrupted - be very strict
// Check for common HTML fragments that indicate corruption
if (href.includes('<') || href.includes('>') || href.includes('href=') || href.includes('</a>') || href.includes('<a') || href.includes('"') || href.includes("'")) {
return match; // Skip if href looks corrupted
}
// Additional validation: href should only contain URL-safe characters
// URLs shouldn't contain unescaped quotes or HTML tags
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) {
return match; // Skip if href doesn't match clean URL pattern
}
// Validate href is a proper URL (starts with http:// or https:// and doesn't contain invalid chars)
if (!/^https?:\/\/[^\s<>"']+$/i.test(href)) {
return match; // Skip if href doesn't match URL pattern
}
// Skip if the match contains unclosed tags or corrupted HTML
const openATags = (match.match(/<a\s/g) || []).length;
const closeATags = (match.match(/<\/a>/g) || []).length;
if (openATags !== closeATags || openATags !== 1) {
return match; // Multiple or mismatched <a> tags = corrupted
}
// Skip if match contains nested HTML that looks corrupted
if (match.includes('href="') && match.split('href="').length > 2) {
return match; // Multiple href attributes = corrupted
}
// Skip if it's already a media embed, nostr link, wikilink, or opengraph link
if (match.includes('class="wikilink"') ||
match.includes('class="nostr-link"') ||
@ -233,6 +434,12 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string { @@ -233,6 +434,12 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
return match;
}
// Skip if link text contains wss:// or ws:// - these are relay URLs, not web pages
// They don't need OpenGraph previews
if (/wss?:\/\//i.test(linkText)) {
return match;
}
// Check if it's an external link (not same domain)
let isExternal = true;
if (baseDomain) {
@ -260,7 +467,7 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string { @@ -260,7 +467,7 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
// Add data attribute for OpenGraph fetching and wrap in container
// The actual OpenGraph fetching will be done client-side via JavaScript
return `<span class="opengraph-link-container" data-og-url="${escapedUrl}">
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg class="size-3" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="opengraph-link break-words inline-flex items-baseline gap-1">${linkText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>
<div class="opengraph-preview" data-og-loading="true" style="display: none;">
<div class="opengraph-card">
<div class="opengraph-image-container">
@ -276,6 +483,16 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string { @@ -276,6 +483,16 @@ function processOpenGraphLinks(html: string, linkBaseURL?: string): string {
</span>`;
});
// Restore code blocks
codeBlockPlaceholders.forEach((codeBlock, index) => {
processed = processed.replace(`__CODEBLOCK_${index}__`, codeBlock);
});
// Restore pre blocks
preBlockPlaceholders.forEach((preBlock, index) => {
processed = processed.replace(`__PREBLOCK_${index}__`, preBlock);
});
return processed;
}
@ -321,6 +538,81 @@ function processImages(html: string): string { @@ -321,6 +538,81 @@ function processImages(html: string): string {
});
}
/**
 * Strip tracking / attribution query parameters from a URL.
 *
 * Removes every parameter in the known-tracker list below, plus any parameter
 * whose name starts with `utm_` or `_`. The result is the serialised form of the
 * parsed URL, so it may be normalised relative to the input (for example a
 * bare origin gains a trailing `/`).
 *
 * Based on jumble's cleanUrl function.
 *
 * @param url URL to clean.
 * @returns The cleaned URL, or `url` unchanged when it cannot be parsed.
 */
function cleanUrl(url: string): string {
  try {
    const target = new URL(url);
    const query = target.searchParams;

    // Exact parameter names to drop, grouped by origin
    const knownTrackers = [
      // Google Analytics & Ads
      'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
      'utm_id', 'utm_source_platform', 'utm_creative_format', 'utm_marketing_tactic',
      'gclid', 'gclsrc', 'dclid', 'gbraid', 'wbraid',
      // Facebook
      'fbclid', 'fb_action_ids', 'fb_action_types', 'fb_source', 'fb_ref',
      // Twitter/X
      'twclid', 'twsrc',
      // Microsoft/Bing
      'msclkid',
      // Mailchimp
      'mc_cid', 'mc_eid',
      // Adobe
      'adobe_mc', 'adobe_mc_ref', 'adobe_mc_sdid',
      // HubSpot
      'hsCtaTracking', 'hsa_acc', 'hsa_cam', 'hsa_grp', 'hsa_ad', 'hsa_src',
      'hsa_tgt', 'hsa_kw', 'hsa_mt', 'hsa_net', 'hsa_ver',
      // Marketo
      'mkt_tok',
      // YouTube
      'si', 'feature', 'kw', 'pp',
      // Other common tracking
      'ref', 'referrer', 'source', 'campaign', 'medium', 'content',
      'yclid', 'srsltid', '_ga', '_gl', 'igshid', 'epik', 'pk_campaign', 'pk_kwd',
      // Mobile app tracking
      'adjust_tracker', 'adjust_campaign', 'adjust_adgroup', 'adjust_creative',
      // Amazon
      'tag', 'linkCode', 'creative', 'creativeASIN', 'linkId', 'ascsubtag',
      // Affiliate tracking
      'aff_id', 'affiliate_id', 'aff', 'ref_', 'refer',
      // Social media share tracking
      'share', 'shared', 'sharesource',
    ];
    for (const name of knownTrackers) {
      query.delete(name);
    }

    // Snapshot the remaining keys before deleting so the live iterator is never mutated mid-walk
    const prefixed = Array.from(query.keys()).filter(
      (name) => name.startsWith('utm_') || name.startsWith('_'),
    );
    for (const name of prefixed) {
      query.delete(name);
    }

    return target.toString();
  } catch {
    // Unparseable input: hand it back untouched
    return url;
  }
}
/**
* Clean up leftover markdown syntax
*/
@ -330,7 +622,11 @@ function cleanupMarkdown(html: string): string { @@ -330,7 +622,11 @@ function cleanupMarkdown(html: string): string {
// Clean up markdown image syntax
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
const altText = alt || '';
return `<img src="${url}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`;
// Clean URL (remove tracking parameters)
const cleanedUrl = cleanUrl(url);
// Escape for HTML attribute
const escapedUrl = cleanedUrl.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
return `<img src="${escapedUrl}" alt="${altText}" class="max-w-[400px] object-contain my-0" />`;
});
// Clean up markdown link syntax
@ -338,7 +634,13 @@ function cleanupMarkdown(html: string): string { @@ -338,7 +634,13 @@ function cleanupMarkdown(html: string): string {
if (cleaned.includes(`href="${url}"`)) {
return _match;
}
return `<a href="${url}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${text} <svg class="size-3" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
// Clean URL (remove tracking parameters)
const cleanedUrl = cleanUrl(url);
// Escape for HTML attribute
const escapedUrl = cleanedUrl.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
// Escape text for HTML
const escapedText = text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
return `<a href="${escapedUrl}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${escapedText} <svg style="width: 0.75rem; height: 0.75rem; flex-shrink: 0;" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
});
return cleaned;

239
src/processors/html-utils.js

@ -0,0 +1,239 @@ @@ -0,0 +1,239 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractTOC = extractTOC;
exports.sanitizeHTML = sanitizeHTML;
exports.processLinks = processLinks;
/**
 * Extracts the table of contents from AsciiDoc HTML output.
 *
 * The ToC container is the first element matching one of the start patterns
 * below (`<div id="toc" ...>`, `<div class="toc" ...>` or `<nav id="toc" ...>`).
 * Its matching closing tag is located by counting nested `<div>` / `<nav>`
 * open and close tags; tags written as `<div ... />` are treated as
 * self-closing and do not change the nesting depth.
 *
 * When the container is found and balanced:
 *  - `toc` is its inner HTML, trimmed, with the `<div id="toctitle">` element removed;
 *  - `contentWithoutTOC` is the input with the whole container cut out.
 * If the container is never closed, `toc` is empty and the content is left whole.
 *
 * Afterwards, if the remaining content looks like a full HTML document, only
 * the `<body>` contents are kept, and any leftover `<html>`, `<head>` and
 * `<body>` tags are stripped.
 *
 * NOTE(review): when no ToC container is found at all the input is returned
 * as-is, i.e. the document-structure stripping is skipped on that path. This
 * pre-existing behaviour is preserved here; confirm it is intended.
 *
 * @param html HTML produced by the AsciiDoc converter.
 * @returns `{ toc, contentWithoutTOC }` as described above.
 */
function extractTOC(html) {
    // AsciiDoc with toc: 'left' generates a TOC in a div with id="toc" or class="toc"
    let tocContent = '';
    let contentWithoutTOC = html;
    // Find the start of the TOC container - try multiple patterns, most specific first
    const tocStartPatterns = [
        /<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>/i,
        /<div\s+id=["']toc["'][^>]*>/i,
        /<div\s+class=["']toc["'][^>]*>/i,
        /<nav\s+id=["']toc["'][^>]*>/i,
    ];
    let tocStartIdx = -1;
    let tocStartTag = '';
    for (const pattern of tocStartPatterns) {
        const match = html.match(pattern);
        if (match && match.index !== undefined) {
            tocStartIdx = match.index;
            tocStartTag = match[0];
            break;
        }
    }
    if (tocStartIdx === -1) {
        // No TOC found
        return { toc: '', contentWithoutTOC: html };
    }
    // Walk every <div>/<nav> open or close tag after the container's start tag
    // and track nesting depth. The lookahead requires the tag name to end right
    // there, so elements such as <divider> or <navbar> are not miscounted.
    const innerStartIdx = tocStartIdx + tocStartTag.length;
    const tagScanner = /<(\/?)(?:div|nav)(?=[\s\/>])([^>]*)>/gi;
    tagScanner.lastIndex = innerStartIdx;
    let depth = 1;
    let innerEndIdx = -1; // where the container's closing tag begins
    let tocEndIdx = -1; // just past the container's closing tag
    for (let tag = tagScanner.exec(html); tag !== null; tag = tagScanner.exec(html)) {
        if (tag[1] === '/') {
            depth--;
            if (depth === 0) {
                innerEndIdx = tag.index;
                tocEndIdx = tagScanner.lastIndex;
                break;
            }
        }
        else if (!tag[2].endsWith('/')) {
            // Opening tag that is not self-closing
            depth++;
        }
    }
    if (tocEndIdx !== -1) {
        // Inner HTML is everything between the start tag and the start of the
        // closing tag, whatever that closing tag's exact spelling or length is
        tocContent = html.substring(innerStartIdx, innerEndIdx).trim();
        // Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
        tocContent = tocContent.replace(/<div\s+id=["']toctitle["'][^>]*>.*?<\/div>\s*/gis, '');
        tocContent = tocContent.trim();
        // Remove the TOC from the content
        contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx);
    }
    // Extract just the body content if the HTML includes full document structure
    // (AsciiDoctor might return full HTML with <html>, <head>, <body> tags)
    const isFullDocument = /^\s*<!DOCTYPE|^\s*<html/i.test(contentWithoutTOC);
    if (isFullDocument) {
        const bodyStartMatch = contentWithoutTOC.match(/<body[^>]*>/i);
        if (bodyStartMatch && bodyStartMatch.index !== undefined) {
            const bodyStart = bodyStartMatch.index + bodyStartMatch[0].length;
            // Search backwards for </body> so nested content cannot end the body early
            const bodyEndMatch = contentWithoutTOC.lastIndexOf('</body>');
            if (bodyEndMatch !== -1 && bodyEndMatch > bodyStart) {
                contentWithoutTOC = contentWithoutTOC.substring(bodyStart, bodyEndMatch).trim();
            }
        }
    }
    // Remove any remaining document structure tags that might have slipped through
    contentWithoutTOC = contentWithoutTOC
        .replace(/<html[^>]*>/gi, '')
        .replace(/<\/html>/gi, '')
        .replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '')
        .replace(/<body[^>]*>/gi, '')
        .replace(/<\/body>/gi, '');
    // Clean up any extra whitespace
    contentWithoutTOC = contentWithoutTOC.trim();
    return { toc: tocContent, contentWithoutTOC };
}
/**
 * Performs basic, regex-based HTML sanitization to reduce XSS risk.
 *
 * Removes, repeating until the markup stops changing:
 *  - `<script>...</script>` elements together with their content;
 *  - inline event-handler attributes (`onclick`, `onerror`, ...), whether the
 *    value is double-quoted, single-quoted or unquoted;
 *  - the `javascript:` scheme;
 *  - the `data:text/html` prefix of data URLs.
 *
 * The passes repeat because deleting one token can splice a new one together
 * (`javajavascript:script:` becomes `javascript:` after a single pass).
 *
 * An event-handler attribute is only recognised at an attribute boundary
 * (after whitespace, after a closing quote, or after `<tag/`), so ordinary
 * attributes that merely contain "on" - `content="..."`, `data-lilypond="..."` -
 * are left alone.
 *
 * NOTE(review): this is a best-effort filter, not a complete sanitizer; it does
 * not decode entities or handle every obfuscation. Untrusted HTML should
 * additionally go through a parser-based sanitizer before being rendered.
 *
 * @param html HTML markup to sanitize.
 * @returns The markup with the constructs above removed.
 */
function sanitizeHTML(html) {
    let previous;
    do {
        previous = html;
        // Remove script tags and their content (closing tag may contain whitespace)
        html = html.replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '');
        // Remove event handlers (onclick, onerror, etc.) at an attribute boundary;
        // each quote style is matched against its own terminator
        html = html.replace(/(?:\s+|(?<=["']|<[a-z][\w:-]*\/))on\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']+)/gi, '');
        // Remove javascript: protocol in links
        html = html.replace(/javascript:/gi, '');
        // Remove data: URLs that could be dangerous
        html = html.replace(/data:\s*text\/html/gi, '');
        // Every pass only deletes characters, so the loop always terminates
    } while (html !== previous);
    return html;
}
/**
 * Normalises the `target` / `rel` attributes of `<a href>` tags.
 *
 *  - Relative or non-http(s) links: any `target` attribute is removed.
 *  - http(s) links whose hostname equals the hostname of `linkBaseURL`:
 *    any `target` attribute is removed (same tab).
 *  - Other http(s) links without a `target` attribute: `target="_blank"` is
 *    added, and `rel` is created or extended so it carries
 *    `noopener noreferrer`.
 *  - http(s) links that already have a `target` attribute are left untouched.
 *
 * This function is available for use but is not called automatically; it can
 * be used in post-processing if needed.
 *
 * @param html HTML markup to process.
 * @param linkBaseURL Optional URL of the hosting site; used for the same-domain check.
 * @returns The markup with the anchor tags rewritten.
 */
function processLinks(html, linkBaseURL) {
    // Hostname via the global URL constructor; throws when URL is unavailable
    // or the value cannot be parsed, so callers can fall back to string logic.
    const hostnameOf = (value) => {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const URLConstructor = globalThis.URL;
        if (!URLConstructor) {
            throw new Error('URL not available');
        }
        return new URLConstructor(value).hostname;
    };
    // Drops a target="..." attribute (and the whitespace in front of it)
    const withoutTarget = (tag) => tag.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');

    // Work out our own domain for the same-site comparison
    let ownDomain = '';
    if (linkBaseURL) {
        try {
            ownDomain = hostnameOf(linkBaseURL);
        }
        catch {
            // Fallback: strip the scheme and keep everything up to the first '/'
            const [firstSegment] = linkBaseURL.replace(/^https?:\/\//, '').split('/');
            ownDomain = firstSegment;
        }
    }

    // Matches the opening <a ...href="..."...> tag only
    const anchorOpenTag = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g;
    return html.replace(anchorOpenTag, (tag, _before, href) => {
        const isHttpLink = href.startsWith('http://') || href.startsWith('https://');
        if (!isHttpLink) {
            // Local/relative link - always opens in the same tab
            return withoutTarget(tag);
        }
        if (ownDomain) {
            let sameSite;
            try {
                sameSite = hostnameOf(href) === ownDomain;
            }
            catch {
                // If URL parsing fails, use simple string check
                sameSite = href.includes(ownDomain);
            }
            if (sameSite) {
                return withoutTarget(tag);
            }
        }
        if (tag.includes('target=')) {
            // An explicit target is respected as written
            return tag;
        }
        if (!tag.includes('rel=')) {
            return tag.replace('>', ' target="_blank" rel="noopener noreferrer">');
        }
        // Extend the existing rel attribute unless it already carries noopener
        const tagWithRel = tag.replace(/rel\s*=\s*["']([^"']*)["']/gi, (relAttr, relValue) => relValue.includes('noopener') ? relAttr : `rel="${relValue} noopener noreferrer"`);
        return tagWithRel.replace('>', ' target="_blank">');
    });
}

33
src/processors/html-utils.ts

@ -105,6 +105,39 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri @@ -105,6 +105,39 @@ export function extractTOC(html: string): { toc: string; contentWithoutTOC: stri
contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx);
}
// Extract just the body content if the HTML includes full document structure
// AsciiDoctor might return full HTML with <html>, <head>, <body> tags
// Check if this is a full HTML document
const isFullDocument = /^\s*<!DOCTYPE|^\s*<html/i.test(contentWithoutTOC);
if (isFullDocument) {
// Extract body content using a more robust approach
// Find the opening <body> tag
const bodyStartMatch = contentWithoutTOC.match(/<body[^>]*>/i);
if (bodyStartMatch && bodyStartMatch.index !== undefined) {
const bodyStart = bodyStartMatch.index + bodyStartMatch[0].length;
// Find the closing </body> tag by searching backwards from the end
// This is more reliable than regex for nested content
const bodyEndMatch = contentWithoutTOC.lastIndexOf('</body>');
if (bodyEndMatch !== -1 && bodyEndMatch > bodyStart) {
contentWithoutTOC = contentWithoutTOC.substring(bodyStart, bodyEndMatch).trim();
}
}
}
// Remove any remaining document structure tags that might have slipped through
contentWithoutTOC = contentWithoutTOC
.replace(/<html[^>]*>/gi, '')
.replace(/<\/html>/gi, '')
.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '')
.replace(/<body[^>]*>/gi, '')
.replace(/<\/body>/gi, '');
// Clean up any extra whitespace
contentWithoutTOC = contentWithoutTOC.trim();
return { toc: tocContent, contentWithoutTOC };
}

143
src/processors/music.js

@ -0,0 +1,143 @@ @@ -0,0 +1,143 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.processMusicalNotation = processMusicalNotation;
/**
* Processes musical notation in HTML content
* Wraps musical notation in appropriate HTML for rendering
*/
function processMusicalNotation(html) {
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
// These were created by a buggy regex that matched the entire HTML document
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => {
// This is corrupted - extract just the ABC notation from the beginning
let decoded = dataAbc
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// Find the actual ABC notation (starts with X:)
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|&lt;|<\/|sect|div|pre|code)/);
if (abcMatch) {
const cleanAbc = abcMatch[1].trim();
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`;
}
// If we can't extract clean ABC, remove the div entirely
return content;
});
// Clean up code blocks that contain corrupted abc-notation divs inside them
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i);
if (longDataAbcMatch) {
// Extract just the ABC notation from the beginning of the corrupted data-abc value
let decoded = longDataAbcMatch[1]
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// The ABC notation ends where the HTML document starts (</code> or </pre>)
// Extract everything from X: up to (but not including) &lt;/code&gt; or &lt;/pre&gt;
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=&lt;\/code&gt;|&lt;\/pre&gt;)/);
if (abcMatch) {
let cleanAbc = abcMatch[1].trim();
// Remove any trailing HTML entities
cleanAbc = cleanAbc.replace(/&lt;.*$/, '').trim();
// Validate it's reasonable ABC notation
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) {
// Return clean code block - the processing step will wrap it in abc-notation div
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`;
}
}
// If extraction fails, just remove the corrupted div and return empty code block
// This prevents the corrupted data from being rendered
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`;
}
return match;
});
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
// We do NOT auto-detect ABC notation - it must be explicitly marked
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Skip if already processed or corrupted
if (codeContent.includes('abc-notation') ||
codeContent.includes('class="abc-notation"') ||
codeContent.includes('<div') ||
codeContent.includes('</div>') ||
codeContent.length > 5000) {
return match;
}
// Extract ABC content from the code block
let abcContent = codeContent
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#x27;/g, "'")
.replace(/&#x2F;/g, '/');
// Remove any HTML tags
abcContent = abcContent.replace(/<[^>]+>/g, '').trim();
// Only process if it looks like valid ABC notation (starts with X:)
// Since this is explicitly marked as ABC, we trust it's ABC notation
if (abcContent.match(/^X:\s*\d+/m) &&
abcContent.length < 3000 &&
!abcContent.includes('</') &&
!abcContent.includes('<div') &&
!abcContent.includes('sect') &&
!abcContent.includes('class=')) {
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
const lines = abcContent.split('\n');
const abcLines = [];
for (const line of lines) {
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) {
break;
}
if (line.length > 200) {
break;
}
abcLines.push(line);
if (abcLines.join('\n').length > 2000) {
break;
}
}
const cleanAbc = abcLines.join('\n').trim();
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) {
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`;
}
}
return match;
});
// Process LilyPond notation blocks
const lilypondPattern = /(\\relative[^}]+})/gs;
html = html.replace(lilypondPattern, (match) => {
const lilypondContent = match.trim();
return `<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`;
});
// Process inline chord notation: [C], [Am], [F#m7], etc.
const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g;
html = html.replace(chordPattern, (match, chord) => {
return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`;
});
// Process MusicXML-like notation
const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs;
html = html.replace(musicxmlPattern, (match) => {
const musicxmlContent = match.trim();
return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`;
});
return html;
}
/**
 * Escapes a string for use inside a quoted HTML attribute value.
 *
 * The ampersand is escaped first so that text which already looks like a
 * character reference (for example a literal "&lt;") survives a round trip
 * through the HTML parser unchanged instead of being decoded into a
 * different character. Quotes and angle brackets are escaped afterwards,
 * newlines are collapsed to a single space and carriage returns are dropped.
 *
 * @param {string} text - Raw text to embed in an attribute.
 * @returns {string} The attribute-safe text.
 */
function escapeForAttr(text) {
    return text
        // Must run before the other replacements, otherwise the entities
        // produced below would themselves be re-escaped.
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/\n/g, ' ')
        .replace(/\r/g, '');
}

115
src/processors/music.ts

@ -3,11 +3,116 @@ @@ -3,11 +3,116 @@
* Wraps musical notation in appropriate HTML for rendering
*/
export function processMusicalNotation(html: string): string {
// Process ABC notation blocks
const abcBlockPattern = /(X:\s*\d+[^\n]*\n(?:[^\n]+\n)*)/gs;
html = html.replace(abcBlockPattern, (match) => {
const abcContent = match.trim();
return `<div class="abc-notation" data-abc="${escapeForAttr(abcContent)}">${abcContent}</div>`;
// First, clean up any corrupted abc-notation divs with very long data-abc attributes
// These were created by a buggy regex that matched the entire HTML document
html = html.replace(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"[^>]*>([\s\S]*?)<\/div>/gi, (match, dataAbc, content) => {
// This is corrupted - extract just the ABC notation from the beginning
let decoded = dataAbc
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// Find the actual ABC notation (starts with X:)
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]{0,2000}?)(?:\n[^XTCMALK]|&lt;|<\/|sect|div|pre|code)/);
if (abcMatch) {
const cleanAbc = abcMatch[1].trim();
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${content}</div>`;
}
// If we can't extract clean ABC, remove the div entirely
return content;
});
// Clean up code blocks that contain corrupted abc-notation divs inside them
// The corrupted structure is: <code><div class="abc-notation" data-abc="...entire HTML...">...</div></code>
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Check if codeContent contains an abc-notation div with a very long data-abc attribute (>500 chars = corrupted)
const longDataAbcMatch = codeContent.match(/<div[^>]*class="[^"]*abc-notation[^"]*"[^>]*data-abc="([^"]{500,})"/i);
if (longDataAbcMatch) {
// Extract just the ABC notation from the beginning of the corrupted data-abc value
let decoded = longDataAbcMatch[1]
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// The ABC notation ends where the HTML document starts (</code> or </pre>)
// Extract everything from X: up to (but not including) &lt;/code&gt; or &lt;/pre&gt;
const abcMatch = decoded.match(/^(X:\s*\d+[\s\S]*?)(?=&lt;\/code&gt;|&lt;\/pre&gt;)/);
if (abcMatch) {
let cleanAbc = abcMatch[1].trim();
// Remove any trailing HTML entities
cleanAbc = cleanAbc.replace(/&lt;.*$/, '').trim();
// Validate it's reasonable ABC notation
if (cleanAbc.length > 10 && cleanAbc.length < 2000 && cleanAbc.match(/^X:\s*\d+/m)) {
// Return clean code block - the processing step will wrap it in abc-notation div
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc">${cleanAbc}</code></pre>`;
}
}
// If extraction fails, just remove the corrupted div and return empty code block
// This prevents the corrupted data from being rendered
return `<pre class="highlightjs hljs"><code class="language-abc hljs" data-lang="abc"></code></pre>`;
}
return match;
});
// Process ABC notation blocks - ONLY code blocks explicitly marked with language-abc class
// These come from: [source,abc], [source, abc], [abc] in AsciiDoc, or ```abc in Markdown
// We do NOT auto-detect ABC notation - it must be explicitly marked
html = html.replace(/<pre[^>]*><code[^>]*class="[^"]*language-abc[^"]*"[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (match, codeContent) => {
// Skip if already processed or corrupted
if (codeContent.includes('abc-notation') ||
codeContent.includes('class="abc-notation"') ||
codeContent.includes('<div') ||
codeContent.includes('</div>') ||
codeContent.length > 5000) {
return match;
}
// Extract ABC content from the code block
let abcContent = codeContent
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#x27;/g, "'")
.replace(/&#x2F;/g, '/');
// Remove any HTML tags
abcContent = abcContent.replace(/<[^>]+>/g, '').trim();
// Only process if it looks like valid ABC notation (starts with X:)
// Since this is explicitly marked as ABC, we trust it's ABC notation
if (abcContent.match(/^X:\s*\d+/m) &&
abcContent.length < 3000 &&
!abcContent.includes('</') &&
!abcContent.includes('<div') &&
!abcContent.includes('sect') &&
!abcContent.includes('class=')) {
// Extract just the ABC notation (stop at first non-ABC line or reasonable limit)
const lines = abcContent.split('\n');
const abcLines: string[] = [];
for (const line of lines) {
if (line.includes('</') || line.includes('<div') || line.includes('sect') || line.includes('class=')) {
break;
}
if (line.length > 200) {
break;
}
abcLines.push(line);
if (abcLines.join('\n').length > 2000) {
break;
}
}
const cleanAbc = abcLines.join('\n').trim();
if (cleanAbc.match(/^X:\s*\d+/m) && cleanAbc.length > 10 && cleanAbc.length < 2000) {
return `<div class="abc-notation" data-abc="${escapeForAttr(cleanAbc)}">${match}</div>`;
}
}
return match;
});
// Process LilyPond notation blocks

14
src/types.js

@ -0,0 +1,14 @@ @@ -0,0 +1,14 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ContentFormat = void 0;
/**
* Detected content format.
*
* A string-valued enum object: each member name maps to the lowercase string
* stored under it (e.g. ContentFormat.AsciiDoc === "asciidoc").
* NOTE(review): this looks like TypeScript compiler output for a string enum,
* presumably generated from src/types.ts - confirm before editing by hand.
*/
var ContentFormat;
// The IIFE fills in the members; its argument reuses an existing ContentFormat
// object if there is one, otherwise it creates a fresh object and publishes it
// on `exports` in the same expression.
(function (ContentFormat) {
ContentFormat["Unknown"] = "unknown";
ContentFormat["AsciiDoc"] = "asciidoc";
ContentFormat["Markdown"] = "markdown";
ContentFormat["Wikipedia"] = "wikipedia";
ContentFormat["Plain"] = "plain";
})(ContentFormat || (exports.ContentFormat = ContentFormat = {}));

16
src/types.ts

@ -16,6 +16,20 @@ export interface ParserOptions { @@ -16,6 +16,20 @@ export interface ParserOptions {
enableMusicalNotation?: boolean;
/** Enable nostr: address processing (default: true) */
enableNostrAddresses?: boolean;
/**
* Custom URL format for wikilinks. Can be:
* - A string template with {dtag} placeholder: "/d/{dtag}" or "/events?d={dtag}"
* - A function that takes dtag and returns URL: (dtag: string) => `/d/${dtag}`
* Default: "/events?d={dtag}"
*/
wikilinkUrl?: string | ((dtag: string) => string);
/**
* Custom URL format for hashtags. Can be:
* - A string template with {topic} placeholder: "/notes?t={topic}" or "/hashtag/{topic}"
* - A function that takes topic (hashtag without #) and returns URL: (topic: string) => `/notes?t=${topic}`
* Default: undefined (hashtags rendered as non-clickable spans)
*/
hashtagUrl?: string | ((topic: string) => string);
}
/**
@ -49,6 +63,8 @@ export interface ProcessResult { @@ -49,6 +63,8 @@ export interface ProcessResult {
hasLaTeX: boolean;
/** Indicates if musical notation was found */
hasMusicalNotation: boolean;
/** Extracted YAML front matter (if present) */
frontmatter?: Record<string, any>;
/** Extracted Nostr links */
nostrLinks: NostrLink[];
/** Extracted wikilinks */

628
test-parser-report.test.ts

@ -0,0 +1,628 @@ @@ -0,0 +1,628 @@
import { Parser } from './src/parser';
import * as fs from 'fs';
import * as path from 'path';
/**
 * Smoke test for the parser: runs it over the Markdown and the AsciiDoc
 * fixture documents, writes an HTML report of both results next to this
 * file, and checks that each run produced non-empty content.
 */
describe('Parser Test Report', () => {
  const parser = new Parser({
    linkBaseURL: 'https://example.com',
    wikilinkUrl: '/events?d={dtag}',
    hashtagUrl: '/notes?t={topic}',
  });

  // Reads a fixture file that lives in the same directory as this test.
  const loadFixture = (fileName: string): string =>
    fs.readFileSync(path.join(__dirname, fileName), 'utf-8');

  test('Generate HTML test report for markdown and asciidoc documents', async () => {
    const markdownSource = loadFixture('markdown_testdoc.md');
    const asciidocSource = loadFixture('asciidoc_testdoc.adoc');

    // Parsed one after the other, Markdown first.
    const markdownParsed = await parser.process(markdownSource);
    const asciidocParsed = await parser.process(asciidocSource);

    // Build the report and persist it beside the test file.
    const reportHtml = generateHTMLReport({
      markdown: { original: markdownSource, result: markdownParsed },
      asciidoc: { original: asciidocSource, result: asciidocParsed },
    });
    const outputFile = path.join(__dirname, 'test-report.html');
    fs.writeFileSync(outputFile, reportHtml, 'utf-8');

    console.log(`\n✅ Test report generated: ${outputFile}`);
    console.log(` Open this file in your browser to view the results.\n`);

    // Minimal sanity checks: both documents yielded some content.
    const parsedResults = [markdownParsed, asciidocParsed];
    for (const parsed of parsedResults) {
      expect(parsed.content).toBeTruthy();
    }
    for (const parsed of parsedResults) {
      expect(parsed.content.length).toBeGreaterThan(0);
    }
  });
});
/** One fixture document together with what the parser produced for it. */
type TestData = {
  /** Document text exactly as it was read from disk. */
  original: string;
  /** Value returned by the parser for `original`; deliberately left untyped here. */
  result: any;
};
/** Input of the report generator: one entry per fixture format. */
type ReportData = {
  /** Source and parse result of the Markdown fixture. */
  markdown: TestData;
  /** Source and parse result of the AsciiDoc fixture. */
  asciidoc: TestData;
};
/**
 * Builds the complete, self-contained HTML report page for both parsed
 * fixture documents.
 *
 * The page contains one tabbed section per format (Markdown, AsciiDoc) with
 * an overview, the original source, the rendered output and the extracted
 * metadata. The parser's rendered HTML and table of contents are embedded
 * verbatim (that is the point of the report); every other dynamic value is
 * passed through `escapeHtml`.
 *
 * @param data - Source text and parser result for each fixture format.
 * @returns The full HTML document as a string.
 */
function generateHTMLReport(data: ReportData): string {
  const { markdown, asciidoc } = data;
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GC Parser Test Report</title>
<style>
${reportStyles()}
</style>
</head>
<body>
<div class="container">
<h1>GC Parser Test Report</h1>
<p class="subtitle">Generated: ${new Date().toLocaleString()}</p>
${renderReportSection('md', 'Markdown', markdown)}
${renderReportSection('ad', 'AsciiDoc', asciidoc)}
</div>
<script>
${reportScript()}
</script>
</body>
</html>`;
}

/**
 * Renders one tabbed report section.
 *
 * @param idPrefix - Prefix for the tab panel ids ("md" or "ad"); keeps ids unique per section.
 * @param formatName - Human-readable format name used in headings.
 * @param doc - Source text and parser result to display.
 */
function renderReportSection(idPrefix: string, formatName: string, doc: TestData): string {
  const { original, result } = doc;

  // [panel id suffix, button label]; the first tab starts out active.
  const tabs: ReadonlyArray<readonly [string, string]> = [
    ['overview', 'Overview'],
    ['original', 'Original Content'],
    ['rendered', 'Rendered Output'],
    ['metadata', 'Metadata'],
  ];
  const tabButtons = tabs
    .map(
      ([suffix, label], index) =>
        `<button class="tab${index === 0 ? ' active' : ''}" onclick="showTab(this, '${idPrefix}-${suffix}')">${label}</button>`
    )
    .join('\n');

  // [displayed value, caption] for the overview cards.
  const stats: ReadonlyArray<readonly [string | number, string]> = [
    [result.nostrLinks.length, 'Nostr Links'],
    [result.wikilinks.length, 'Wikilinks'],
    [result.hashtags.length, 'Hashtags'],
    [result.links.length, 'Links'],
    [result.media.length, 'Media URLs'],
    [result.hasLaTeX ? 'Yes' : 'No', 'Has LaTeX'],
    [result.hasMusicalNotation ? 'Yes' : 'No', 'Has Music'],
  ];
  const statCards = stats
    .map(
      ([value, label]) => `<div class="stat-card">
<div class="number">${value}</div>
<div class="label">${label}</div>
</div>`
    )
    .join('\n');

  const frontmatterHtml = result.frontmatter
    ? `<div class="metadata-grid">
${Object.entries(result.frontmatter)
  .map(
    ([key, value]) => `<div class="metadata-item">
<strong>${escapeHtml(key)}</strong>
<code>${escapeHtml(JSON.stringify(value))}</code>
</div>`
  )
  .join('\n')}
</div>`
    : '<p><em>No frontmatter found</em></p>';

  // Each list is omitted entirely when the parser found nothing of that kind.
  const metadataHtml = [
    renderMetadataList(
      'Nostr Links',
      result.nostrLinks.map(
        (link) =>
          `<strong>${escapeHtml(link.type)}</strong>: <code>${escapeHtml(link.bech32)}</code>${
            link.text ? ` - ${escapeHtml(link.text)}` : ''
          }`
      )
    ),
    renderMetadataList(
      'Wikilinks',
      result.wikilinks.map(
        (wl) =>
          `<code>${escapeHtml(wl.original)}</code> dtag: <code>${escapeHtml(wl.dtag)}</code>${
            wl.display ? ` (display: ${escapeHtml(wl.display)})` : ''
          }`
      )
    ),
    renderMetadataList(
      'Hashtags',
      result.hashtags.map((tag) => `<code>#${escapeHtml(tag)}</code>`)
    ),
    renderMetadataList(
      'Links',
      result.links.map(
        (link) =>
          `<a href="${escapeHtml(link.url)}" target="_blank">${escapeHtml(link.text || link.url)}</a>${
            link.isExternal ? '\n<span class="warning-badge">External</span>' : ''
          }`
      )
    ),
    renderMetadataList(
      'Media URLs',
      result.media.map(
        (url) => `<a href="${escapeHtml(url)}" target="_blank">${escapeHtml(url)}</a>`
      )
    ),
    result.tableOfContents
      ? `<h4>Table of Contents</h4>
<div class="rendered-output">
${result.tableOfContents}
</div>`
      : '',
  ]
    .filter((part) => part !== '')
    .join('\n');

  return `<!-- ${formatName} Section -->
<div class="section">
<h2>${formatName} Document Test <span class="success-badge"> Parsed</span></h2>
<div class="tabs">
${tabButtons}
</div>
<div id="${idPrefix}-overview" class="tab-content active">
<div class="stats">
${statCards}
</div>
<h3>Frontmatter</h3>
${frontmatterHtml}
</div>
<div id="${idPrefix}-original" class="tab-content">
<h3>Original ${formatName} Content</h3>
<div class="code-block">
<pre>${escapeHtml(original)}</pre>
</div>
</div>
<div id="${idPrefix}-rendered" class="tab-content">
<h3>Rendered HTML Output</h3>
<div class="rendered-output">
${result.content}
</div>
<details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
<div class="code-block" style="margin-top: 10px;">
<pre>${escapeHtml(result.content)}</pre>
</div>
</details>
</div>
<div id="${idPrefix}-metadata" class="tab-content">
<h3>Extracted Metadata</h3>
${metadataHtml}
</div>
</div>`;
}

/**
 * Renders a counted heading followed by one `.list-item` row per entry.
 * Returns an empty string when there are no entries.
 *
 * @param heading - Heading text (already safe HTML).
 * @param itemsHtml - Pre-rendered, already escaped inner HTML of each row.
 */
function renderMetadataList(heading: string, itemsHtml: readonly string[]): string {
  if (itemsHtml.length === 0) {
    return '';
  }
  const rows = itemsHtml
    .map(
      (itemHtml) => `<div class="list-item">
${itemHtml}
</div>`
    )
    .join('\n');
  return `<h4>${heading} (${itemsHtml.length})</h4>
${rows}`;
}

/** Returns the stylesheet embedded in the report page. */
function reportStyles(): string {
  return `* {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}
body {
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
  line-height: 1.6;
  color: #333;
  background: #f5f5f5;
  padding: 20px;
}
.container {
  max-width: 1400px;
  margin: 0 auto;
}
h1 {
  color: #2c3e50;
  margin-bottom: 10px;
  font-size: 2.5em;
}
.subtitle {
  color: #7f8c8d;
  margin-bottom: 30px;
  font-size: 1.1em;
}
.section {
  background: white;
  border-radius: 8px;
  padding: 30px;
  margin-bottom: 30px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.section h2 {
  color: #34495e;
  margin-bottom: 20px;
  padding-bottom: 10px;
  border-bottom: 2px solid #3498db;
  font-size: 1.8em;
}
.section h3 {
  color: #2c3e50;
  margin-top: 25px;
  margin-bottom: 15px;
  font-size: 1.3em;
}
.tabs {
  display: flex;
  gap: 10px;
  margin-bottom: 20px;
  border-bottom: 2px solid #e0e0e0;
}
.tab {
  padding: 12px 24px;
  background: #f8f9fa;
  border: none;
  border-top-left-radius: 6px;
  border-top-right-radius: 6px;
  cursor: pointer;
  font-size: 1em;
  font-weight: 500;
  color: #555;
  transition: all 0.2s;
}
.tab:hover {
  background: #e9ecef;
}
.tab.active {
  background: #3498db;
  color: white;
}
.tab-content {
  display: none;
}
.tab-content.active {
  display: block;
}
.metadata-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
  gap: 15px;
  margin-top: 15px;
}
.metadata-item {
  background: #f8f9fa;
  padding: 12px;
  border-radius: 4px;
  border-left: 3px solid #3498db;
}
.metadata-item strong {
  color: #2c3e50;
  display: block;
  margin-bottom: 5px;
}
.metadata-item code {
  background: #e9ecef;
  padding: 2px 6px;
  border-radius: 3px;
  font-size: 0.9em;
}
.code-block {
  background: #2d2d2d;
  color: #f8f8f2;
  padding: 15px;
  border-radius: 6px;
  overflow-x: auto;
  font-family: 'Courier New', monospace;
  font-size: 0.9em;
  line-height: 1.5;
  margin: 15px 0;
  max-height: 400px;
  overflow-y: auto;
}
.code-block pre {
  margin: 0;
  white-space: pre-wrap;
  word-wrap: break-word;
}
.rendered-output {
  background: white;
  border: 1px solid #ddd;
  padding: 20px;
  border-radius: 6px;
  margin: 15px 0;
  min-height: 200px;
}
/* Plain descendant selector: ":global(*)" is Svelte syntax and is dropped by browsers. */
.rendered-output * {
  max-width: 100%;
}
.stats {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
  gap: 15px;
  margin-top: 20px;
}
.stat-card {
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  color: white;
  padding: 20px;
  border-radius: 8px;
  text-align: center;
}
.stat-card .number {
  font-size: 2.5em;
  font-weight: bold;
  margin-bottom: 5px;
}
.stat-card .label {
  font-size: 0.9em;
  opacity: 0.9;
}
.list-item {
  background: #f8f9fa;
  padding: 8px 12px;
  margin: 5px 0;
  border-radius: 4px;
  border-left: 3px solid #95a5a6;
}
.list-item code {
  background: #e9ecef;
  padding: 2px 6px;
  border-radius: 3px;
  font-size: 0.85em;
}
.success-badge {
  display: inline-block;
  background: #27ae60;
  color: white;
  padding: 4px 12px;
  border-radius: 12px;
  font-size: 0.85em;
  font-weight: 500;
  margin-left: 10px;
}
.warning-badge {
  display: inline-block;
  background: #f39c12;
  color: white;
  padding: 4px 12px;
  border-radius: 12px;
  font-size: 0.85em;
  font-weight: 500;
  margin-left: 10px;
}
.comparison {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 20px;
  margin-top: 20px;
}
@media (max-width: 768px) {
  .comparison {
    grid-template-columns: 1fr;
  }
}
.json-view {
  background: #f8f9fa;
  padding: 15px;
  border-radius: 6px;
  overflow-x: auto;
  font-family: 'Courier New', monospace;
  font-size: 0.85em;
  max-height: 300px;
  overflow-y: auto;
}`;
}

/**
 * Returns the inline script that drives the tabs.
 *
 * `showTab` receives the clicked button explicitly (instead of reading the
 * deprecated global `event`) and only touches tabs and panels inside the
 * button's own `.section`, so switching tabs in one section leaves the
 * other section's active tab visible.
 */
function reportScript(): string {
  return `function showTab(button, tabId) {
  // Limit the toggle to the section that owns the clicked button
  const section = button.closest('.section');
  if (!section) {
    return;
  }
  section.querySelectorAll('.tab-content').forEach(content => content.classList.remove('active'));
  section.querySelectorAll('.tab').forEach(tab => tab.classList.remove('active'));
  // Show selected tab content
  const selectedContent = document.getElementById(tabId);
  if (selectedContent) {
    selectedContent.classList.add('active');
  }
  // Mark the clicked tab as active
  button.classList.add('active');
}`;
}
/**
 * Replaces the five HTML-significant characters (& < > " ') with their
 * entity equivalents so the text can be embedded in markup as literal text.
 *
 * @param text - Raw text to escape.
 * @returns The escaped text; all other characters are left untouched.
 */
function escapeHtml(text: string): string {
  return text.replace(/[&<>"']/g, (ch) => {
    switch (ch) {
      case '&':
        return '&amp;';
      case '<':
        return '&lt;';
      case '>':
        return '&gt;';
      case '"':
        return '&quot;';
      case "'":
        return '&#039;';
      default:
        // Not reachable: the pattern only matches the five characters above.
        return ch;
    }
  });
}

13415
test-report.html

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save