Compare commits

..

No commits in common. '3498f764d450f78d1c4a1b67e084eb10ff29d7eb' and 'ff8f3c47a9e90e60850f52b1a2c8fca8d322556e' have entirely different histories.

  1. 1
      .gitignore
  2. 388
      asciidoc_testdoc.adoc
  3. 55
      example.js
  4. 23
      jest.config.js
  5. 277
      markdown_testdoc.md
  6. 18
      package.json
  7. 353
      src/__tests__/asciidoc.test.ts
  8. 238
      src/__tests__/parser.test.ts
  9. 332
      src/converters/to-asciidoc.ts
  10. 101
      src/detector.ts
  11. 274
      src/extractors/metadata.ts
  12. 220
      src/parser.ts
  13. 562
      src/post-processor.ts
  14. 175
      src/pre-processor.ts
  15. 216
      src/processors/asciidoc.ts
  16. 212
      src/processors/html-postprocess.ts
  17. 211
      src/processors/html-utils.ts
  18. 244
      src/processors/markdown.ts
  19. 47
      src/processors/music.ts
  20. 31
      src/types.ts
  21. 20
      src/types/asciidoctor.d.ts
  22. 1
      tsconfig.json

1
.gitignore vendored

@ -20,7 +20,6 @@ node_modules/ @@ -20,7 +20,6 @@ node_modules/
package-lock.json
dist/
*.log
test-output/
# IDE
.idea/

388
asciidoc_testdoc.adoc

@ -1,388 +0,0 @@ @@ -1,388 +0,0 @@
= AsciiDoc Test Document
Kismet Lee
2.9, October 31, 2021: Fall incarnation
:description: Test description
:author: Kismet Lee
:date: 2021-10-31
:version: 2.9
:status: Draft
:keywords: AsciiDoc, Test, Document
:category: Test
:language: English
== Bullet list
This is a test unordered list with mixed bullets:
* First item with a number 2. in it
* Second item
* Third item
** Indented item
** Indented item
* Fourth item
Another unordered list:
* 1st item
* 2nd item
* third item containing _italic_ text
** indented item
** second indented item
* fourth item
This is a test ordered list with indented items:
. First item
. Second item
. Third item
.. Indented item
.. Indented item
. Fourth item
Ordered list where everything has no number:
. First item
. Second item
. Third item
. Fourth item
This is a mixed list with indented items:
. First item
. Second item
. Third item
* Indented item
* Indented item
. Fourth item
This is another mixed list with indented items:
* First item
* Second item
* Third item
. Indented item
. Indented item
* Fourth item
== Headers
=== Third-level header
==== Fourth-level header
===== Fifth-level header
====== Sixth-level header
== Media and Links
=== Nostr address
This should be ignored and rendered as plaintext: naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
This is also plaintext:
npub1gv069u6q7zkl393ad47xutpqmyfj0rrfrlnqnlfc2ld38k8nnl4st9wa6q
These should be turned into links:
nostr:naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
nostr:npub1l5sga6xg72phsz5422ykujprejwud075ggrr3z2hwyrfgr7eylqstegx9z
nostr:nevent1qvzqqqqqqypzp382htsmu08k277ps40wqhnfm60st89h5pvjyutghq9cjasuh38qqythwumn8ghj7un9d3shjtnswf5k6ctv9ehx2ap0qqsysletg3lqnl4uy59xsj4rp9rgw67wg23l827f4uvn5ckn20fuxcq45d8pj
nostr:nprofile1qqsxhedgkuneycxpcdjlg6tgtxdy8gurdz64nq2h0flc288a0jag98qguy3nh
nostr:note1txyefcha2xt3pgungx4k6j077dsteyef6hzpyuuku00s4h0eymzq4k33yg
=== Hashtag
#testhashtag at the start of the line and #inlinehashtag in the middle
=== Wikilinks
[[NKBIP-01|Specification]] and [[mirepoix]]
=== URL
https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html
link:https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html[Welt Online link]
this should render as plaintext: `http://www.example.com`
this should be a hyperlink to the http URL with the same address link:https://theforest.nostr1.com[wss://theforest.nostr1.com]
=== Images
https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
image::https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png[width=400]
=== Media
==== YouTube
Normal
https://www.youtube.com/watch?v=KGIAS0cslSU
https://youtu.be/KGIAS0cslSU
video::KGIAS0cslSU[youtube]
Shorts
https://www.youtube.com/shorts/s-BQhXdCs8Y
video::s-BQhXdCs8Y[youtube]
==== Spotify
https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx
link:https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx[]
==== Audio
https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3
audio::https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3[]
==== Video
https://v.nostr.build/MTjaYib4upQuf8zn.mp4
video::https://v.nostr.build/MTjaYib4upQuf8zn.mp4[]
== Tables
=== Orderly
[cols="1,2"]
|===
|Syntax|Description
|Header
|Title
|Paragraph
|Text
|===
=== Unorderly
[cols="1,2"]
|===
|Syntax|Description
|Header
|Title
|Paragraph
|Text
|===
=== With alignment
[cols="<,^,>"]
|===
|Syntax|Description|Test Text
|Header
|Title
|Here's this
|Paragraph
|Text
|And more
|===
== Code blocks
=== json
[source,json]
----
{
"id": "<event_id>",
"pubkey": "<event_originator_pubkey>",
"created_at": 1725087283,
"kind": 30040,
"tags": [
["d", "aesop's-fables-by-aesop"],
["title", "Aesop's Fables"],
["author", "Aesop"],
],
"sig": "<event_signature>"
}
----
=== typescript
[source,typescript]
----
/**
* Get Nostr identifier type
*/
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
if (id.startsWith('npub')) return 'npub';
if (id.startsWith('nprofile')) return 'nprofile';
if (id.startsWith('nevent')) return 'nevent';
if (id.startsWith('naddr')) return 'naddr';
if (id.startsWith('note')) return 'note';
return null;
}
----
=== shell
[source,shell]
----
mkdir new_directory
cp source.txt destination.txt
----
=== LaTeX
[source,latex]
----
$$
M =
\begin{bmatrix}
\frac{5}{6} & \frac{1}{6} & 0 \\[0.3em]
\frac{5}{6} & 0 & \frac{1}{6} \\[0.3em]
0 & \frac{5}{6} & \frac{1}{6}
\end{bmatrix}
$$
----
[source,latex]
----
$$
f(x)=
\begin{cases}
1/d_{ij} & \quad \text{when $d_{ij} \leq 160$}\\
0 & \quad \text{otherwise}
\end{cases}
$$
----
=== ABC Notation
[abc]
----
X:1
T:Ohne Titel
C:Aufgezeichnet 1784
A:Seibis nahe Lichtenberg in Oberfranken
S:Handschrift, bezeichnet und datiert: "Heinrich Nicol Philipp zu Seibis den 30 Junius 1784"
M:4/4
L:1/4
K:D
dd d2 | ee e2 | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
|:\
fg ad | cB cA | fg ad | cB cA |\
dd d2 | ee e2 | fg ad | ed/c/ d2 :|
----
=== PlantUML
[plantuml]
----
@startuml
Alice -> Bob: Authentication Request
Bob --> Alice: Authentication Response
@enduml
----
=== BPMN
[plantuml]
----
@startbpmn
start
:Task 1;
:Task 2;
stop
@endbpmn
----
== LaTeX
=== LaTeX in inline-code
`$[ x^n + y^n = z^n \]$` and `$[\sqrt{x^2+1}\]$` and `$\color{blue}{X \sim Normal \; (\mu,\sigma^2)}$`
== Footnotes
Here's a simple footnote,footnote:[This is the first footnote.] and here's a longer one.footnote:[Here's one with multiple paragraphs and code.]
== Anchor links
<<_bullet_list,Link to bullet list section>>
== Formatting
=== Strikethrough
[line-through]#The world is flat.# We now know that the world is round. This should not be ~struck~ through.
=== Bold
This is *bold* text. So is this *bold* text.
=== Italic
This is _italic_ text. So is this _italic_ text.
=== Task List
* [x] Write the press release
* [ ] Update the website
* [ ] Contact the media
=== Emoji shortcodes
Gone camping! :tent: Be back soon.
That is so funny! :joy:
=== Marking and highlighting text
I need to highlight these [highlight]#very important words#.
=== Subscript and Superscript
H~2~O
X^2^
=== Delimiter
based upon a single quote
'''
based upon dashes
---
=== Quotes
[quote]
____
This is a single line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
____
[quote]
____
This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
____

55
example.js

@ -0,0 +1,55 @@ @@ -0,0 +1,55 @@
#!/usr/bin/env node
/**
* Example usage of gc-parser
* This can be called from Go or used directly in Node.js
*/
const { Parser, defaultOptions } = require('./dist/index.js');
/**
 * CLI entry point: parses the supplied content and prints the result as JSON.
 *
 * The content is taken from the first command-line argument when one is given;
 * otherwise it is read line by line from stdin. Exits with status 1 when no
 * content was provided or when the parser rejects.
 */
async function main() {
  // Start from the library defaults and override only the link base URL.
  const parserOptions = defaultOptions();
  parserOptions.linkBaseURL = process.env.LINK_BASE_URL || 'https://example.com';
  const parser = new Parser(parserOptions);

  let content = '';
  const cliArgument = process.argv[2];
  if (cliArgument) {
    content = cliArgument;
  } else {
    // No argument given: accumulate stdin, re-appending the newline that
    // readline strips from every line.
    const { createInterface } = require('readline');
    const stdinLines = createInterface({
      input: process.stdin,
      output: process.stdout,
      terminal: false
    });
    for await (const stdinLine of stdinLines) {
      content += `${stdinLine}\n`;
    }
  }

  if (!content) {
    console.error('No content provided');
    process.exit(1);
  }

  try {
    const parsed = await parser.process(content);
    // Pretty-printed JSON so that a calling process can parse the output easily.
    console.log(JSON.stringify(parsed, null, 2));
  } catch (error) {
    console.error('Error processing content:', error);
    process.exit(1);
  }
}
// Run the CLI only when this file is executed directly, not when it is
// loaded with require() by another module.
if (require.main === module) {
  // main() reports parser failures itself; this handler covers anything that
  // rejects outside its try block (for example an error while reading stdin)
  // so the process exits non-zero instead of leaving an unhandled rejection.
  main().catch((error) => {
    console.error('Unexpected error:', error);
    process.exit(1);
  });
}
module.exports = { main };

23
jest.config.js

@ -1,23 +0,0 @@ @@ -1,23 +0,0 @@
// Jest configuration for the TypeScript sources under src/.
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  // Test discovery is limited to the src/ tree.
  roots: ['<rootDir>/src'],
  testMatch: ['**/__tests__/**/*.test.ts', '**/?(*.)+(spec|test).ts'],
  // asciidoc.test.ts is excluded from Jest runs; it is a standalone script with
  // its own runner, started through the "test:asciidoc" npm script.
  testPathIgnorePatterns: ['/node_modules/', '/dist/', 'asciidoc.test.ts'],
  transform: {
    // TypeScript files are compiled by ts-jest with esModuleInterop enabled.
    '^.+\\.ts$': ['ts-jest', {
      tsconfig: {
        esModuleInterop: true,
      },
    }],
    // Plain JavaScript files are transformed by babel-jest.
    '^.+\\.js$': 'babel-jest',
  },
  moduleFileExtensions: ['ts', 'js', 'json'],
  moduleNameMapper: {
    // Resolve `marked` to its UMD bundle.
    // NOTE(review): presumably because Jest cannot load the package's default
    // entry point as-is — confirm.
    '^marked$': '<rootDir>/node_modules/marked/lib/marked.umd.js',
  },
  // Coverage is collected from all TypeScript sources except declaration files.
  collectCoverageFrom: [
    'src/**/*.ts',
    '!src/**/*.d.ts',
  ],
};

277
markdown_testdoc.md

@ -1,277 +0,0 @@ @@ -1,277 +0,0 @@
---
# this is YAML front matter
author: James Smith
summary: This is a summary
topics: list, of, topics
variable: one
array:
- one thing
- two things
- several things
# all of this data is available to our layout
---
# Markdown Test Document
## Bullet list
This is a test unordered list with mixed bullets:
* First item with a number 2. in it
* Second item
* Third item
- Indented item
- Indented item
* Fourth item
Another unordered list:
- 1st item
- 2nd item
- third item containing _italic_ text
- indented item
- second indented item
- fourth item
This is a test ordered list with indented items:
1. First item
2. Second item
3. Third item
1. Indented item
2. Indented item
4. Fourth item
Ordered list that is wrongly numbered:
1. First item
8. Second item
3. Third item
5. Fourth item
This is a mixed list with indented items:
1. First item
2. Second item
3. Third item
* Indented item
* Indented item
4. Fourth item
This is another mixed list with indented items:
- First item
- Second item
- Third item
1. Indented item
2. Indented item
- Fourth item
## Headers
### Third-level header
#### Fourth-level header
##### Fifth-level header
###### Sixth-level header
## Media and Links
### Nostr address
This should be ignored and rendered as plaintext: naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
This is also plaintext:
npub1gv069u6q7zkl393ad47xutpqmyfj0rrfrlnqnlfc2ld38k8nnl4st9wa6q
These should be turned into links:
nostr:naddr1qvzqqqr4gupzplfq3m5v3u5r0q9f255fdeyz8nyac6lagssx8zy4wugxjs8ajf7pqyghwumn8ghj7mn0wd68ytnvv9hxgtcqy4sj6ar9wd6xv6tvv5kkvmmj94kkzuntv3hhwm3dvfuj6enyxgcrset98p3nsve2v5l
nostr:npub1l5sga6xg72phsz5422ykujprejwud075ggrr3z2hwyrfgr7eylqstegx9z
nostr:nevent1qvzqqqqqqypzp382htsmu08k277ps40wqhnfm60st89h5pvjyutghq9cjasuh38qqythwumn8ghj7un9d3shjtnswf5k6ctv9ehx2ap0qqsysletg3lqnl4uy59xsj4rp9rgw67wg23l827f4uvn5ckn20fuxcq45d8pj
nostr:nprofile1qqsxhedgkuneycxpcdjlg6tgtxdy8gurdz64nq2h0flc288a0jag98qguy3nh
nostr:note1txyefcha2xt3pgungx4k6j077dsteyef6hzpyuuku00s4h0eymzq4k33yg
### Hashtag
#testhashtag at the start of the line and #inlinehashtag in the middle
### Wikilinks
[[NKBIP-01|Specification]] and [[mirepoix]]
### URL
https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html
[Welt Online link](https://www.welt.de/politik/ausland/article69a7ca00ad41f3cd65a1bc63/iran-drohte-jedes-schiff-zu-verbrennen-trump-will-oel-tanker-durch-strasse-von-hormus-eskortieren.html)
this should render as plaintext: `http://www.example.com`
this should be a hyperlink to the http URL with the same address [wss://theforest.nostr1.com](https://theforest.nostr1.com)
### Images
https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png
![test image](https://blog.ronin.cloud/content/images/size/w2000/2022/02/markdown.png)
### Media
#### YouTube
https://youtube.com/shorts/ZWfvChb-i0w
#### Spotify
https://open.spotify.com/episode/1GSZFA8vWltPyxYkArdRKx?si=bq6-az28TcuP596feTkRFQ
#### Audio
https://media.blubrry.com/takeituneasy/ins.blubrry.com/takeituneasy/lex_ai_rick_beato.mp3
#### Video
https://v.nostr.build/MTjaYib4upQuf8zn.mp4
## Tables
### Orderly
| Syntax | Description |
| ----------- | ----------- |
| Header | Title |
| Paragraph | Text |
### Unorderly
| Syntax | Description |
| --- | ----------- |
| Header | Title |
| Paragraph | Text |
## Code blocks
### json
```json
{
"id": "<event_id>",
"pubkey": "<event_originator_pubkey>",
"created_at": 1725087283,
"kind": 30040,
"tags": [
["d", "aesop's-fables-by-aesop"],
["title", "Aesop's Fables"],
["author", "Aesop"],
],
"sig": "<event_signature>"
}
```
### typescript
```typescript
/**
* Get Nostr identifier type
*/
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
if (id.startsWith('npub')) return 'npub';
if (id.startsWith('nprofile')) return 'nprofile';
if (id.startsWith('nevent')) return 'nevent';
if (id.startsWith('naddr')) return 'naddr';
if (id.startsWith('note')) return 'note';
return null;
}
```
### shell
```shell
mkdir new_directory
cp source.txt destination.txt
```
### LaTeX
```latex
$$
M =
\begin{bmatrix}
\frac{5}{6} & \frac{1}{6} & 0 \\[0.3em]
\frac{5}{6} & 0 & \frac{1}{6} \\[0.3em]
0 & \frac{5}{6} & \frac{1}{6}
\end{bmatrix}
$$
```
## LaTeX
### LaTeX in inline-code
`$[ x^n + y^n = z^n \]$` and `$[\sqrt{x^2+1}\]$` and `$\color{blue}{X \sim Normal \; (\mu,\sigma^2)}$`
## Footnotes
Here's a simple footnote,[^1] and here's a longer one.[^bignote]
[^1]: This is the first footnote.
[^bignote]: Here's one with multiple paragraphs and code.
## Anchor links
[Link to bullet list section](#bullet-list)
## Formatting
### Strikethrough
~~The world is flat.~~ We now know that the world is round.
### Bold
This is *italic* text. This is **bold** text.
### Task List
- [x] Write the press release
- [ ] Update the website
- [ ] Contact the media
### Emoji shortcodes
Gone camping! :tent: Be back soon.
That is so funny! :joy:
### Subscript and Superscript
X^2^
### Delimiter
based upon a -
---
based upon a *
***
### Quotes
> This is a single line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
> This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
> This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj
> This is a multi line blockequote sdfjsdlfkjasldkfjsdölfkjsdlfkjsadlöfkjsdlöfkjsadölfkjsdlf kjsldfkjsdalkjslkdfjlöskdfjlösdkjfsldkfjsöldkfjlösdkfjalsd kfjlsdkfjlödkfjlaksdfjlkjdfslkjalsdkfjlasdkfj alsdkjflskdfj sdfklj

18
package.json

@ -7,9 +7,6 @@ @@ -7,9 +7,6 @@
"scripts": {
"build": "tsc",
"test": "jest",
"test:asciidoc": "ts-node src/__tests__/asciidoc.test.ts",
"test:all": "npm run test && npm run test:asciidoc",
"test:report": "ts-node generate-test-report.ts",
"prepublishOnly": "npm run build"
},
"keywords": [
@ -24,18 +21,13 @@ @@ -24,18 +21,13 @@
"author": "",
"license": "MIT",
"dependencies": {
"@asciidoctor/core": "^3.0.4",
"@types/marked": "^5.0.2",
"marked": "^17.0.3",
"node-emoji": "^2.2.0"
"@asciidoctor/core": "^3.0.4"
},
"devDependencies": {
"@types/highlight.js": "^10.1.0",
"@types/jest": "^29.5.11",
"@types/node": "^20.19.35",
"@types/node": "^20.11.0",
"typescript": "^5.3.3",
"jest": "^29.7.0",
"ts-jest": "^29.4.6",
"ts-node": "^10.9.2",
"typescript": "^5.3.3"
"@types/jest": "^29.5.11",
"@types/highlight.js": "^10.1.0"
}
}

353
src/__tests__/asciidoc.test.ts

@ -1,353 +0,0 @@ @@ -1,353 +0,0 @@
import { Parser } from '../parser';
import { readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join } from 'path';
/**
* Simple test runner for AsciiDoc tests (separate from Jest due to Opal compatibility issues)
*/
async function runAsciiDocTests() {
console.log('Running AsciiDoc tests...\n');
const asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8');
const parser = new Parser({
linkBaseURL: 'https://example.com',
enableNostrAddresses: true,
wikilinkUrl: '/events?d={dtag}',
hashtagUrl: '/hashtag/{topic}'
});
let passed = 0;
let failed = 0;
const failures: string[] = [];
const testPromises: Promise<void>[] = [];
/**
 * Registers a test case and starts running it immediately.
 *
 * The outcome is recorded in the enclosing runner's `passed` / `failed`
 * counters and `failures` list, and the tracking promise is appended to
 * `testPromises` so the runner can wait for every test before it prints
 * the summary.
 *
 * @param name Label used in the console output and in failure records.
 * @param fn Test body; may be synchronous or return a promise.
 */
function test(name: string, fn: () => void | Promise<void>) {
  const testPromise = (async () => {
    try {
      const result = fn();
      if (result instanceof Promise) {
        await result;
      }
      passed++;
      console.log(`${name}`);
    } catch (error: unknown) {
      // Anything can be thrown in JavaScript; only Error instances carry a
      // `.message`, so fall back to the string form instead of reporting
      // "undefined" for non-Error throws.
      const message = error instanceof Error ? error.message : String(error);
      failed++;
      failures.push(`${name}: ${message}`);
      console.error(`${name}: ${message}`);
    }
  })();
  testPromises.push(testPromise);
}
/**
 * Minimal assertion helper with a Jest-like surface for the standalone runner.
 *
 * Every matcher returns nothing on success and throws an `Error` describing
 * the mismatch on failure. A matcher applied to a value of the wrong kind
 * (for example `toContain` on a non-string) fails instead of passing silently.
 *
 * @param actual The value under test.
 * @returns An object exposing the matchers `toBeDefined`, `toBe`, `toContain`,
 *   `toMatch`, `toHaveProperty`, `toBeGreaterThan` and `length.toBeGreaterThan`.
 */
function expect(actual: unknown) {
  return {
    /** Fails when the value is `undefined` or `null`. */
    toBeDefined: () => {
      if (actual === undefined || actual === null) {
        throw new Error(`Expected value to be defined, but got ${String(actual)}`);
      }
    },
    /** Fails unless the value is strictly equal (`===`) to `expected`. */
    toBe: (expected: unknown) => {
      if (actual !== expected) {
        throw new Error(`Expected ${String(expected)}, but got ${String(actual)}`);
      }
    },
    /** Fails unless the value is a string that includes `substring`. */
    toContain: (substring: string) => {
      // A non-string value can never contain the substring, so it is a failure.
      if (typeof actual !== 'string' || !actual.includes(substring)) {
        throw new Error(`Expected string to contain "${substring}"`);
      }
    },
    /** Fails unless the value is a string matched by `regex`. */
    toMatch: (regex: RegExp) => {
      if (typeof actual !== 'string' || !regex.test(actual)) {
        throw new Error(`Expected string to match ${regex}`);
      }
    },
    /** Fails unless the value is an object (or function) that has `prop`. */
    toHaveProperty: (prop: string) => {
      // The `in` operator throws a TypeError on primitives and null, so the
      // value is checked first and reported as an ordinary assertion failure.
      const isObjectLike =
        (typeof actual === 'object' && actual !== null) || typeof actual === 'function';
      if (!isObjectLike || !(prop in (actual as object))) {
        throw new Error(`Expected object to have property "${prop}"`);
      }
    },
    /** Fails unless the value is a number strictly greater than `value`. */
    toBeGreaterThan: (value: number) => {
      if (typeof actual !== 'number' || actual <= value) {
        throw new Error(`Expected ${String(actual)} to be greater than ${value}`);
      }
    },
    length: {
      /** Fails unless the value is an array with more than `value` elements. */
      toBeGreaterThan: (value: number) => {
        if (!Array.isArray(actual) || actual.length <= value) {
          // Never dereference `.length` on a non-array here: for null or
          // undefined that would throw a TypeError instead of this message.
          const got = Array.isArray(actual) ? String(actual.length) : 'a non-array value';
          throw new Error(`Expected array length to be greater than ${value}, but got ${got}`);
        }
      }
    }
  };
}
// Run tests
const result = await parser.process(asciidocContent);
// Write HTML output to file for inspection
const outputDir = join(__dirname, '../../test-output');
try {
mkdirSync(outputDir, { recursive: true });
} catch (e) {
// Directory might already exist
}
const htmlOutput = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="referrer" content="strict-origin-when-cross-origin">
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'unsafe-inline' 'unsafe-eval' https://www.youtube.com https://s.ytimg.com https://www.gstatic.com https://*.googlevideo.com; frame-src https://www.youtube.com https://youtube.com https://open.spotify.com https://*.googlevideo.com; style-src 'unsafe-inline'; img-src 'self' data: https:; media-src 'self' https:; connect-src https:; child-src https://www.youtube.com https://youtube.com;">
<title>AsciiDoc Test Output</title>
<style>
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; }
.hashtag { color: #1da1f2; font-weight: 500; }
.wikilink { color: #0066cc; text-decoration: underline; }
.nostr-link { color: #8b5cf6; text-decoration: underline; }
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; }
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; }
.line-through { text-decoration: line-through; }
.highlight { background-color: #ffeb3b; padding: 2px 4px; border-radius: 3px; }
.bare-image { max-width: 100%; width: auto; height: auto; margin: 10px 0; display: block; }
.bare-video, .bare-audio { width: 100%; max-width: 800px; margin: 10px 0; display: block; }
.youtube-embed, .spotify-embed { max-width: 100%; margin: 10px 0; border-radius: 8px; display: block; }
.youtube-embed { width: 100%; max-width: 640px; height: auto; aspect-ratio: 16/9; border: 0; display: block; }
.spotify-embed { width: 100%; max-width: 800px; }
/* Table styles */
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
table thead { background-color: #f2f2f2; }
table th { font-weight: bold; padding: 8px; border: 1px solid #ddd; background-color: #f2f2f2; }
table td { padding: 8px; border: 1px solid #ddd; }
/* Alignment classes - AsciiDoc uses halign-* and valign-* classes */
.halign-left { text-align: left !important; }
.halign-center { text-align: center !important; }
.halign-right { text-align: right !important; }
.valign-top { vertical-align: top !important; }
.valign-middle { vertical-align: middle !important; }
.valign-bottom { vertical-align: bottom !important; }
/* Also handle tableblock classes */
.tableblock.halign-left { text-align: left !important; }
.tableblock.halign-center { text-align: center !important; }
.tableblock.halign-right { text-align: right !important; }
.tableblock.valign-top { vertical-align: top !important; }
.tableblock.valign-middle { vertical-align: middle !important; }
.tableblock.valign-bottom { vertical-align: bottom !important; }
/* Task list styles */
.checklist { list-style: none; padding-left: 0; }
.checklist li { padding-left: 1.5em; position: relative; margin: 0.5em 0; }
.checklist li i.fa-check-square-o::before { content: "☑ "; font-style: normal; font-family: sans-serif; }
.checklist li i.fa-square-o::before { content: "☐ "; font-style: normal; font-family: sans-serif; }
.checklist li i { position: absolute; left: 0; font-style: normal; }
/* Fallback if Font Awesome doesn't load */
.checklist li i.fa-check-square-o { display: inline-block; width: 1em; }
.checklist li i.fa-check-square-o:before { content: "☑"; }
.checklist li i.fa-square-o { display: inline-block; width: 1em; }
.checklist li i.fa-square-o:before { content: "☐"; }
/* AsciiDoc specific styles */
.sect1, .sect2, .sect3, .sect4, .sect5 { margin-top: 1.5em; margin-bottom: 1em; }
.paragraph { margin: 1em 0; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
table th { background-color: #f2f2f2; }
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; }
</style>
</head>
<body>
<h1>AsciiDoc Test Document - Parsed Output</h1>
<hr>
${result.content}
<hr>
<h2>Metadata</h2>
<pre>${JSON.stringify({
hasLaTeX: result.hasLaTeX,
hasMusicalNotation: result.hasMusicalNotation,
nostrLinks: result.nostrLinks,
wikilinks: result.wikilinks,
hashtags: result.hashtags,
links: result.links,
media: result.media
}, null, 2)}</pre>
</body>
</html>`;
const outputPath = join(outputDir, 'asciidoc-output.html');
writeFileSync(outputPath, htmlOutput, 'utf-8');
console.log(`\n📄 HTML output written to: ${outputPath}\n`);
test('should parse AsciiDoc content', () => {
expect(result).toBeDefined();
expect(result.content).toBeDefined();
expect(typeof result.content).toBe('string');
expect(result.content.length).toBeGreaterThan(0);
});
test('should have HTML content', () => {
expect(result.content).toContain('<');
expect(result.content).toContain('>');
});
test('should extract table of contents', () => {
expect(result.tableOfContents).toBeDefined();
expect(typeof result.tableOfContents).toBe('string');
});
test('should detect LaTeX', () => {
expect(result.hasLaTeX).toBeDefined();
expect(typeof result.hasLaTeX).toBe('boolean');
expect(result.hasLaTeX).toBe(true);
});
test('should detect musical notation', () => {
expect(result.hasMusicalNotation).toBeDefined();
expect(typeof result.hasMusicalNotation).toBe('boolean');
expect(result.hasMusicalNotation).toBe(true);
});
test('should extract nostr links', () => {
expect(result.nostrLinks).toBeDefined();
expect(Array.isArray(result.nostrLinks)).toBe(true);
expect(result.nostrLinks.length).toBeGreaterThan(0);
const nostrLink = result.nostrLinks[0];
expect(nostrLink).toHaveProperty('type');
expect(nostrLink).toHaveProperty('id');
expect(nostrLink).toHaveProperty('text');
expect(nostrLink).toHaveProperty('bech32');
const validTypes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'];
if (!validTypes.includes(nostrLink.type)) {
throw new Error(`Invalid nostr type: ${nostrLink.type}`);
}
});
test('should extract wikilinks', () => {
expect(result.wikilinks).toBeDefined();
expect(Array.isArray(result.wikilinks)).toBe(true);
expect(result.wikilinks.length).toBeGreaterThan(0);
const wikilink = result.wikilinks[0];
expect(wikilink).toHaveProperty('dtag');
expect(wikilink).toHaveProperty('display');
expect(wikilink).toHaveProperty('original');
});
test('should extract hashtags', () => {
expect(result.hashtags).toBeDefined();
expect(Array.isArray(result.hashtags)).toBe(true);
expect(result.hashtags.length).toBeGreaterThan(0);
result.hashtags.forEach((tag: string) => {
if (tag.includes('#')) {
throw new Error(`Hashtag should not include #: ${tag}`);
}
});
});
test('should extract regular links', () => {
expect(result.links).toBeDefined();
expect(Array.isArray(result.links)).toBe(true);
if (result.links.length > 0) {
const link = result.links[0];
expect(link).toHaveProperty('url');
expect(link).toHaveProperty('text');
expect(link).toHaveProperty('isExternal');
expect(typeof link.isExternal).toBe('boolean');
}
});
test('should extract media URLs', () => {
expect(result.media).toBeDefined();
expect(Array.isArray(result.media)).toBe(true);
});
test('should process nostr: addresses in HTML', () => {
const nostrAddresses = result.nostrLinks;
expect(nostrAddresses.length).toBeGreaterThan(0);
nostrAddresses.forEach((link: any) => {
if (!result.content.includes(`data-nostr-type="${link.type}"`)) {
throw new Error(`Missing nostr type attribute for ${link.type}`);
}
if (!result.content.includes(`data-nostr-id="${link.bech32}"`)) {
throw new Error(`Missing nostr id attribute for ${link.bech32}`);
}
});
});
test('should process wikilinks in HTML', () => {
const wikilinks = result.wikilinks;
expect(wikilinks.length).toBeGreaterThan(0);
wikilinks.forEach((wikilink: any) => {
if (!result.content.includes(`class="wikilink"`)) {
throw new Error('Missing wikilink class');
}
if (!result.content.includes(`data-dtag="${wikilink.dtag}"`)) {
throw new Error(`Missing dtag attribute for ${wikilink.dtag}`);
}
});
});
test('should process hashtags in HTML', () => {
const hashtags = result.hashtags;
expect(hashtags.length).toBeGreaterThan(0);
hashtags.forEach((tag: string) => {
if (!result.content.includes(`data-topic="${tag}"`)) {
throw new Error(`Missing topic attribute for ${tag}`);
}
if (!result.content.includes('class="hashtag"')) {
throw new Error('Missing hashtag class');
}
});
});
test('should contain expected content sections', () => {
if (!/Bullet list|bullet/i.test(result.content)) {
throw new Error('Missing bullet list section');
}
if (!/Headers|header/i.test(result.content)) {
throw new Error('Missing headers section');
}
if (!/Media and Links|media|links/i.test(result.content)) {
throw new Error('Missing media and links section');
}
});
test('should return consistent structure', () => {
expect(result).toHaveProperty('content');
expect(result).toHaveProperty('tableOfContents');
expect(result).toHaveProperty('hasLaTeX');
expect(result).toHaveProperty('hasMusicalNotation');
expect(result).toHaveProperty('nostrLinks');
expect(result).toHaveProperty('wikilinks');
expect(result).toHaveProperty('hashtags');
expect(result).toHaveProperty('links');
expect(result).toHaveProperty('media');
});
// Wait for all tests to complete
await Promise.all(testPromises);
// Print summary
console.log(`\n${'='.repeat(50)}`);
console.log(`Tests passed: ${passed}`);
console.log(`Tests failed: ${failed}`);
if (failures.length > 0) {
console.log('\nFailures:');
failures.forEach(f => console.error(` - ${f}`));
process.exit(1);
} else {
console.log('\nAll tests passed!');
process.exit(0);
}
}
// Script entry point: start the run. A rejection that escapes the runner
// itself is logged and the process exits with status 1.
runAsciiDocTests().catch((runnerError) => {
  console.error('Test runner error:', runnerError);
  process.exit(1);
});

238
src/__tests__/parser.test.ts

@ -1,238 +0,0 @@ @@ -1,238 +0,0 @@
import { Parser } from '../parser';
import { readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join } from 'path';
// Jest suite for Parser.process() run against the Markdown fixture document.
// It checks the shape of the returned result (content, frontmatter, flags and
// the extracted link/hashtag/wikilink/media lists) and that the extracted
// items also show up as attributes in the generated HTML.
describe('Parser', () => {
// Fixture contents, read once in beforeAll below.
// NOTE(review): asciidocContent is loaded but no test in this suite reads it.
let asciidocContent: string;
let markdownContent: string;
beforeAll(() => {
asciidocContent = readFileSync(join(__dirname, '../../asciidoc_testdoc.adoc'), 'utf-8');
markdownContent = readFileSync(join(__dirname, '../../markdown_testdoc.md'), 'utf-8');
});
// AsciiDoc tests are run separately using a Node.js script (asciidoc.test.ts)
// due to Jest/Opal runtime compatibility issues
// Run with: npm run test:asciidoc
describe('Markdown Test Document', () => {
// Parse result shared by every test in this group; produced once in beforeAll.
let result: any;
beforeAll(async () => {
const parser = new Parser({
linkBaseURL: 'https://example.com',
enableNostrAddresses: true,
wikilinkUrl: '/events?d={dtag}',
hashtagUrl: '/hashtag/{topic}'
});
result = await parser.process(markdownContent);
// Write HTML output to file for inspection
const outputDir = join(__dirname, '../../test-output');
// NOTE(review): any mkdirSync error is swallowed here; a real failure would
// then surface at the writeFileSync call below instead.
try {
mkdirSync(outputDir, { recursive: true });
} catch (e) {
// Directory might already exist
}
// Standalone HTML page: the parsed content followed by a JSON dump of the
// extracted metadata. Both are interpolated without escaping.
const htmlOutput = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Markdown Test Output</title>
<style>
body { font-family: sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; line-height: 1.6; }
.hashtag { color: #1da1f2; font-weight: 500; }
.wikilink { color: #0066cc; text-decoration: underline; }
.nostr-link { color: #8b5cf6; text-decoration: underline; }
pre { background: #f5f5f5; padding: 10px; border-radius: 4px; overflow-x: auto; }
code { background: #f5f5f5; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; }
.bare-image, .bare-video, .bare-audio { max-width: 100%; margin: 10px 0; }
.bare-video, .bare-audio { width: 100%; max-width: 600px; }
blockquote { border-left: 4px solid #ddd; padding-left: 1em; margin: 1em 0; color: #666; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
table th, table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
table th { background-color: #f2f2f2; }
</style>
</head>
<body>
<h1>Markdown Test Document - Parsed Output</h1>
<hr>
${result.content}
<hr>
<h2>Metadata</h2>
<pre>${JSON.stringify({
frontmatter: result.frontmatter,
hasLaTeX: result.hasLaTeX,
hasMusicalNotation: result.hasMusicalNotation,
nostrLinks: result.nostrLinks,
wikilinks: result.wikilinks,
hashtags: result.hashtags,
links: result.links,
media: result.media
}, null, 2)}</pre>
</body>
</html>`;
const outputPath = join(outputDir, 'markdown-output.html');
writeFileSync(outputPath, htmlOutput, 'utf-8');
// Use console.info to ensure it shows in Jest output
console.info(`\n📄 HTML output written to: ${outputPath}\n`);
});
// content must be a non-empty string
it('should parse Markdown content', () => {
expect(result).toBeDefined();
expect(result.content).toBeDefined();
expect(typeof result.content).toBe('string');
expect(result.content.length).toBeGreaterThan(0);
});
// Only checks for angle brackets, i.e. that some markup was emitted
it('should have HTML content', () => {
expect(result.content).toContain('<');
expect(result.content).toContain('>');
});
// The expected author/summary values come from the Markdown fixture's frontmatter
it('should extract frontmatter', () => {
expect(result.frontmatter).toBeDefined();
expect(typeof result.frontmatter).toBe('object');
expect(result.frontmatter).toHaveProperty('author');
expect(result.frontmatter.author).toBe('James Smith');
expect(result.frontmatter).toHaveProperty('summary');
expect(result.frontmatter.summary).toBe('This is a summary');
});
it('should detect LaTeX', () => {
expect(result.hasLaTeX).toBeDefined();
expect(typeof result.hasLaTeX).toBe('boolean');
// The test doc has LaTeX, so it should be true
expect(result.hasLaTeX).toBe(true);
});
// Only the type of the flag is asserted, not its value
it('should detect musical notation', () => {
expect(result.hasMusicalNotation).toBeDefined();
expect(typeof result.hasMusicalNotation).toBe('boolean');
});
it('should extract nostr links', () => {
expect(result.nostrLinks).toBeDefined();
expect(Array.isArray(result.nostrLinks)).toBe(true);
expect(result.nostrLinks.length).toBeGreaterThan(0);
// Check that nostr: addresses are extracted
const nostrLink = result.nostrLinks[0];
expect(nostrLink).toHaveProperty('type');
expect(nostrLink).toHaveProperty('id');
expect(nostrLink).toHaveProperty('text');
expect(nostrLink).toHaveProperty('bech32');
expect(['npub', 'nprofile', 'nevent', 'naddr', 'note']).toContain(nostrLink.type);
});
it('should extract wikilinks', () => {
expect(result.wikilinks).toBeDefined();
expect(Array.isArray(result.wikilinks)).toBe(true);
expect(result.wikilinks.length).toBeGreaterThan(0);
// Check wikilink structure
const wikilink = result.wikilinks[0];
expect(wikilink).toHaveProperty('dtag');
expect(wikilink).toHaveProperty('display');
expect(wikilink).toHaveProperty('original');
});
it('should extract hashtags', () => {
expect(result.hashtags).toBeDefined();
expect(Array.isArray(result.hashtags)).toBe(true);
expect(result.hashtags.length).toBeGreaterThan(0);
// Hashtags should not include the # symbol
result.hashtags.forEach((tag: string) => {
expect(tag).not.toContain('#');
});
});
// The link structure is only checked when at least one link was extracted
it('should extract regular links', () => {
expect(result.links).toBeDefined();
expect(Array.isArray(result.links)).toBe(true);
if (result.links.length > 0) {
const link = result.links[0];
expect(link).toHaveProperty('url');
expect(link).toHaveProperty('text');
expect(link).toHaveProperty('isExternal');
expect(typeof link.isExternal).toBe('boolean');
}
});
// Only asserts that media is an array; it may be empty
it('should extract media URLs', () => {
expect(result.media).toBeDefined();
expect(Array.isArray(result.media)).toBe(true);
});
it('should process nostr: addresses in HTML', () => {
// Check that nostr: addresses are converted to links
const nostrAddresses = result.nostrLinks;
expect(nostrAddresses.length).toBeGreaterThan(0);
// Check that HTML contains links for nostr addresses
nostrAddresses.forEach((link: any) => {
expect(result.content).toContain(`data-nostr-type="${link.type}"`);
expect(result.content).toContain(`data-nostr-id="${link.bech32}"`);
});
});
it('should process wikilinks in HTML', () => {
// Check that wikilinks are converted to links
const wikilinks = result.wikilinks;
expect(wikilinks.length).toBeGreaterThan(0);
wikilinks.forEach((wikilink: any) => {
expect(result.content).toContain(`class="wikilink"`);
expect(result.content).toContain(`data-dtag="${wikilink.dtag}"`);
});
});
it('should process hashtags in HTML', () => {
// Check that hashtags are processed
const hashtags = result.hashtags;
expect(hashtags.length).toBeGreaterThan(0);
hashtags.forEach((tag: string) => {
expect(result.content).toContain(`data-topic="${tag}"`);
expect(result.content).toMatch(new RegExp(`class="hashtag"`));
});
});
it('should contain expected content sections', () => {
// Check for some expected content from the test doc
expect(result.content).toMatch(/Bullet list|bullet/i);
expect(result.content).toMatch(/Headers|header/i);
expect(result.content).toMatch(/Media and Links|media|links/i);
});
// NOTE(review): despite the test name, only the type is asserted; the
// string is not required to be empty.
it('should have empty table of contents for markdown', () => {
// Markdown doesn't generate TOC by default
expect(result.tableOfContents).toBeDefined();
expect(typeof result.tableOfContents).toBe('string');
});
});
// Uses a parser built with default options and checks only that every
// result field is present.
describe('Result structure validation', () => {
it('should return consistent structure for Markdown', async () => {
const parser = new Parser();
const result = await parser.process(markdownContent);
// Check all required fields
expect(result).toHaveProperty('content');
expect(result).toHaveProperty('tableOfContents');
expect(result).toHaveProperty('hasLaTeX');
expect(result).toHaveProperty('hasMusicalNotation');
expect(result).toHaveProperty('nostrLinks');
expect(result).toHaveProperty('wikilinks');
expect(result).toHaveProperty('hashtags');
expect(result).toHaveProperty('links');
expect(result).toHaveProperty('media');
});
});
});

332
src/converters/to-asciidoc.ts

@ -0,0 +1,332 @@ @@ -0,0 +1,332 @@
import { ContentFormat } from '../types';
/**
 * Options accepted by convertToAsciidoc.
 */
export interface ConvertOptions {
// nostr: addresses are rewritten unless this is explicitly set to false
enableNostrAddresses?: boolean;
}
/**
 * Unified entry point: turns content of any supported format into AsciiDoc.
 *
 * The format-specific conversion runs first. Afterwards wikilinks, nostr:
 * addresses (unless disabled) and hashtags are rewritten, for every format.
 *
 * @param content - raw source text
 * @param format - detected format of `content`; anything that is not
 *   AsciiDoc, Wikipedia or Markdown is handled as plain text
 * @param linkBaseURL - forwarded to the wikilink and nostr: address processors
 * @param options - set `enableNostrAddresses: false` to skip nostr: processing
 * @returns the resulting AsciiDoc text
 */
export function convertToAsciidoc(
  content: string,
  format: ContentFormat,
  linkBaseURL: string,
  options: ConvertOptions = {}
): string {
  let converted: string;
  if (format === ContentFormat.AsciiDoc) {
    converted = content
      // Literal "\n" sequences become real line breaks
      .replace(/\\n/g, '\n')
      // A header directly below a non-empty line gets a blank line in between
      .replace(
        /(\S[^\n]*)\n(={1,6}\s+[^\n]+)/g,
        (_whole, precedingLine, headerLine) => `${precedingLine}\n\n${headerLine}`
      );
  } else if (format === ContentFormat.Wikipedia) {
    converted = convertWikipediaToAsciidoc(content);
  } else if (format === ContentFormat.Markdown) {
    converted = convertMarkdownToAsciidoc(content);
  } else {
    converted = convertPlainTextToAsciidoc(content);
  }

  // Special elements are handled the same way for all content types
  const withWikilinks = processWikilinks(converted, linkBaseURL);
  const withNostrLinks =
    options.enableNostrAddresses === false
      ? withWikilinks
      : processNostrAddresses(withWikilinks, linkBaseURL);
  return processHashtags(withNostrLinks);
}
/**
 * Converts Wikipedia (MediaWiki) markup to AsciiDoc format.
 *
 * Handled constructs:
 * - headings: `== Heading ==` becomes `== Heading` (same number of `=`)
 * - quote markup: `'''''x'''''` -> `*_x_*`, `'''x'''` -> `*x*` (bold),
 *   `''x''` -> `_x_` (italic); single apostrophes are left untouched
 * - external links: `[URL text]` -> `link:URL[text]`, `[URL]` -> `link:URL[URL]`
 * - horizontal rules: a line of four or more `-` becomes `'''`
 *
 * `[[Page]]` / `[[Page|Display]]` links are left as they are; they already
 * have the wikilink shape that is processed after this conversion.
 *
 * @param content - MediaWiki source text
 * @returns the AsciiDoc text
 */
function convertWikipediaToAsciidoc(content: string): string {
  // Literal "\n" sequences become real line breaks
  let asciidoc = content.replace(/\\n/g, '\n');

  // Headings keep their marker count; only the trailing marker is dropped
  asciidoc = asciidoc.replace(
    /^(=+)\s+(.+?)\s+\1$/gm,
    (_match: string, equals: string, heading: string) => `${equals} ${heading.trim()}`
  );

  // Quote markup, longest marker first so that bold is not read as italic.
  // A lone apostrophe is never markup, so words like "don't" stay intact.
  asciidoc = asciidoc
    .replace(/'''''(.+?)'''''/g, '*_$1_*')
    .replace(/'''(.+?)'''/g, '*$1*')
    .replace(/''(.+?)''/g, '_$1_');

  // External links, rewritten in a single pass so that a label that is itself
  // a URL is not converted a second time
  asciidoc = asciidoc.replace(
    /\[(https?:\/\/[^\s\]]+)(?:\s+([^\]]+))?\]/g,
    (_match: string, url: string, label: string | undefined) => `link:${url}[${label ?? url}]`
  );

  // Horizontal rules. This runs last so the generated ''' is not mistaken
  // for bold markup above.
  asciidoc = asciidoc.replace(/^----+$/gm, "'''");
  return asciidoc;
}
/**
 * Converts Markdown to AsciiDoc format
 * Based on jumble's conversion patterns
 *
 * Fenced code blocks and inline code are swapped for opaque placeholders
 * before any inline rewriting happens and are put back at the very end, so
 * none of the conversions below can alter code.
 *
 * @param content - Markdown source text
 * @returns the AsciiDoc text
 */
function convertMarkdownToAsciidoc(content: string): string {
  // Literal "\n" sequences become real line breaks
  let asciidoc = content.replace(/\\n/g, '\n');

  // Placeholders use private-use code points that no conversion regex matches
  const protectedSpans: string[] = [];
  const protect = (text: string): string => {
    protectedSpans.push(text);
    return `\uE000${protectedSpans.length - 1}\uE001`;
  };

  // Convert code blocks (handle both \n and \r\n line endings)
  asciidoc = asciidoc.replace(
    /```(\w+)?\r?\n([\s\S]*?)\r?\n```/g,
    (block: string, lang: string | undefined, code: string) => {
      const converted = convertMarkdownCodeBlock(block, lang, code);
      return converted === '' ? '' : protect(converted);
    }
  );

  // Fix spacing issues
  asciidoc = asciidoc.replace(/`([^`\n]+)`\s*\(([^)]+)\)/g, '`$1` ($2)');
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`([a-zA-Z0-9])/g, '$1 `$2` $3');
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])`([^`\n]+)`\s*\(/g, '$1 `$2` (');
  asciidoc = asciidoc.replace(/\)`([^`\n]+)`([a-zA-Z0-9])/g, ') `$1` $2');
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])\)([a-zA-Z0-9])/g, '$1) $2');
  asciidoc = asciidoc.replace(/([a-zA-Z0-9])==/g, '$1 ==');

  // Inline code: apply the LaTeX-in-code rewrite, then protect the span
  asciidoc = asciidoc.replace(/`[^`\n]+`/g, (code: string) =>
    protect(code.replace(/`\$([^$]+)\$`/g, '`$\\$1\\$$`'))
  );

  // Convert footnotes. This runs before the superscript pass, which would
  // otherwise pair up the ^ of two references on the same line.
  const footnotes = new Map<string, string>();
  asciidoc = asciidoc.replace(
    /^\[\^([^\]]+)\]:\s*([\s\S]*?)(?=\n\[\^|\n---|\n##|\n###|\n####|\n#####|\n######|$)/gm,
    (_definition: string, id: string, text: string) => {
      footnotes.set(id, text.trim());
      return '';
    }
  );
  asciidoc = asciidoc.replace(/\[\^([^\]]+)\]/g, (reference: string, id: string) => {
    const text = footnotes.get(id);
    return text ? `footnote:[${text}]` : reference;
  });

  // Note: nostr: addresses are processed later in processNostrAddresses
  // Convert headers: N leading # become N = on both sides
  asciidoc = asciidoc.replace(/^(#{1,6})[ \t]+(.+)$/gm, (_line: string, hashes: string, title: string) => {
    const marker = '='.repeat(hashes.length);
    return `${marker} ${title} ${marker}`;
  });
  asciidoc = asciidoc.replace(/^==\s+(.+?)\s+==$/gm, '== $1 ==');
  // Normalise spacing of inline "== text ==" only; restricted to spaces/tabs
  // so the line breaks around real headers are preserved
  asciidoc = asciidoc.replace(/[ \t]==[ \t]+([^=\n]+?)[ \t]+==[ \t]/g, ' == $1 == ');

  // Convert emphasis
  asciidoc = convertMarkdownEmphasis(asciidoc);
  asciidoc = asciidoc.replace(/~~(.+?)~~/g, '[line-through]#$1#'); // Strikethrough
  asciidoc = asciidoc.replace(/~(.+?)~/g, '[subscript]#$1#'); // Subscript
  asciidoc = asciidoc.replace(/\^(.+?)\^/g, '[superscript]#$1#'); // Superscript

  // Convert images
  asciidoc = asciidoc.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, 'image::$2[$1,width=100%]');
  // Convert links
  asciidoc = asciidoc.replace(/\[([^\]]+)\]\(([^)]+)\)/g, 'link:$2[$1]');
  // Convert horizontal rules
  asciidoc = asciidoc.replace(/^---$/gm, '\'\'\'');
  // Convert unordered lists
  asciidoc = asciidoc.replace(/^(\s*)\*\s+(.+)$/gm, '$1* $2');
  asciidoc = asciidoc.replace(/^(\s*)-\s+(.+)$/gm, '$1* $2');
  asciidoc = asciidoc.replace(/^(\s*)\+\s+(.+)$/gm, '$1* $2');
  // Convert ordered lists
  asciidoc = asciidoc.replace(/^(\s*)\d+\.\s+(.+)$/gm, '$1. $2');
  // Convert blockquotes with attribution
  asciidoc = asciidoc.replace(/^(>\s+.+(?:\n>\s+.+)*)/gm, (block: string) => convertMarkdownBlockquote(block));
  // Convert tables
  asciidoc = convertMarkdownTables(asciidoc);

  // Put the protected code back; unknown placeholders are left as they are
  return asciidoc.replace(
    /\uE000(\d+)\uE001/g,
    (placeholder: string, index: string) => protectedSpans[Number(index)] ?? placeholder
  );
}

/**
 * Converts one fenced code block to an AsciiDoc source listing. Blocks that
 * look like prose or Markdown rather than code are returned unchanged; an
 * empty block yields an empty string.
 */
function convertMarkdownCodeBlock(block: string, lang: string | undefined, code: string): string {
  const trimmedCode = code.trim();
  if (trimmedCode.length === 0) return '';
  const hasCodePatterns = /[{}();=<>]|function|class|import|export|def |if |for |while |return |const |let |var |public |private |static |console\.log/.test(trimmedCode);
  const isLikelyText = /^[A-Za-z\s.,!?\-'"]+$/.test(trimmedCode) && trimmedCode.length > 50;
  const hasTooManySpaces = (trimmedCode.match(/\s{3,}/g) || []).length > 3;
  const hasMarkdownPatterns = /^#{1,6}\s|^\*\s|^\d+\.\s|^\>\s|^\|.*\|/.test(trimmedCode);
  if ((!hasCodePatterns && trimmedCode.length > 100) || isLikelyText || hasTooManySpaces || hasMarkdownPatterns) {
    return block;
  }
  return `[source${lang ? ',' + lang : ''}]\n----\n${trimmedCode}\n----`;
}

/**
 * Converts Markdown bold/italic to AsciiDoc in a single pass, so converted
 * bold (`*x*`) is never re-read as Markdown italic. A `*` followed by
 * whitespace (list bullet, multiplication sign) does not open emphasis.
 */
function convertMarkdownEmphasis(text: string): string {
  const italic = /\*(?!\s)([^*\n]+?)\*/g;
  return text.replace(
    /\*\*\*(.+?)\*\*\*|\*\*(.+?)\*\*|__(.+?)__|\*(?!\s)([^*\n]+?)\*/g,
    (_match: string, boldItalic?: string, starBold?: string, underscoreBold?: string, italicText?: string) => {
      if (boldItalic !== undefined) return `*_${boldItalic}_*`;
      const bold = starBold ?? underscoreBold;
      // Italic nested inside bold is converted as well
      if (bold !== undefined) return `*${bold.replace(italic, '_$1_')}*`;
      return `_${italicText}_`;
    }
  );
}

/**
 * Converts one run of `> ` lines to an AsciiDoc quote block. When the last
 * line starts with an em dash or `--` it is used as the attribution
 * (`author, source`); otherwise every line is quote content.
 */
function convertMarkdownBlockquote(block: string): string {
  const lines = block.split('\n').map(line => line.replace(/^>\s*/, ''));
  const lastLine = lines[lines.length - 1].trim();
  const hasAttribution = lines.length > 1 && (lastLine.startsWith('—') || lastLine.startsWith('--'));
  const bodyLines = hasAttribution ? lines.slice(0, -1) : lines;
  const quoteContent = bodyLines.filter(line => line.trim() !== '').join('\n').trim();
  if (!hasAttribution) {
    return `____\n${quoteContent}\n____`;
  }

  const cleanedAttribution = lastLine.replace(/^[—-]+/, '').trim();
  let author: string;
  let source: string;
  // Links were already converted to link:url[text] by the time this runs
  const linkMatch = cleanedAttribution.match(/^(.*?),?\s*link:([^[\]]+)\[([^\]]+)\]$/);
  if (linkMatch) {
    author = linkMatch[1].trim();
    source = `link:${linkMatch[2].trim()}[${linkMatch[3].trim()}]`;
  } else {
    const [first, ...rest] = cleanedAttribution.split(',').map(part => part.trim());
    author = first;
    source = rest.join(', ').trim();
  }
  const attributes = source ? `quote, ${author}, ${source}` : `quote, ${author}`;
  return `[${attributes}]\n____\n${quoteContent}\n____`;
}

/**
 * Converts Markdown pipe tables (header row, separator row, data rows) to
 * AsciiDoc tables. The column count is taken from the header row, and the
 * outer pipes of each row are dropped so they do not create empty cells.
 * Escaped pipes inside cells are not supported.
 */
function convertMarkdownTables(text: string): string {
  const isRow = (line: string): boolean => /^\s*\|.*\|\s*$/.test(line);
  const isSeparator = (line: string): boolean => /^\s*\|(\s*:?-+:?\s*\|)+\s*$/.test(line);
  const toCells = (line: string): string[] =>
    line.trim().replace(/^\|/, '').replace(/\|$/, '').split('|').map(cell => cell.trim());
  const toAsciidocRow = (line: string): string => `| ${toCells(line).join(' | ')}`;

  const lines = text.split('\n');
  const output: string[] = [];
  let index = 0;
  while (index < lines.length) {
    const line = lines[index];
    const next = lines[index + 1];
    if (next !== undefined && isRow(line) && isSeparator(next)) {
      const columnSpec = toCells(line).map(() => '1').join(',');
      output.push(`[cols="${columnSpec}",options="header"]`, '|===', toAsciidocRow(line));
      index += 2; // skip header and separator
      while (index < lines.length && isRow(lines[index])) {
        output.push(toAsciidocRow(lines[index]));
        index++;
      }
      output.push('|===');
      continue;
    }
    output.push(line);
    index++;
  }
  return output.join('\n');
}
/**
 * Converts plain text to AsciiDoc format
 * Preserves line breaks by converting single newlines to line continuations
 *
 * Runs of two or more newlines are collapsed to one paragraph break. Every
 * remaining single newline between two non-empty lines becomes an AsciiDoc
 * hard line break (` +` at the end of the line).
 *
 * @param content - plain text
 * @returns the AsciiDoc text
 */
function convertPlainTextToAsciidoc(content: string): string {
  return content
    .replace(/\r\n/g, '\n') // Normalize line endings
    .replace(/\n\n+/g, '\n\n') // Normalize multiple newlines to double
    // The following character is only looked at, not consumed, so that
    // consecutive one-character lines ("a\nb\nc") all get their break
    .replace(/([^\n])\n(?=[^\n])/g, '$1 +\n');
}
/**
 * Builds a d-tag from free text: the text is lowercased, every run of
 * characters other than a-z and 0-9 acts as a separator, and the remaining
 * pieces are joined with single hyphens (no leading or trailing hyphen).
 */
function normalizeDtag(text: string): string {
  const pieces = text.toLowerCase().split(/[^a-z0-9]+/);
  // Separators at the start or end of the text leave empty pieces behind
  return pieces.filter(piece => piece.length > 0).join('-');
}
/**
 * Replaces wikilinks with bracket-free placeholders so that AsciiDoc does
 * not interpret the brackets:
 * - `[[book::...]]` becomes `BOOKSTR:<trimmed content>`
 * - `[[target]]` / `[[target|display text]]` becomes `WIKILINK:<dtag>|<display>`,
 *   where the display text defaults to the trimmed target
 *
 * @param content - text to scan
 * @param linkBaseURL - accepted for signature compatibility; not used here
 * @returns the text with placeholders in place of wikilinks
 */
function processWikilinks(content: string, linkBaseURL: string): string {
  // Bookstr macro wikilinks go first; the standard pattern would match them too
  const withBookstr = content.replace(
    /\[\[book::([^\]]+)\]\]/g,
    (_whole, inner) => `BOOKSTR:${inner.trim()}`
  );

  return withBookstr.replace(
    /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g,
    (_whole, rawTarget, rawLabel) => {
      const target = rawTarget.trim();
      const label = rawLabel ? rawLabel.trim() : target;
      return `WIKILINK:${normalizeDtag(target)}|${label}`;
    }
  );
}
/**
 * Processes nostr: addresses
 * Converts to link:nostr:...[...] format
 * Valid bech32 prefixes: npub, nprofile, nevent, naddr, note
 *
 * Addresses that already sit directly behind `link:` (for example produced
 * from a Markdown link such as `[Alice](nostr:npub1...)`) are left alone, so
 * they are not wrapped in a second link macro.
 *
 * @param content - text to scan
 * @param linkBaseURL - accepted for signature compatibility; not used here
 * @returns the text with bare nostr: addresses turned into link macros
 */
function processNostrAddresses(content: string, linkBaseURL: string): string {
  // Match nostr: followed by valid bech32 prefix and identifier
  // Bech32 format: prefix + separator (1) + data (at least 6 chars for valid identifiers)
  const nostrPattern = /(?<!link:)nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
  return content.replace(
    nostrPattern,
    (_match: string, bech32Id: string) => `link:nostr:${bech32Id}[${bech32Id}]`
  );
}
/**
 * Processes hashtags
 * Converts to hashtag:tag[#tag] format
 *
 * The tag in the macro target is lowercased; the label keeps the original
 * spelling. A `#` is NOT treated as a hashtag inside:
 * - fenced code blocks and AsciiDoc listing blocks delimited by `----` lines
 * - inline code
 * - http(s) URLs (fragments such as `/#section`)
 * - role-formatted spans such as `[line-through]#text#`
 *
 * @param content - AsciiDoc text to scan
 * @returns the text with hashtags rewritten to the hashtag macro
 */
function processHashtags(content: string): string {
  // First alternative: spans that must be copied through untouched.
  // Second alternative: an actual hashtag. \B keeps "word#tag" from matching.
  const pattern =
    /(```[\s\S]*?```|^----[ \t]*\r?\n[\s\S]*?\r?\n----[ \t]*$|`[^`\n]+`|https?:\/\/[^\s<>"'\[\]]+|\[[^\]\n]*\]#[^#\n]+#)|\B#([a-zA-Z0-9_]+)/gm;
  return content.replace(
    pattern,
    (match: string, protectedSpan: string | undefined, hashtag: string | undefined) => {
      if (protectedSpan !== undefined || hashtag === undefined) {
        return match;
      }
      return `hashtag:${hashtag.toLowerCase()}[#${hashtag}]`;
    }
  );
}

101
src/detector.ts

@ -4,55 +4,70 @@ import { ContentFormat } from './types'; @@ -4,55 +4,70 @@ import { ContentFormat } from './types';
* Detects the content format based on content patterns
*/
export function detectFormat(content: string): ContentFormat {
if (!content || content.trim().length === 0) {
return ContentFormat.Plain;
// Check for AsciiDoc indicators
const asciidocIndicators = [
'= ', // Title
'== ', // Section
'=== ', // Subsection
'include::', // Include directive
'image::', // Image block
'[source', // Source block
'----', // Listing block
'....', // Literal block
'|===', // Table
'link:', // AsciiDoc link format
'wikilink:', // Wikilink macro
'hashtag:', // Hashtag macro
];
let asciidocScore = 0;
for (const indicator of asciidocIndicators) {
if (content.includes(indicator)) {
asciidocScore++;
}
}
const trimmed = content.trim();
// Check for Wikipedia markup indicators (== Heading == format)
const wikipediaIndicators = [
/^==+\s+.+?\s+==+$/m, // Wikipedia headings: == Heading ==
/\[\[[^\]]+\]\]/, // Wikipedia links: [[Page]]
/''[^']+''/, // Wikipedia bold: ''text''
/'[^']+'/, // Wikipedia italic: 'text'
];
// Check for AsciiDoc indicators
// - Document title: = Title
// - Section headers: ==, ===, etc.
// - AsciiDoc attributes: :attribute: value
// - AsciiDoc blocks: [source,lang], [abc], [plantuml]
// - AsciiDoc macros: image::, video::, audio::, link:
if (
/^=+\s+/.test(trimmed) ||
/^:[\w-]+:/.test(trimmed) ||
/\[source,[\w-]+\]/.test(content) ||
/\[abc\]/.test(content) ||
/\[plantuml\]/.test(content) ||
/image::/.test(content) ||
/video::/.test(content) ||
/audio::/.test(content) ||
/link:/.test(content) ||
/\[cols=/.test(content) ||
/\|\|===/.test(content) ||
/footnote:\[/.test(content) ||
/\[highlight\]/.test(content) ||
/\[line-through\]/.test(content) ||
/\[quote\]/.test(content)
) {
return ContentFormat.AsciiDoc;
let wikipediaScore = 0;
for (const indicator of wikipediaIndicators) {
if (indicator.test(content)) {
wikipediaScore++;
}
}
// Check for Markdown indicators
// - YAML frontmatter: --- at start
// - Markdown headers: #, ##, etc.
// - Markdown code blocks: ```lang
// - Markdown links: [text](url)
// - Markdown images: ![alt](url)
if (
/^---\s*$/.test(trimmed.split('\n')[0]) ||
/^#{1,6}\s+/.test(trimmed) ||
/^```[\w-]*/.test(trimmed) ||
/\[.*?\]\(.*?\)/.test(content) ||
/!\[.*?\]\(.*?\)/.test(content) ||
/^\|\s*\|/.test(trimmed) ||
/^>\s+/.test(trimmed)
) {
// Check for Markdown indicators (more specific patterns to avoid false positives)
const markdownIndicators = [
/^#{1,6}\s+/m, // Heading at start of line
/```[\s\S]*?```/, // Code block
/\*\*[^*]+\*\*/, // Bold text
/^[-*+]\s+/m, // List item at start of line
/!\[[^\]]*\]\([^)]+\)/, // Image syntax
/\[[^\]]+\]\([^)]+\)/, // Link syntax
];
let markdownScore = 0;
for (const indicator of markdownIndicators) {
if (indicator.test(content)) {
markdownScore++;
}
}
// Determine format based on scores
// Wikipedia format takes precedence if detected (it's more specific)
if (wikipediaScore > 0 && wikipediaScore >= 2) {
return ContentFormat.Wikipedia;
} else if (asciidocScore > markdownScore && asciidocScore >= 2) {
return ContentFormat.AsciiDoc;
} else if (markdownScore > 0) {
return ContentFormat.Markdown;
}
return ContentFormat.Unknown;
return ContentFormat.Plain;
}

274
src/extractors/metadata.ts

@ -0,0 +1,274 @@ @@ -0,0 +1,274 @@
import { NostrLink, Wikilink } from '../types';
/**
 * Metadata collected from the raw content by extractMetadata.
 */
export interface ExtractedMetadata {
// nostr: addresses found in the content, one entry per distinct identifier
nostrLinks: NostrLink[];
// [[target]] / [[target|display]] links, one entry per distinct dtag/display pair
wikilinks: Wikilink[];
// Lowercased hashtag names without the leading '#', in order of first occurrence
hashtags: string[];
// Markdown links, AsciiDoc link: macros and raw http(s) URLs, one entry per distinct URL
links: Array<{ url: string; text: string; isExternal: boolean }>;
// URLs recognised as image or video by their file extension
media: string[];
}
/**
 * Runs every metadata extractor over the unprocessed content.
 *
 * @param content - raw source text
 * @param linkBaseURL - base URL used to decide whether a link is external
 * @returns nostr links, wikilinks, hashtags, regular links and media URLs
 */
export function extractMetadata(content: string, linkBaseURL: string): ExtractedMetadata {
  const nostrLinks = extractNostrLinks(content);
  const wikilinks = extractWikilinks(content);
  const hashtags = extractHashtags(content);
  const links = extractLinks(content, linkBaseURL);
  const media = extractMedia(content);
  return { nostrLinks, wikilinks, hashtags, links, media };
}
/**
 * Collects `nostr:`-prefixed bech32 addresses, one entry per distinct
 * identifier in order of first appearance. Matches whose identifier type is
 * not recognised by getNostrType are skipped.
 */
function extractNostrLinks(content: string): NostrLink[] {
  const addressPattern = /nostr:((?:npub|nprofile|nevent|naddr|note)1[a-z0-9]{6,})/gi;
  const collected: NostrLink[] = [];
  const knownIds = new Set<string>();

  for (const fullMatch of content.match(addressPattern) || []) {
    // Everything after the 6-character 'nostr:' prefix is the identifier
    const bech32 = fullMatch.substring(6);
    const type = getNostrType(bech32);
    if (!type || knownIds.has(bech32)) {
      continue;
    }
    knownIds.add(bech32);
    collected.push({ type, id: bech32, text: fullMatch, bech32 });
  }
  return collected;
}
/**
 * Collects `[[target]]` and `[[target|display]]` wikilinks. Each distinct
 * combination of normalised d-tag and display text is reported once, in
 * order of first appearance.
 */
function extractWikilinks(content: string): Wikilink[] {
  const pattern = /\[\[([^|\]]+)(?:\|([^\]]+))?\]\]/g;
  const found: Wikilink[] = [];
  const knownKeys = new Set<string>();

  for (let hit = pattern.exec(content); hit !== null; hit = pattern.exec(content)) {
    const target = hit[1].trim();
    // Without an explicit display part the target doubles as display text
    const display = hit[2] ? hit[2].trim() : target;
    const dtag = normalizeDtag(target);
    const key = `${dtag}|${display}`;
    if (knownKeys.has(key)) {
      continue;
    }
    knownKeys.add(key);
    found.push({ dtag, display, original: hit[0] });
  }
  return found;
}
/**
 * Collects hashtags as lowercase names without the leading `#`, each one
 * once and in order of first appearance. Fenced code blocks, inline code and
 * http(s) URLs are removed from the text before it is scanned.
 */
function extractHashtags(content: string): string[] {
  // Removal order matters: code blocks, then inline code, then URLs
  const ignoredSpans = [/```[\s\S]*?```/g, /`[^`]+`/g, /https?:\/\/[^\s<>"']+/g];
  const searchable = ignoredSpans.reduce((text, span) => text.replace(span, ''), content);

  // A Set keeps insertion order, which gives first-occurrence ordering
  const uniqueTags = new Set<string>();
  const tagPattern = /\B#([a-zA-Z0-9_]+)/g;
  for (let hit = tagPattern.exec(searchable); hit !== null; hit = tagPattern.exec(searchable)) {
    uniqueTags.add(hit[1].toLowerCase());
  }
  return Array.from(uniqueTags);
}
/**
 * Extract regular links from content
 *
 * Three sources are scanned, in this order: Markdown links `[text](url)`,
 * AsciiDoc links `link:url[text]`, and raw http(s) URLs. Each URL is
 * reported once; nostr URLs are skipped.
 *
 * Markdown and AsciiDoc links are blanked out before the raw-URL scan, so a
 * URL that is part of such a link is not picked up again with a trailing
 * `)` or `[text]` attached. Raw URLs stop at square brackets and lose
 * trailing sentence punctuation (and an unbalanced closing parenthesis).
 *
 * @param content - raw source text
 * @param linkBaseURL - base URL used to decide whether a link is external
 * @returns the distinct links in order of discovery
 */
function extractLinks(content: string, linkBaseURL: string): Array<{ url: string; text: string; isExternal: boolean }> {
  const links: Array<{ url: string; text: string; isExternal: boolean }> = [];
  const seen = new Set<string>();
  const addLink = (url: string, text: string): void => {
    if (seen.has(url) || isNostrUrl(url)) {
      return;
    }
    seen.add(url);
    links.push({
      url,
      text,
      isExternal: isExternalUrl(url, linkBaseURL),
    });
  };

  // Extract markdown links: [text](url); each match is recorded and blanked
  let remainder = content.replace(
    /\[([^\]]+)\]\(([^)]+)\)/g,
    (_match: string, text: string, url: string) => {
      addLink(url, text);
      return ' ';
    }
  );

  // Extract asciidoc links: link:url[text]; the target may not contain whitespace
  remainder = remainder.replace(
    /link:([^\[\s]+)\[([^\]]+)\]/g,
    (_match: string, url: string, text: string) => {
      addLink(url, text);
      return ' ';
    }
  );

  // Extract raw URLs from whatever is left
  const rawUrls = remainder.match(/https?:\/\/[^\s<>"'\[\]]+/g) || [];
  rawUrls.forEach(rawUrl => {
    // A ')' is only kept when the URL itself contains a '(' (e.g. wiki page names)
    const trailing = rawUrl.includes('(') ? /[.,;:!?]+$/ : /[.,;:!?)]+$/;
    const url = rawUrl.replace(trailing, '');
    if (url) {
      addLink(url, url);
    }
  });
  return links;
}
/**
 * Collects image and video URLs. Candidates come from Markdown images
 * `![alt](url)`, AsciiDoc images `image::url[alt]` and raw http(s) URLs, in
 * that order. Only URLs accepted by isImageUrl or isVideoUrl are kept, each
 * one once.
 */
function extractMedia(content: string): string[] {
  const candidates: string[] = [];

  // Patterns whose first capture group is the media URL
  const capturingPatterns = [/!\[[^\]]*\]\(([^)]+)\)/g, /image::([^\[]+)\[/g];
  for (const pattern of capturingPatterns) {
    for (let hit = pattern.exec(content); hit !== null; hit = pattern.exec(content)) {
      candidates.push(hit[1]);
    }
  }

  // Raw image/video URLs
  for (const rawUrl of content.match(/https?:\/\/[^\s<>"']+/g) || []) {
    candidates.push(rawUrl);
  }

  // A Set keeps insertion order, which gives first-occurrence ordering
  const uniqueMedia = new Set<string>();
  for (const url of candidates) {
    if (url && (isImageUrl(url) || isVideoUrl(url))) {
      uniqueMedia.add(url);
    }
  }
  return Array.from(uniqueMedia);
}
/**
 * Get Nostr identifier type
 *
 * An identifier is recognised by its bech32 human-readable prefix followed
 * by the bech32 separator `1` (for example `npub1...`). Requiring the
 * separator keeps ordinary words and paths that merely start with a prefix
 * (such as `notebook.html`) from being classified as Nostr identifiers.
 *
 * @param id - candidate identifier, without the `nostr:` scheme
 * @returns the identifier type, or null when `id` is not a Nostr identifier
 */
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
  const prefixes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'] as const;
  for (const prefix of prefixes) {
    if (id.startsWith(`${prefix}1`)) {
      return prefix;
    }
  }
  return null;
}
/**
 * Normalises text to d-tag form: lowercase, with each run of characters
 * outside a-z / 0-9 replaced by one hyphen, and without hyphens at either end.
 */
function normalizeDtag(text: string): string {
  const lowered = text.toLowerCase();
  const hyphenated = lowered.replace(/[^a-z0-9]+/g, '-');
  const withoutEdgeHyphens = hyphenated.replace(/^-+|-+$/g, '');
  return withoutEdgeHyphens;
}
/**
 * Check if URL is external
 *
 * - Relative references (paths, `#fragment`, `?query`) point at the current
 *   site and are never external.
 * - URLs with a scheme other than http(s) (`mailto:`, `nostr:`, ...) are
 *   external.
 * - http(s) and protocol-relative URLs are external unless their host equals
 *   the host of `linkBaseURL`; hosts are compared case-insensitively. When
 *   `linkBaseURL` is empty or has no recognisable host, they are external.
 *
 * Hosts are read with a simple pattern instead of the URL class, so this
 * works the same in any runtime.
 *
 * @param url - link target to classify
 * @param linkBaseURL - base URL of the current site; may be empty
 * @returns true when the link leaves the current site
 */
function isExternalUrl(url: string, linkBaseURL: string): boolean {
  // Host part of an http(s) or protocol-relative URL, lowercased
  const hostOf = (value: string): string | undefined =>
    value.match(/^(?:https?:)?\/\/([^\/?#]+)/i)?.[1].toLowerCase();

  const urlHost = hostOf(url);
  if (urlHost === undefined) {
    // No http(s) host: a scheme means "elsewhere", no scheme means relative
    return /^[a-z][a-z0-9+.-]*:/i.test(url);
  }
  if (!linkBaseURL) return true;
  const baseHost = hostOf(linkBaseURL);
  return baseHost === undefined || urlHost !== baseHost;
}
/**
 * Tells whether a link target is a Nostr reference: either it carries the
 * `nostr:` scheme, or getNostrType recognises it as a bare identifier.
 */
function isNostrUrl(url: string): boolean {
  if (url.startsWith('nostr:')) {
    return true;
  }
  return getNostrType(url) !== null;
}
/**
 * Check if URL is an image
 *
 * The decision is based on the file extension (jpeg, jpg, png, gif, webp,
 * svg; case-insensitive). The extension must end the URL or be followed only
 * by a query string or fragment, so `photo.jpg?w=100` counts as an image.
 * The query/fragment may not contain whitespace, parentheses or square
 * brackets, which keeps URLs captured together with surrounding markup
 * (e.g. a trailing `)`) from matching.
 */
function isImageUrl(url: string): boolean {
  return /\.(jpeg|jpg|png|gif|webp|svg)(?:[?#][^\s()\[\]]*)?$/i.test(url);
}
/**
 * Check if URL is a video
 *
 * The decision is based on the file extension (mp4, webm, ogg;
 * case-insensitive). The extension must end the URL or be followed only by a
 * query string or fragment, so `clip.mp4?t=10` counts as a video. The
 * query/fragment may not contain whitespace, parentheses or square brackets,
 * which keeps URLs captured together with surrounding markup (e.g. a
 * trailing `)`) from matching.
 */
function isVideoUrl(url: string): boolean {
  return /\.(mp4|webm|ogg)(?:[?#][^\s()\[\]]*)?$/i.test(url);
}

220
src/parser.ts

@ -1,127 +1,89 @@ @@ -1,127 +1,89 @@
import { ParserOptions, ProcessResult, ContentFormat, Wikilink } from './types';
import { ParserOptions, ProcessResult, ContentFormat } from './types';
import { detectFormat } from './detector';
import { processAsciiDoc } from './processors/asciidoc';
import { processMarkdown } from './processors/markdown';
import { postProcess } from './post-processor';
import { preProcessAsciiDoc, restorePlaceholders } from './pre-processor';
import { convertToAsciidoc } from './converters/to-asciidoc';
import { processAsciidoc } from './processors/asciidoc';
import { extractMetadata } from './extractors/metadata';
/**
* Default parser options
*/
export function defaultOptions(): ParserOptions {
return {
linkBaseURL: undefined,
linkBaseURL: '',
enableAsciiDoc: true,
enableMarkdown: true,
enableCodeHighlighting: true,
enableLaTeX: true,
enableMusicalNotation: true,
enableNostrAddresses: true,
wikilinkUrl: undefined,
hashtagUrl: undefined
};
}
/**
* Main parser for Nostr event content
* Handles multiple content formats: AsciiDoc, Markdown
* Post-processes wikilinks, hashtags, and nostr: addresses
* Handles multiple content formats: AsciiDoc, Markdown, code syntax,
* LaTeX, musical notation, and nostr: prefixed addresses
*
* Everything is converted to AsciiDoc first, then processed through AsciiDoctor
*/
export class Parser {
private options: ParserOptions;
constructor(options?: ParserOptions) {
this.options = { ...defaultOptions(), ...options };
private options: Required<ParserOptions>;
constructor(options: ParserOptions = {}) {
const defaults = defaultOptions();
this.options = {
linkBaseURL: options.linkBaseURL ?? defaults.linkBaseURL ?? '',
enableAsciiDoc: options.enableAsciiDoc ?? defaults.enableAsciiDoc ?? true,
enableMarkdown: options.enableMarkdown ?? defaults.enableMarkdown ?? true,
enableCodeHighlighting: options.enableCodeHighlighting ?? defaults.enableCodeHighlighting ?? true,
enableLaTeX: options.enableLaTeX ?? defaults.enableLaTeX ?? true,
enableMusicalNotation: options.enableMusicalNotation ?? defaults.enableMusicalNotation ?? true,
enableNostrAddresses: options.enableNostrAddresses ?? defaults.enableNostrAddresses ?? true,
};
}
/**
 * Process Nostr event content and return HTML.
 *
 * Detects the content format with `detectFormat` and renders it through the
 * matching pipeline: AsciiDoc (placeholder pre-processing, `processAsciiDoc`,
 * placeholder restoration), Markdown (`processMarkdown`), or an escaped `<p>`
 * wrapper for anything else. The HTML is then passed through `postProcess`
 * and scanned with `extractLinks` / `extractMedia`.
 *
 * NOTE(review): the body also computes `metadata`, `asciidocContent` and an
 * awaited AsciiDoctor `result`, none of which reach the returned object. This
 * looks like two generations of the method interleaved - confirm which
 * pipeline is the intended one before relying on either.
 *
 * @param content - Raw event content; empty or whitespace-only input yields `emptyResult()`.
 * @returns The rendered HTML together with the data collected while rendering.
 */
async process(content: string): Promise<ProcessResult> {
if (!content || content.trim().length === 0) {
return this.emptyResult();
}
// Extract metadata from original content (before conversion)
// NOTE(review): `metadata` is not read again inside this method.
const metadata = extractMetadata(content, this.options.linkBaseURL);
// Detect content format
const format = detectFormat(content);
// Process based on format
let html: string;
let tableOfContents = '';
let hasLaTeX = false;
let hasMusicalNotation = false;
let frontmatter: Record<string, any> | undefined;
let preProcessWikilinks: Wikilink[] = [];
let preProcessHashtags: string[] = [];
if (format === ContentFormat.AsciiDoc && this.options.enableAsciiDoc !== false) {
// Pre-process to handle wikilinks and hashtags before AsciiDoc conversion
const preProcessResult = preProcessAsciiDoc(content, this.options);
preProcessWikilinks = preProcessResult.wikilinks;
preProcessHashtags = preProcessResult.hashtags;
const result = processAsciiDoc(preProcessResult.content, this.options);
// Restore wikilinks and hashtags from placeholders
html = restorePlaceholders(result.html, preProcessResult.wikilinks, preProcessResult.hashtags, this.options);
tableOfContents = result.tableOfContents;
hasLaTeX = result.hasLaTeX;
hasMusicalNotation = result.hasMusicalNotation;
} else if (format === ContentFormat.Markdown && this.options.enableMarkdown !== false) {
const result = processMarkdown(content, this.options);
html = result.html;
frontmatter = result.frontmatter;
hasLaTeX = result.hasLaTeX;
hasMusicalNotation = result.hasMusicalNotation;
} else {
// Plain text or unknown format - just escape and wrap
html = `<p>${escapeHtml(content)}</p>`;
}
// Post-process for nostr: addresses and handle any remaining processing
// Note: wikilinks and hashtags are already processed for AsciiDoc
// The third argument is true whenever the detected format is AsciiDoc, even
// if the AsciiDoc branch above was skipped because enableAsciiDoc is false.
const postProcessResult = postProcess(html, this.options, format === ContentFormat.AsciiDoc);
// Extract additional metadata
const links = extractLinks(postProcessResult.html);
const media = extractMedia(postProcessResult.html);
// Choose between pre-processed and post-processed wikilinks/hashtags:
// the pre-processed list wins whenever it is non-empty (the lists are not merged).
const allWikilinks = preProcessWikilinks.length > 0
? preProcessWikilinks
: postProcessResult.wikilinks;
const allHashtags = preProcessHashtags.length > 0
? preProcessHashtags
: postProcessResult.hashtags;
// Convert everything to AsciiDoc format first
// NOTE(review): this conversion and the AsciiDoctor run below happen after the
// HTML has already been produced, and their output is not returned - confirm.
const asciidocContent = convertToAsciidoc(
content,
format,
this.options.linkBaseURL,
{
enableNostrAddresses: this.options.enableNostrAddresses,
}
);
// Process through AsciiDoctor
const result = await processAsciidoc(
asciidocContent,
{
enableCodeHighlighting: this.options.enableCodeHighlighting,
enableLaTeX: this.options.enableLaTeX,
enableMusicalNotation: this.options.enableMusicalNotation,
originalContent: content, // Pass original for LaTeX detection
linkBaseURL: this.options.linkBaseURL, // Pass linkBaseURL for link processing
}
);
// Assemble the result from the post-processed HTML and the values gathered above
return {
content: postProcessResult.html,
tableOfContents,
hasLaTeX,
hasMusicalNotation,
frontmatter,
nostrLinks: postProcessResult.nostrLinks,
wikilinks: allWikilinks,
hashtags: allHashtags,
links,
media
};
}
/**
 * Build the result returned for empty or whitespace-only content: empty
 * strings, `false` flags and empty arrays.
 *
 * NOTE(review): after `media: []` the literal continues with a `...result`
 * spread (no separating comma) and with `metadata.*` properties, although
 * neither `result` nor `metadata` is declared in this method. These trailing
 * lines look like residue from a different return statement - confirm and
 * remove before use.
 */
private emptyResult(): ProcessResult {
return {
content: '',
tableOfContents: '',
hasLaTeX: false,
hasMusicalNotation: false,
nostrLinks: [],
wikilinks: [],
hashtags: [],
links: [],
media: []
...result,
nostrLinks: metadata.nostrLinks,
wikilinks: metadata.wikilinks,
hashtags: metadata.hashtags,
links: metadata.links,
media: metadata.media,
};
}
}
@ -133,75 +95,3 @@ export async function process(content: string, options?: ParserOptions): Promise @@ -133,75 +95,3 @@ export async function process(content: string, options?: ParserOptions): Promise
const parser = new Parser(options);
return parser.process(content);
}
/**
 * Extract regular links from HTML.
 *
 * Scans for `<a href="...">text</a>` elements whose content is plain text and
 * returns one entry per link. Anchors that represent nostr links, wikilinks
 * or hashtags are skipped because they are reported separately.
 *
 * Those special anchors are recognised by the markers on the opening tag
 * (`class="nostr-link"`, `class="wikilink"`, `class="hashtag"` or a
 * `data-topic` attribute), so they are skipped regardless of which URL
 * template produced their `href`. The default URL shapes (`#nostr-...`,
 * `.../events?d=...`) are still recognised as a fallback.
 *
 * @param html - Rendered HTML to scan.
 * @returns Links in document order; `text` falls back to the URL when the
 *          anchor is empty, `isExternal` is true for `http://`, `https://`
 *          and protocol-relative (`//`) URLs.
 */
function extractLinks(html: string): Array<{ url: string; text: string; isExternal: boolean }> {
  const links: Array<{ url: string; text: string; isExternal: boolean }> = [];
  const linkRegex = /<a[^>]+href=["']([^"']+)["'][^>]*>([^<]*)<\/a>/gi;
  const specialClasses = new Set(['nostr-link', 'wikilink', 'hashtag']);

  let match: RegExpExecArray | null;
  while ((match = linkRegex.exec(html)) !== null) {
    const anchor = match[0];
    const url = match[1];
    const text = match[2] || url;

    // Only the opening tag carries the class / data-* markers.
    const openTag = anchor.slice(0, anchor.indexOf('>') + 1);
    const classAttr = /\sclass=["']([^"']*)["']/i.exec(openTag);
    const classNames = classAttr ? classAttr[1].split(/\s+/) : [];
    const isSpecialAnchor =
      classNames.some((name) => specialClasses.has(name)) ||
      /\sdata-topic=/i.test(openTag);

    // Skip nostr links, wikilinks, and hashtags (already extracted)
    if (isSpecialAnchor || url.includes('nostr-') || url.includes('events?d=')) {
      continue;
    }

    const isExternal = url.startsWith('http://') || url.startsWith('https://') || url.startsWith('//');
    links.push({ url, text, isExternal });
  }
  return links;
}
/**
 * Collect media URLs from HTML.
 *
 * Returns the `src` value of every `<img>`, then every `<video>`, then every
 * `<audio>`, then every `<source>` element, each group in document order.
 */
function extractMedia(html: string): string[] {
  // One pattern per element kind; the array order fixes the order of the result.
  const patterns: RegExp[] = [
    /<img[^>]+src=["']([^"']+)["'][^>]*>/gi,
    /<video[^>]+src=["']([^"']+)["'][^>]*>/gi,
    /<audio[^>]+src=["']([^"']+)["'][^>]*>/gi,
    /<source[^>]+src=["']([^"']+)["'][^>]*>/gi,
  ];

  const found: string[] = [];
  for (const pattern of patterns) {
    for (let hit = pattern.exec(html); hit !== null; hit = pattern.exec(html)) {
      found.push(hit[1]);
    }
  }
  return found;
}
/**
 * Replace the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`)
 * with their entity equivalents; every other character is left untouched.
 */
function escapeHtml(text: string): string {
  return text.replace(/[&<>"']/g, (ch) => {
    switch (ch) {
      case '&':
        return '&amp;';
      case '<':
        return '&lt;';
      case '>':
        return '&gt;';
      case '"':
        return '&quot;';
      default:
        // The character class leaves only the single quote here.
        return '&#039;';
    }
  });
}

562
src/post-processor.ts

@ -1,562 +0,0 @@ @@ -1,562 +0,0 @@
import { ParserOptions, NostrLink, Wikilink } from './types';
/**
 * Result of post-processing rendered HTML (see `postProcess`).
 */
export interface PostProcessResult {
// HTML after all post-processing rewrites have been applied.
html: string;
// One entry per `nostr:` address that was turned into a link.
nostrLinks: NostrLink[];
// Wikilinks (`[[dtag]]` / `[[dtag|display]]`) found in, or recovered from, the HTML.
wikilinks: Wikilink[];
// Hashtag topics without the leading `#`, each listed once.
hashtags: string[];
}
/**
 * Post-process rendered HTML.
 *
 * Steps, in this order (the order matters - later steps rely on earlier ones):
 *  1. `nostr:` addresses become `<a class="nostr-link">` (unless
 *     `options.enableNostrAddresses === false`).
 *  2. Wikilinks (`[[dtag]]` / `[[dtag|display]]`) and `#hashtags` are linked,
 *     or - when `skipWikilinksAndHashtags` is true - only collected from the
 *     anchors/spans that already exist in the HTML.
 *  3. `<a>` tags inside `<pre>`/`<code>` are unwrapped to plain text.
 *  4. YouTube and Spotify URLs (video tags, links, iframes, bare URLs) become
 *     embed iframes.
 *  5. Bare image / video / audio URLs become media elements.
 *  6. Table cells with `align` / `text-align` get `halign-*` CSS classes.
 *
 * Positions inside `<pre>`/`<code>` are skipped by the scanning steps. The
 * code-block ranges are recomputed before every scan because each step
 * changes the length of the string.
 *
 * @param html - Rendered HTML.
 * @param options - Parser options (`enableNostrAddresses`, `linkBaseURL`,
 *        `wikilinkUrl`, `hashtagUrl` are read here).
 * @param skipWikilinksAndHashtags - If true, skip processing wikilinks and
 *        hashtags (already processed) and only extract them from the markup.
 * @returns The rewritten HTML plus the nostr links, wikilinks and hashtag
 *          topics that were found.
 */
export function postProcess(html: string, options: ParserOptions, skipWikilinksAndHashtags: boolean = false): PostProcessResult {
  type Replacement = { match: string; replacement: string; index: number };

  let processed = html;
  const nostrLinks: NostrLink[] = [];
  const wikilinks: Wikilink[] = [];
  const hashtags: string[] = [];
  let match: RegExpExecArray | null;

  // Ranges of <pre>/<code> elements in the CURRENT value of `processed`.
  let codeBlockMarkers: Array<{ start: number; end: number }> = [];

  // Recompute the code-block ranges; must be called before each scan because
  // every step shifts the positions of everything behind its replacements.
  function refreshCodeBlockMarkers(): void {
    codeBlockMarkers = [];
    const codeBlockRegex = /<(pre|code)[^>]*>[\s\S]*?<\/\1>/gi;
    let block: RegExpExecArray | null;
    while ((block = codeBlockRegex.exec(processed)) !== null) {
      codeBlockMarkers.push({ start: block.index, end: block.index + block[0].length });
    }
  }

  function isInCodeBlock(index: number): boolean {
    return codeBlockMarkers.some(marker => index >= marker.start && index < marker.end);
  }

  // Apply replacements right-to-left so earlier indices stay valid. The list is
  // ordered by position first and overlapping entries are dropped (first one
  // collected wins), so several scans may safely share one list.
  function applyReplacements(replacements: Replacement[]): void {
    const accepted: Replacement[] = [];
    let lastEnd = 0;
    for (const item of [...replacements].sort((a, b) => a.index - b.index)) {
      if (item.index < lastEnd) continue;
      accepted.push(item);
      lastEnd = item.index + item.match.length;
    }
    for (let i = accepted.length - 1; i >= 0; i--) {
      const item = accepted[i];
      processed = processed.substring(0, item.index) + item.replacement + processed.substring(item.index + item.match.length);
    }
  }

  // Standard YouTube embed used for video tags, links and repaired iframes.
  function youtubeIframe(videoId: string): string {
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    return `<iframe class="youtube-embed" frameborder="0" allow="encrypted-media; fullscreen; picture-in-picture; web-share" referrerpolicy='strict-origin-when-cross-origin' width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`;
  }

  function spotifyIframe(type: string, id: string): string {
    const embedUrl = `https://open.spotify.com/embed/${type}/${id}`;
    return `<iframe src="${escapeHtml(embedUrl)}" width="100%" height="352" frameborder="0" allowtransparency="true" allow="encrypted-media" class="spotify-embed"></iframe>`;
  }

  // Process nostr: addresses (but not in code blocks)
  if (options.enableNostrAddresses !== false) {
    refreshCodeBlockMarkers();
    const nostrRegex = /nostr:([np][a-z0-9]+1[a-z0-9]+)/gi;
    const replacements: Replacement[] = [];
    while ((match = nostrRegex.exec(processed)) !== null) {
      if (isInCodeBlock(match.index)) continue;
      const bech32 = match[1];
      const type = getNostrType(bech32);
      if (!type) continue;
      const link: NostrLink = {
        type,
        id: bech32,
        text: match[0],
        bech32: bech32
      };
      nostrLinks.push(link);
      const url = options.linkBaseURL
        ? `${options.linkBaseURL}/nostr/${bech32}`
        : `#nostr-${bech32}`;
      replacements.push({
        match: match[0],
        replacement: `<a href="${escapeHtml(url)}" class="nostr-link" data-nostr-type="${type}" data-nostr-id="${escapeHtml(bech32)}">${escapeHtml(match[0])}</a>`,
        index: match.index
      });
    }
    applyReplacements(replacements);
  }

  // Process wikilinks: [[dtag]] or [[dtag|display]] (but not in code blocks)
  // Skip if already processed (for AsciiDoc)
  if (!skipWikilinksAndHashtags) {
    refreshCodeBlockMarkers();
    const wikilinkRegex = /\[\[([^\]]+)\]\]/g;
    const wikilinkReplacements: Replacement[] = [];
    while ((match = wikilinkRegex.exec(processed)) !== null) {
      if (isInCodeBlock(match.index)) continue;
      // Skip if already inside a link tag
      const beforeMatch = processed.substring(0, match.index);
      const lastOpenTag = beforeMatch.lastIndexOf('<a');
      const lastCloseTag = beforeMatch.lastIndexOf('</a>');
      if (lastOpenTag > lastCloseTag) continue; // Inside a link
      const content = match[1];
      const parts = content.split('|');
      const dtag = parts[0].trim();
      const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag;
      const wikilink: Wikilink = {
        dtag,
        display,
        original: match[0]
      };
      wikilinks.push(wikilink);
      let url: string;
      if (typeof options.wikilinkUrl === 'function') {
        url = options.wikilinkUrl(dtag);
      } else if (typeof options.wikilinkUrl === 'string') {
        url = options.wikilinkUrl.replace('{dtag}', encodeURIComponent(dtag));
      } else {
        url = options.linkBaseURL
          ? `${options.linkBaseURL}/events?d=${encodeURIComponent(dtag)}`
          : `#${encodeURIComponent(dtag)}`;
      }
      wikilinkReplacements.push({
        match: match[0],
        replacement: `<a href="${escapeHtml(url)}" class="wikilink" data-dtag="${escapeHtml(dtag)}">${escapeHtml(display)}</a>`,
        index: match.index
      });
    }
    applyReplacements(wikilinkReplacements);

    // Process hashtags: #hashtag (but not in code blocks or inside HTML tags)
    refreshCodeBlockMarkers();
    const hashtagRegex = /(#[\w-]+)/g;
    const hashtagReplacements: Replacement[] = [];
    while ((match = hashtagRegex.exec(processed)) !== null) {
      if (isInCodeBlock(match.index)) continue;
      // Check if we're inside an HTML tag
      const beforeMatch = processed.substring(0, match.index);
      const lastOpenTag = beforeMatch.lastIndexOf('<');
      const lastCloseTag = beforeMatch.lastIndexOf('>');
      if (lastOpenTag > lastCloseTag) continue; // Inside a tag
      // Skip if already inside a link or span
      const lastLinkOpen = beforeMatch.lastIndexOf('<a');
      const lastLinkClose = beforeMatch.lastIndexOf('</a>');
      const lastSpanOpen = beforeMatch.lastIndexOf('<span');
      const lastSpanClose = beforeMatch.lastIndexOf('</span>');
      if (lastLinkOpen > lastLinkClose || lastSpanOpen > lastSpanClose) continue;
      // Check what's before the hashtag
      const charBefore = match.index > 0 ? processed[match.index - 1] : '';
      const beforeHashtag = processed.substring(Math.max(0, match.index - 100), match.index);
      const lastTagClose = beforeHashtag.lastIndexOf('>');
      const textAfterTag = beforeHashtag.substring(lastTagClose + 1);
      // Hashtag is valid if:
      // 1. At start of string
      // 2. Preceded by whitespace
      // 3. Preceded by >
      // 4. Immediately after opening tag (like <p>#hashtag)
      const isValidPosition =
        match.index === 0 ||
        /\s/.test(charBefore) ||
        charBefore === '>' ||
        (lastTagClose >= 0 && /^[\s\n]*$/.test(textAfterTag));
      if (!isValidPosition) continue;
      const hashtag = match[1];
      const topic = hashtag.substring(1);
      if (!hashtags.includes(topic)) {
        hashtags.push(topic);
      }
      let url: string | undefined;
      if (typeof options.hashtagUrl === 'function') {
        url = options.hashtagUrl(topic);
      } else if (typeof options.hashtagUrl === 'string') {
        url = options.hashtagUrl.replace('{topic}', encodeURIComponent(topic));
      }
      // Only the hashtag itself is replaced; the character in front of it is
      // not part of the match and therefore must not be emitted again.
      const replacement = url
        ? `<a href="${escapeHtml(url)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</a>`
        : `<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(hashtag)}</span>`;
      hashtagReplacements.push({
        match: match[0],
        replacement,
        index: match.index
      });
    }
    applyReplacements(hashtagReplacements);
  }

  // Extract wikilinks and hashtags from already-processed HTML (for AsciiDoc)
  if (skipWikilinksAndHashtags) {
    // Extract wikilinks from existing links
    const wikilinkLinkRegex = /<a[^>]+class="wikilink"[^>]+data-dtag="([^"]+)"[^>]*>([^<]+)<\/a>/g;
    while ((match = wikilinkLinkRegex.exec(processed)) !== null) {
      wikilinks.push({
        dtag: match[1],
        display: match[2],
        original: match[0]
      });
    }
    // Extract hashtags from existing spans/links
    const hashtagRegex = /<(?:a|span)[^>]+class="hashtag"[^>]+data-topic="([^"]+)"[^>]*>#\1<\/\w+>/g;
    while ((match = hashtagRegex.exec(processed)) !== null) {
      const topic = match[1];
      if (!hashtags.includes(topic)) {
        hashtags.push(topic);
      }
    }
  }

  // Remove links inside code blocks (both <code> and <pre> tags)
  // This ensures URLs in code blocks remain as plain text
  const codeBlockLinkRegex = /(<(?:code|pre)[^>]*>)([\s\S]*?)(<\/(?:code|pre)>)/gi;
  processed = processed.replace(codeBlockLinkRegex, (_block: string, openTag: string, content: string, closeTag: string) => {
    // Remove all <a> tags inside code blocks, keeping only the text content
    const cleanedContent = content.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
    return openTag + cleanedContent + closeTag;
  });

  // Process YouTube URLs - ORDER IS CRITICAL to avoid double-parsing
  // 1. FIRST: Fix video tags that contain YouTube URLs (before they get processed as bare URLs)
  // AsciiDoc's video:: macro creates <video> tags, but YouTube URLs should be iframes
  const youtubeVideoTagRegex = /<video[^>]+src="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>[\s\S]*?<\/video>/gi;
  processed = processed.replace(youtubeVideoTagRegex, (_tag: string, _url: string, videoId: string) => youtubeIframe(videoId));

  // 2. SECOND: Process YouTube links in <a> tags
  // IMPORTANT: Be very specific with YouTube regex to avoid matching Spotify URLs
  refreshCodeBlockMarkers();
  const youtubeLinkRegex = /<a[^>]+href="(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+))"[^>]*>.*?<\/a>/gi;
  processed = processed.replace(youtubeLinkRegex, (anchor: string, _url: string, videoId: string, offset: number) => {
    // `offset` is the real position of this occurrence (indexOf would always
    // report the first identical anchor).
    if (isInCodeBlock(offset)) return anchor;
    return youtubeIframe(videoId);
  });

  // 3. THIRD: Fix malformed YouTube iframes from AsciiDoc video:: macro
  // AsciiDoc sometimes creates iframes with malformed YouTube URLs (watch?v= or shorts/ instead of embed/)
  // Match the entire iframe element including closing tag to avoid duplicates
  const malformedYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube[^"]*(?:watch\?v=|shorts\/)([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
  processed = processed.replace(malformedYoutubeIframeRegex, (_frame: string, videoId: string) => youtubeIframe(videoId));

  // 3.5: Fix YouTube iframes with embed URLs but wrong parameters or missing required attributes
  // AsciiDoc's video:: macro creates iframes with ?rel=0 or missing allow/referrerpolicy attributes
  const incompleteYoutubeIframeRegex = /<iframe[^>]+src="https?:\/\/(?:www\.)?youtube\.com\/embed\/([a-zA-Z0-9_-]+)(\?[^"]*)?"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
  processed = processed.replace(incompleteYoutubeIframeRegex, (frame: string, videoId: string) => {
    // Check if this iframe already has the correct format (has enablejsapi=1 and required attributes)
    if (frame.includes('enablejsapi=1') &&
        frame.includes('allow=') &&
        frame.includes('referrerpolicy=') &&
        frame.includes('class="youtube-embed"')) {
      return frame; // Already correct, don't modify
    }
    // Fix the iframe with proper attributes
    return youtubeIframe(videoId);
  });

  // 4. FOURTH: Fix any existing YouTube iframes that have malformed embed URLs (AsciiDoc sometimes creates broken embed URLs)
  // Match the entire iframe element including closing tag to avoid duplicates
  const brokenYoutubeIframeRegex = /<iframe[^>]+src="[^"]*youtube\.com\/embed\/[^"]*watch\?v=([a-zA-Z0-9_-]+)[^"]*"[^>]*(?:\/>|>[\s\S]*?<\/iframe>)/gi;
  processed = processed.replace(brokenYoutubeIframeRegex, (_frame: string, videoId: string) => youtubeIframe(videoId));

  // 5. LAST: Handle bare YouTube URLs (not in links, video tags, or iframes)
  // IMPORTANT: Match must be specific to youtube.com or youtu.be to avoid matching Spotify
  // This must come AFTER processing video tags and links to avoid double-parsing
  refreshCodeBlockMarkers();
  const bareYoutubeRegex = /(https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)(?:\?[^"\s<>]*)?)/gi;
  const youtubeReplacements: Replacement[] = [];
  while ((match = bareYoutubeRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    // Check if it's already in a tag (link, iframe, video, etc.)
    // Simple approach: check if we're inside quotes (attribute value) or between <tag and >
    const before = processed.substring(Math.max(0, match.index - 500), match.index);
    // Check if URL is inside quotes (attribute value like src="..." or href="...")
    const beforeContext = before.substring(Math.max(0, before.length - 100));
    if (beforeContext.match(/<(iframe|video|a|img|audio|source)[^>]*\s+(src|href)="[^"]*$/i)) {
      continue; // Inside an attribute value, skip
    }
    // Check if we're between an opening tag and its closing bracket
    const lastOpenTag = before.lastIndexOf('<');
    const lastCloseBracket = before.lastIndexOf('>');
    if (lastOpenTag > lastCloseBracket) {
      // We're inside a tag, check what kind
      const tagContent = before.substring(lastOpenTag);
      if (/<(iframe|video|a|img|audio|source)[^>]*$/i.test(tagContent)) {
        continue; // Skip URLs inside these tags
      }
    }
    const videoId = match[2];
    const embedUrl = `https://www.youtube.com/embed/${videoId}?enablejsapi=1`;
    youtubeReplacements.push({
      match: match[0],
      replacement: `<iframe class="youtube-embed" frameborder="0" allowfullscreen allow="accelerometer; autoplay; clipboard-write; encrypted-media; fullscreen; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" width="100%" height="360" src="${escapeHtml(embedUrl)}"></iframe>`,
      index: match.index
    });
  }
  applyReplacements(youtubeReplacements);

  // Fix double-closed iframes (safety net)
  processed = processed.replace(/<\/iframe><\/iframe>/gi, '</iframe>');

  // Spotify: https://open.spotify.com/episode/ID or https://open.spotify.com/track/ID or https://open.spotify.com/album/ID
  refreshCodeBlockMarkers();
  const spotifyLinkRegex = /<a[^>]+href="(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+))[^"]*"[^>]*>.*?<\/a>/gi;
  processed = processed.replace(spotifyLinkRegex, (anchor: string, _url: string, type: string, id: string, offset: number) => {
    if (isInCodeBlock(offset)) return anchor;
    return spotifyIframe(type, id);
  });

  // Also handle bare Spotify URLs (not in links)
  refreshCodeBlockMarkers();
  const bareSpotifyRegex = /(https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)(?:\?[^"\s<>]*)?)/gi;
  const spotifyReplacements: Replacement[] = [];
  while ((match = bareSpotifyRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    // Check if it's already in a tag
    const before = processed.substring(0, match.index);
    const lastOpenTag = before.lastIndexOf('<');
    const lastCloseTag = before.lastIndexOf('>');
    if (lastOpenTag > lastCloseTag) continue; // Inside a tag
    spotifyReplacements.push({
      match: match[0],
      replacement: spotifyIframe(match[2], match[3]),
      index: match.index
    });
  }
  applyReplacements(spotifyReplacements);

  // Process bare image/media URLs that aren't already in tags
  // First, convert bare links (class="bare") that contain image/video/audio URLs to actual media elements
  // This handles cases where AsciiDoc has already converted URLs to links
  // IMPORTANT: Check YouTube FIRST, then Spotify, BEFORE checking file extensions to avoid conflicts
  refreshCodeBlockMarkers();
  const bareLinkRegex = /<a[^>]+href="(https?:\/\/[^"]+)"[^>]*class="[^"]*bare[^"]*"[^>]*>([^<]*)<\/a>/gi;
  processed = processed.replace(bareLinkRegex, (anchor: string, url: string, linkText: string, offset: number) => {
    if (isInCodeBlock(offset)) return anchor;
    // Check YouTube URLs FIRST (be very specific - must be youtube.com or youtu.be)
    // This prevents accidentally matching Spotify URLs
    const youtubeMatch = url.match(/https?:\/\/(?:www\.)?(?:youtube\.com\/(?:watch\?v=|shorts\/)|youtu\.be\/)([a-zA-Z0-9_-]+)/);
    if (youtubeMatch) {
      return youtubeIframe(youtubeMatch[1]);
    }
    // Check Spotify URLs (be very specific - must be open.spotify.com)
    const spotifyMatch = url.match(/https?:\/\/open\.spotify\.com\/(episode|track|album|playlist)\/([a-zA-Z0-9]+)/);
    if (spotifyMatch) {
      return spotifyIframe(spotifyMatch[1], spotifyMatch[2]);
    }
    // Check if it's an image URL
    if (/\.(jpg|jpeg|png|gif|webp|svg|bmp)(\?|$)/i.test(url)) {
      return `<img src="${escapeHtml(url)}" alt="${escapeHtml(linkText)}" class="bare-image" />`;
    }
    // Check if it's a video URL (but not YouTube)
    if (/\.(mp4|webm|ogg|mov|avi)(\?|$)/i.test(url)) {
      return `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`;
    }
    // Check if it's an audio URL (but not Spotify)
    if (/\.(mp3|wav|ogg|flac|aac|m4a)(\?|$)/i.test(url)) {
      return `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`;
    }
    // Not a media URL, return as-is
    return anchor;
  });

  // Now process bare URLs that aren't in any tags at all
  // IMPORTANT: Skip YouTube and Spotify URLs - they're already processed above
  const imageUrlRegex = /(https?:\/\/[^\s<>"']+\.(jpg|jpeg|png|gif|webp|svg|bmp))(?![^<]*>)/gi;
  const videoUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp4|webm|ogg|mov|avi))(?![^<]*>)/gi;
  const audioUrlRegex = /(https?:\/\/[^\s<>"']+\.(mp3|wav|ogg|flac|aac|m4a))(?![^<]*>)/gi;

  // True when `index` lies inside the opening tag of a media/link element,
  // i.e. the last '<' before it has not been closed yet.
  function isUrlInTag(index: number): boolean {
    const before = processed.substring(0, index);
    const lastOpenTag = before.lastIndexOf('<');
    const lastCloseTag = before.lastIndexOf('>');
    if (lastOpenTag <= lastCloseTag) return false;
    return /^<(img|video|audio|a|source|iframe)[^>]*$/i.test(before.substring(lastOpenTag));
  }

  refreshCodeBlockMarkers();
  const mediaReplacements: Replacement[] = [];
  // Process images
  while ((match = imageUrlRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    if (isUrlInTag(match.index)) continue;
    const url = match[0];
    mediaReplacements.push({
      match: url,
      replacement: `<img src="${escapeHtml(url)}" alt="" class="bare-image" />`,
      index: match.index
    });
  }
  // Process videos (but skip YouTube URLs - they're handled above)
  while ((match = videoUrlRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    if (isUrlInTag(match.index)) continue;
    // Skip YouTube URLs - they should be embeds, not video tags
    if (/youtube\.com|youtu\.be/i.test(match[0])) continue;
    const url = match[0];
    mediaReplacements.push({
      match: url,
      replacement: `<video src="${escapeHtml(url)}" controls class="bare-video"></video>`,
      index: match.index
    });
  }
  // Process audio
  while ((match = audioUrlRegex.exec(processed)) !== null) {
    if (isInCodeBlock(match.index)) continue;
    if (isUrlInTag(match.index)) continue;
    const url = match[0];
    mediaReplacements.push({
      match: url,
      replacement: `<audio src="${escapeHtml(url)}" controls class="bare-audio"></audio>`,
      index: match.index
    });
  }
  // The three scans interleave by position and `.ogg` is matched by both the
  // video and the audio scan; applyReplacements orders and de-duplicates them.
  applyReplacements(mediaReplacements);

  // Process markdown table alignment
  // Marked generates tables with align attributes or style attributes, we need to add CSS classes for styling
  // The opening <table ...> tag is captured so its attributes survive the rewrite.
  const tableRegex = /(<table[^>]*>)([\s\S]*?)<\/table>/gi;
  processed = processed.replace(tableRegex, (_table: string, tableOpenTag: string, tableContent: string) => {
    // Find all th and td elements - check for align attribute or style with text-align
    // `\b` keeps <thead> from being mistaken for a <th> cell.
    const cellRegex = /<(th|td)\b([^>]*)>([\s\S]*?)<\/\1>/gi;
    const processedTable = tableContent.replace(cellRegex, (_cell: string, tag: string, attrs: string, content: string) => {
      let align: string | null = null;
      let newAttrs = attrs;
      // Check for align attribute
      const alignMatch = attrs.match(/align=["'](left|center|right)["']/i);
      if (alignMatch) {
        align = alignMatch[1].toLowerCase();
        newAttrs = newAttrs.replace(/\s*align=["'](left|center|right)["']/i, '');
      } else {
        // Check for style attribute with text-align
        const styleMatch = attrs.match(/style=["']([^"']*text-align:\s*(left|center|right)[^"']*)["']/i);
        if (styleMatch) {
          align = styleMatch[2].toLowerCase();
          // Remove text-align from style
          const styleContent = styleMatch[1].replace(/text-align:\s*(left|center|right);?/gi, '').trim();
          if (styleContent) {
            newAttrs = newAttrs.replace(/style=["'][^"']+["']/, `style="${styleContent}"`);
          } else {
            newAttrs = newAttrs.replace(/\s*style=["'][^"']+["']/, '');
          }
        }
      }
      // If we found alignment, add CSS class
      if (align) {
        const alignClass = align === 'left' ? 'halign-left' :
          align === 'center' ? 'halign-center' : 'halign-right';
        // If there's already a class attribute, merge them
        if (newAttrs.includes('class=')) {
          const classMatch = newAttrs.match(/class=["']([^"']+)["']/);
          if (classMatch) {
            const existingClass = classMatch[1];
            if (!existingClass.includes(alignClass)) {
              newAttrs = newAttrs.replace(/class=["'][^"']+["']/, `class="${existingClass} ${alignClass}"`);
            }
          }
        } else {
          // Keep the leading space: the attributes are appended directly to the tag name.
          newAttrs = `${newAttrs} class="${alignClass}"`;
        }
      }
      return `<${tag}${newAttrs}>${content}</${tag}>`;
    });
    return `${tableOpenTag}${processedTable}</table>`;
  });

  return {
    html: processed,
    nostrLinks,
    wikilinks,
    hashtags
  };
}
/**
 * Classify a bech32 string by its human-readable prefix.
 *
 * @returns The first of `npub`, `nprofile`, `nevent`, `naddr`, `note` that the
 *          string starts with, or `null` when none matches.
 */
function getNostrType(bech32: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
  const knownPrefixes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'] as const;
  for (const prefix of knownPrefixes) {
    if (bech32.startsWith(prefix)) {
      return prefix;
    }
  }
  return null;
}
/**
 * Escape the characters that are significant in HTML text and attribute
 * values: `&`, `<`, `>`, `"` and `'`.
 */
function escapeHtml(text: string): string {
  const entities = new Map<string, string>([
    ['&', '&amp;'],
    ['<', '&lt;'],
    ['>', '&gt;'],
    ['"', '&quot;'],
    ["'", '&#039;'],
  ]);
  // Every character the pattern can match has an entry in the table above.
  return text.replace(/[&<>"']/g, (ch) => entities.get(ch) as string);
}

175
src/pre-processor.ts

@ -1,175 +0,0 @@ @@ -1,175 +0,0 @@
import { ParserOptions, Wikilink } from './types';
import * as emoji from 'node-emoji';
/**
 * Result of pre-processing raw content before AsciiDoc conversion
 * (see `preProcessAsciiDoc`).
 */
export interface PreProcessResult {
// Content with wikilinks and hashtags replaced by placeholders.
content: string;
// Wikilinks in the order their placeholders were issued.
wikilinks: Wikilink[];
// Hashtag topics without the leading `#`, each listed once.
hashtags: string[];
}
/**
 * Pre-process content to convert wikilinks and hashtags to placeholders
 * that will be processed after HTML conversion.
 *
 * - Emoji shortcodes are expanded first (`emoji.emojify`).
 * - Every `[[dtag]]` / `[[dtag|display]]` becomes `<WIKILINK_PLACEHOLDER_n>`,
 *   where `n` is the wikilink's index in the returned `wikilinks` array.
 * - Every `#hashtag` at the start of a line, after whitespace or after `>`
 *   becomes `<HASHTAG_PLACEHOLDER_n>`, where `n` is the topic's index in the
 *   returned (de-duplicated) `hashtags` array, so repeated hashtags share one
 *   placeholder number and `restorePlaceholders` can look them up by index.
 * - Hashtags inside code are left alone: inline code spans, fenced blocks,
 *   and `[source]` / `[abc]` / `[plantuml]` blocks including their
 *   `----`-delimited body (or, for an undelimited `[source]`, the paragraph
 *   that follows).
 *
 * @param content - Raw AsciiDoc content.
 * @param options - Parser options (currently not read by this function).
 * @returns The rewritten content plus the wikilinks and hashtag topics found.
 */
export function preProcessAsciiDoc(content: string, options: ParserOptions): PreProcessResult {
  const wikilinks: Wikilink[] = [];
  const hashtags: string[] = [];

  // Process emojis first
  let processed = emoji.emojify(content);

  // Process wikilinks: [[dtag]] or [[dtag|display]]
  // Replace with a placeholder that AsciiDoc won't touch
  processed = processed.replace(/\[\[([^\]]+)\]\]/g, (original: string, inner: string) => {
    const parts = inner.split('|');
    const dtag = parts[0].trim();
    const display = parts.length > 1 ? parts.slice(1).join('|').trim() : dtag;
    // Use angle brackets to avoid AsciiDoc formatting interpretation
    const placeholder = `<WIKILINK_PLACEHOLDER_${wikilinks.length}>`;
    wikilinks.push({ dtag, display, original });
    return placeholder;
  });

  // Mark code regions so hashtags inside them are not touched
  const codeBlockMarkers: Array<{ start: number; end: number }> = [];
  const codeBlockRegex = /\[source(?:,[^\]]+)?\]|\[abc\]|\[plantuml\]|```[\s\S]*?```|```|`[^`]+`/g;
  // Body of a delimited listing block: opening `----` line up to the matching closing line.
  const delimitedBodyRegex = /^[ \t]*\r?\n(-{4,})[ \t]*(?=\r?\n)[\s\S]*?\r?\n\1[ \t]*(?=\r?\n|$)/;
  // Body of an undelimited block: the non-blank lines directly after the attribute line.
  const paragraphBodyRegex = /^[ \t]*(?:\r?\n[ \t]*\S[^\r\n]*)+/;
  let marker: RegExpExecArray | null;
  while ((marker = codeBlockRegex.exec(processed)) !== null) {
    const start = marker.index;
    let end = start + marker[0].length;
    if (marker[0].startsWith('[')) {
      const rest = processed.substring(end);
      const body =
        delimitedBodyRegex.exec(rest) ??
        (marker[0].startsWith('[source') ? paragraphBodyRegex.exec(rest) : null);
      if (body) {
        end += body[0].length;
        // Do not rescan the block body for further markers.
        codeBlockRegex.lastIndex = end;
      }
    }
    codeBlockMarkers.push({ start, end });
  }

  function isInCodeBlock(index: number): boolean {
    return codeBlockMarkers.some(range => index >= range.start && index < range.end);
  }

  // Process hashtags
  // Match hashtags at start of line, after whitespace, or after > (for blockquotes)
  const hashtagRegex = /(^|\s|>)(#[\w-]+)/gm;
  processed = processed.replace(hashtagRegex, (whole: string, prefix: string, hashtag: string, offset: number) => {
    if (isInCodeBlock(offset)) return whole;
    const topic = hashtag.substring(1);
    // The placeholder number must be the topic's index in `hashtags`.
    let slot = hashtags.indexOf(topic);
    if (slot === -1) {
      slot = hashtags.length;
      hashtags.push(topic);
    }
    // Use angle brackets to avoid AsciiDoc formatting interpretation
    return `${prefix}<HASHTAG_PLACEHOLDER_${slot}>`;
  });

  return {
    content: processed,
    wikilinks,
    hashtags
  };
}
/**
 * Turn wikilink and hashtag placeholders in rendered HTML back into markup.
 *
 * Both the raw form (`<WIKILINK_PLACEHOLDER_n>`) and the HTML-escaped form
 * (`&lt;WIKILINK_PLACEHOLDER_n&gt;`) are recognised; `n` indexes into the
 * `wikilinks` / `hashtags` arrays. A placeholder whose index has no entry is
 * left in the output unchanged.
 *
 * @param html - HTML produced from pre-processed content.
 * @param wikilinks - Wikilinks, indexed by placeholder number.
 * @param hashtags - Hashtag topics (without `#`), indexed by placeholder number.
 * @param options - Supplies `wikilinkUrl`, `hashtagUrl` and `linkBaseURL`.
 * @returns HTML with wikilinks as `<a class="wikilink">` and hashtags as
 *          `<a class="hashtag">` (when a hashtag URL is configured) or
 *          `<span class="hashtag">`.
 */
export function restorePlaceholders(
  html: string,
  wikilinks: Wikilink[],
  hashtags: string[],
  options: ParserOptions
): string {
  // Exactly one of the two capture groups is set, depending on which form matched.
  const slotOf = (escapedIndex?: string, unescapedIndex?: string): number =>
    parseInt(escapedIndex ?? unescapedIndex ?? '');

  const wikilinkHref = (dtag: string): string => {
    if (typeof options.wikilinkUrl === 'function') {
      return options.wikilinkUrl(dtag);
    }
    if (typeof options.wikilinkUrl === 'string') {
      return options.wikilinkUrl.replace('{dtag}', encodeURIComponent(dtag));
    }
    return options.linkBaseURL
      ? `${options.linkBaseURL}/events?d=${encodeURIComponent(dtag)}`
      : `#${encodeURIComponent(dtag)}`;
  };

  const hashtagHref = (topic: string): string | undefined => {
    if (typeof options.hashtagUrl === 'function') {
      return options.hashtagUrl(topic);
    }
    if (typeof options.hashtagUrl === 'string') {
      return options.hashtagUrl.replace('{topic}', encodeURIComponent(topic));
    }
    return undefined;
  };

  // Restore wikilinks (handle both escaped and unescaped placeholders)
  const withWikilinks = html.replace(
    /&lt;WIKILINK_PLACEHOLDER_(\d+)&gt;|<WIKILINK_PLACEHOLDER_(\d+)>/g,
    (placeholder: string, escapedIndex?: string, unescapedIndex?: string) => {
      const wikilink = wikilinks[slotOf(escapedIndex, unescapedIndex)];
      if (!wikilink) return placeholder;
      const href = wikilinkHref(wikilink.dtag);
      return `<a href="${escapeHtml(href)}" class="wikilink" data-dtag="${escapeHtml(wikilink.dtag)}">${escapeHtml(wikilink.display)}</a>`;
    }
  );

  // Restore hashtags (handle both escaped and unescaped placeholders)
  return withWikilinks.replace(
    /&lt;HASHTAG_PLACEHOLDER_(\d+)&gt;|<HASHTAG_PLACEHOLDER_(\d+)>/g,
    (placeholder: string, escapedIndex?: string, unescapedIndex?: string) => {
      const topic = hashtags[slotOf(escapedIndex, unescapedIndex)];
      if (!topic) return placeholder;
      const href = hashtagHref(topic);
      const label = `#${topic}`;
      return href
        ? `<a href="${escapeHtml(href)}" class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(label)}</a>`
        : `<span class="hashtag" data-topic="${escapeHtml(topic)}">${escapeHtml(label)}</span>`;
    }
  );
}
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`)
 * so the text can be placed in element content or a quoted attribute.
 *
 * @param text - Raw text to escape.
 * @returns The text with each special character replaced by its entity.
 */
function escapeHtml(text: string): string {
  return text.replace(/[&<>"']/g, (ch) => {
    switch (ch) {
      case '&':
        return '&amp;';
      case '<':
        return '&lt;';
      case '>':
        return '&gt;';
      case '"':
        return '&quot;';
      default:
        // The character class only admits one more character: the single quote.
        return '&#039;';
    }
  });
}

216
src/processors/asciidoc.ts

@ -1,56 +1,174 @@ @@ -1,56 +1,174 @@
import asciidoctor from '@asciidoctor/core';
import { ParserOptions } from '../types';
import * as emoji from 'node-emoji';
export interface AsciiDocResult {
html: string;
tableOfContents: string;
hasLaTeX: boolean;
hasMusicalNotation: boolean;
import { ProcessResult } from '../types';
import { extractTOC, sanitizeHTML, processLinks } from './html-utils';
import { postProcessHtml } from './html-postprocess';
const asciidoctorInstance = asciidoctor();
/**
 * Options accepted by the AsciiDoc processor.
 * All fields are optional.
 */
export interface ProcessOptions {
/** Toggles syntax highlighting of source blocks. */
enableCodeHighlighting?: boolean;
/** Toggles LaTeX/math support and LaTeX detection. */
enableLaTeX?: boolean;
/** Toggles musical-notation post-processing of the rendered HTML. */
enableMusicalNotation?: boolean;
originalContent?: string; // Original content for LaTeX detection
linkBaseURL?: string; // Base URL for link processing
}
/**
* Process AsciiDoc content to HTML
* Processes AsciiDoc content to HTML using AsciiDoctor
* Uses AsciiDoctor's built-in highlight.js and LaTeX support
*/
export function processAsciiDoc(content: string, options: ParserOptions): AsciiDocResult {
const hasLaTeX = /\[source,latex\]|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content);
const hasMusicalNotation = /\[abc\]|\[source,abc\]/i.test(content);
// Process emojis before AsciiDoc conversion
const processedContent = emoji.emojify(content);
const asciidoctorOptions: any = {
safe: 'unsafe',
attributes: {
'showtitle': true,
'icons': 'font',
'source-highlighter': options.enableCodeHighlighting !== false ? 'highlight.js' : undefined,
'highlightjs-theme': 'github',
'toc': 'left',
'toclevels': 6,
'sectanchors': true,
'sectlinks': true,
'idprefix': '_',
'idseparator': '_'
export async function processAsciidoc(
content: string,
options: ProcessOptions = {}
): Promise<ProcessResult> {
const {
enableCodeHighlighting = true,
enableLaTeX = true,
enableMusicalNotation = true,
} = options;
// Check if content starts with level 3+ headers
// Asciidoctor article doctype requires level 1 (=) or level 2 (==) before level 3 (===)
// If content starts with level 3+, use book doctype
const firstHeaderMatch = content.match(/^(={1,6})\s+/m);
let doctype: 'article' | 'book' = 'article';
if (firstHeaderMatch) {
const firstHeaderLevel = firstHeaderMatch[1].length;
if (firstHeaderLevel >= 3) {
doctype = 'book';
}
}
try {
const result = asciidoctorInstance.convert(content, {
safe: 'safe',
backend: 'html5',
doctype: doctype,
attributes: {
'showtitle': true,
'sectanchors': true,
'sectlinks': true,
'toc': 'left',
'toclevels': 6,
'toc-title': 'Table of Contents',
'source-highlighter': enableCodeHighlighting ? 'highlight.js' : 'none',
'stem': enableLaTeX ? 'latexmath' : 'none',
'data-uri': true,
'imagesdir': '',
'linkcss': false,
'stylesheet': '',
'stylesdir': '',
'prewrap': true,
'sectnums': false,
'sectnumlevels': 6,
'experimental': true,
'compat-mode': false,
'attribute-missing': 'warn',
'attribute-undefined': 'warn',
'skip-front-matter': true,
'source-indent': 0,
'indent': 0,
'tabsize': 2,
'tabwidth': 2,
'hardbreaks': false,
'paragraph-rewrite': 'normal',
'sectids': true,
'idprefix': '',
'idseparator': '-',
'sectidprefix': '',
'sectidseparator': '-'
}
});
const htmlString = typeof result === 'string' ? result : result.toString();
// Extract table of contents from HTML
const { toc, contentWithoutTOC } = extractTOC(htmlString);
// Sanitize HTML to prevent XSS
const sanitized = sanitizeHTML(contentWithoutTOC);
// Post-process HTML: convert macros to HTML, add styling, etc.
const processed = postProcessHtml(sanitized, {
enableMusicalNotation,
linkBaseURL: options.linkBaseURL,
});
// Process links: add target="_blank" to external links
const processedWithLinks = options.linkBaseURL
? processLinks(processed, options.linkBaseURL)
: processed;
// Also process TOC
const tocSanitized = sanitizeHTML(toc);
const tocProcessed = postProcessHtml(tocSanitized, {
enableMusicalNotation: false, // Don't process music in TOC
linkBaseURL: options.linkBaseURL,
});
// Process links in TOC as well
const tocProcessedWithLinks = options.linkBaseURL
? processLinks(tocProcessed, options.linkBaseURL)
: tocProcessed;
// Check for LaTeX in original content (more reliable than checking HTML)
const contentToCheck = options.originalContent || content;
const hasLaTeX = enableLaTeX && hasMathContent(contentToCheck);
// Check for musical notation in processed HTML
const hasMusicalNotation = enableMusicalNotation && (
/class="abc-notation"|class="lilypond-notation"|class="chord"|class="musicxml-notation"/.test(processed)
);
return {
content: processedWithLinks,
tableOfContents: tocProcessedWithLinks,
hasLaTeX,
hasMusicalNotation,
nostrLinks: [], // Will be populated by metadata extraction
wikilinks: [],
hashtags: [],
links: [],
media: [],
};
} catch (error) {
// Fallback to plain text with error logging
const errorMessage = error instanceof Error ? error.message : String(error);
// Use process.stderr.write for Node.js compatibility instead of console.error
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const nodeProcess = (globalThis as any).process;
if (nodeProcess?.stderr) {
nodeProcess.stderr.write(`Error processing AsciiDoc: ${errorMessage}\n`);
}
};
// Convert to HTML
const Asciidoctor = asciidoctor();
const htmlResult = Asciidoctor.convert(processedContent, asciidoctorOptions);
const html = typeof htmlResult === 'string' ? htmlResult : htmlResult.toString();
// Extract table of contents if present
const tocMatch = html.match(/<div id="toc"[^>]*>([\s\S]*?)<\/div>/);
const tableOfContents = tocMatch ? tocMatch[1] : '';
// Remove TOC from main content if present
const contentWithoutToc = html.replace(/<div id="toc"[^>]*>[\s\S]*?<\/div>/, '');
return {
html: contentWithoutToc,
tableOfContents,
hasLaTeX,
hasMusicalNotation
};
// Escape HTML in content for safe display
const escapedContent = sanitizeHTML(content);
return {
content: `<p>${escapedContent}</p>`,
tableOfContents: '',
hasLaTeX: false,
hasMusicalNotation: false,
nostrLinks: [],
wikilinks: [],
hashtags: [],
links: [],
media: [],
};
}
}
/**
 * Reports whether the text appears to contain LaTeX math.
 *
 * Recognized delimiters:
 * - inline: `$...$` and `\(...\)`
 * - block: `$$...$$` and `\[...\]`
 *
 * The `\(...\)` form is matched lazily up to the first `\)` so that math
 * containing ordinary parentheses, e.g. `\(f(x)\)`, is detected.
 *
 * @param content - Source text to inspect.
 * @returns `true` when at least one math delimiter pair is found.
 */
function hasMathContent(content: string): boolean {
  // Inline math: $...$ or \(...\). The body of \(...\) may itself contain ")".
  const inlineMath = /\$[^$]+\$|\\\([\s\S]+?\\\)/.test(content);
  // Block math: $$...$$ or \[...\]
  const blockMath = /\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]/.test(content);
  return inlineMath || blockMath;
}

212
src/processors/html-postprocess.ts

@ -0,0 +1,212 @@ @@ -0,0 +1,212 @@
import { processMusicalNotation } from './music';
/**
 * Options for the HTML post-processing step.
 */
export interface PostProcessOptions {
/** When truthy, the HTML is additionally passed through the musical-notation processor. */
enableMusicalNotation?: boolean;
/** Base URL for links. NOTE(review): not read by postProcessHtml in this file - confirm whether it is still needed. */
linkBaseURL?: string;
}
/**
 * Post-processes HTML produced by AsciiDoctor.
 *
 * Steps, in order:
 * 1. `BOOKSTR:<content>` markers become `<span data-bookstr>` placeholders.
 * 2. `hashtag:<tag>[<text>]` macros become links to `/notes?t=<tag>`.
 * 3. `WIKILINK:<dtag>|<text>` markers become links to `/events?d=<dtag>`.
 * 4. `link:nostr:<bech32>[<text>]` macros become an embedded-note container,
 *    a user handle, or a plain `nostr:` link depending on the identifier prefix.
 * 5. Images get sizing classes and data attributes.
 * 6. Musical notation is wrapped (only when `options.enableMusicalNotation` is truthy).
 * 7. Leftover markdown image/link syntax is converted.
 * 8. Styling classes are added and raw ToC text is removed.
 *
 * @param html - HTML to transform.
 * @param options - Post-processing switches.
 * @returns The transformed HTML.
 */
export function postProcessHtml(html: string, options: PostProcessOptions = {}): string {
  let processed = html;

  // Convert bookstr markers to HTML placeholders
  processed = processed.replace(/BOOKSTR:([^<>\s]+)/g, (_match, bookContent) => {
    const escaped = bookContent.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
    return `<span data-bookstr="${escaped}" class="bookstr-placeholder"></span>`;
  });

  // Convert hashtag links to HTML
  processed = processed.replace(/hashtag:([^[]+)\[([^\]]+)\]/g, (_match, normalizedHashtag, displayText) => {
    // URL encode the hashtag to prevent XSS
    const encodedHashtag = encodeURIComponent(normalizedHashtag);
    // HTML escape the display text
    const escapedDisplay = displayText
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
    return `<a href="/notes?t=${encodedHashtag}" class="hashtag-link text-green-600 dark:text-green-400 hover:text-green-700 dark:hover:text-green-300 hover:underline">${escapedDisplay}</a>`;
  });

  // Convert WIKILINK:dtag|display placeholder format to HTML.
  // The character classes exclude < and > so a match never spans HTML tags.
  processed = processed.replace(/WIKILINK:([^|<>]+)\|([^<>\s]+)/g, (_match, dTag, displayText) => {
    const escapedDtag = dTag.trim().replace(/"/g, '&quot;');
    const escapedDisplay = displayText.trim()
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
    // Always use relative URL for wikilinks (works on any domain)
    const url = `/events?d=${escapedDtag}`;
    return `<a class="wikilink text-primary-600 dark:text-primary-500 hover:underline" data-dtag="${escapedDtag}" data-url="${url}" href="${url}">${escapedDisplay}</a>`;
  });

  // Convert nostr: links to HTML
  processed = processed.replace(/link:nostr:([^[]+)\[([^\]]+)\]/g, (_match, bech32Id, displayText) => {
    const nostrType = getNostrType(bech32Id);
    // Quote-escaped once and used for EVERY attribute below, including href,
    // so a stray double quote in the identifier cannot terminate an attribute early.
    const escaped = bech32Id.replace(/"/g, '&quot;');

    if (nostrType === 'nevent' || nostrType === 'naddr' || nostrType === 'note') {
      // Render as embedded event placeholder
      return `<div data-embedded-note="${escaped}" class="embedded-note-container">Loading embedded event...</div>`;
    }
    if (nostrType === 'npub' || nostrType === 'nprofile') {
      // Render as user handle
      return `<span class="user-handle" data-pubkey="${escaped}">@${displayText}</span>`;
    }
    // Fallback to regular link
    return `<a href="nostr:${escaped}" class="nostr-link text-blue-600 hover:text-blue-800 hover:underline" data-nostr-type="${nostrType || 'unknown'}" data-bech32="${escaped}">${displayText}</a>`;
  });

  // Process images: add max-width styling and data attributes
  processed = processImages(processed);

  // Process musical notation if enabled
  if (options.enableMusicalNotation) {
    processed = processMusicalNotation(processed);
  }

  // Clean up any leftover markdown syntax
  processed = cleanupMarkdown(processed);

  // Add styling classes
  processed = addStylingClasses(processed);

  // Hide raw ToC text
  processed = hideRawTocText(processed);

  return processed;
}
/**
 * Classifies a Nostr bech32 identifier by its human-readable prefix.
 *
 * @param id - Identifier such as `npub1...` or `nevent1...`.
 * @returns The matching prefix, or `null` when none of the known prefixes applies.
 */
function getNostrType(id: string): 'npub' | 'nprofile' | 'nevent' | 'naddr' | 'note' | null {
  const knownPrefixes = ['npub', 'nprofile', 'nevent', 'naddr', 'note'] as const;
  const hit = knownPrefixes.find((prefix) => id.startsWith(prefix));
  return hit === undefined ? null : hit;
}
/**
 * Decorates every `<img>` tag that has a quoted `src` with sizing classes and
 * data attributes.
 *
 * - Any existing `max-w-*` class is replaced by `max-w-[400px]`; images without a
 *   `class` attribute receive a default class list.
 * - `data-image-index` is the position of the image's `src` among the distinct
 *   sources found in the document (first occurrence order), or `-1` if the
 *   source was not picked up by the initial scan.
 * - Self-closing tags (`<img ... />`) stay self-closing: the new attributes are
 *   inserted before the trailing slash instead of after it.
 *
 * @param html - HTML to scan.
 * @returns The HTML with decorated image tags; tags without a quoted `src` are unchanged.
 */
function processImages(html: string): string {
  // First pass: number each distinct image URL in order of first appearance.
  const indexBySrc = new Map<string, number>();
  const imageUrlRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/gi;
  let found: RegExpExecArray | null;
  while ((found = imageUrlRegex.exec(html)) !== null) {
    const url = found[1];
    if (url && !indexBySrc.has(url)) {
      indexBySrc.set(url, indexBySrc.size);
    }
  }

  // Second pass: rewrite the attribute list of each tag.
  return html.replace(/<img([^>]+)>/gi, (imgTag: string, attributes: string) => {
    const srcMatch = attributes.match(/src=["']([^"']+)["']/i);
    if (!srcMatch) return imgTag;

    const src = srcMatch[1];
    const currentIndex = indexBySrc.get(src) ?? -1;

    // A slash right before ">" (after whitespace or a closing quote) marks a
    // self-closing tag; set it aside so appended attributes do not land after it.
    const selfClosing = /(?:^|[\s"'])\/\s*$/.test(attributes);
    let updatedAttributes = selfClosing ? attributes.replace(/\s*\/\s*$/, '') : attributes;

    if (updatedAttributes.match(/class=["']/i)) {
      updatedAttributes = updatedAttributes.replace(/class=["']([^"']*)["']/i, (_match: string, classes: string) => {
        const cleanedClasses = classes.replace(/max-w-\[?[^\s\]]+\]?/g, '').trim();
        const newClasses = cleanedClasses
          ? `${cleanedClasses} max-w-[400px] object-contain cursor-zoom-in`
          : 'max-w-[400px] object-contain cursor-zoom-in';
        return `class="${newClasses}"`;
      });
    } else {
      updatedAttributes += ` class="max-w-[400px] h-auto object-contain cursor-zoom-in"`;
    }

    updatedAttributes += ` data-asciidoc-image="true" data-image-index="${currentIndex}" data-image-src="${src.replace(/"/g, '&quot;')}"`;

    return selfClosing ? `<img${updatedAttributes} />` : `<img${updatedAttributes}>`;
  });
}
/**
 * Converts markdown image (`![alt](url)`) and link (`[text](url)`) syntax that
 * survived earlier processing into HTML.
 *
 * Double quotes in the URL and alt text are entity-escaped before they are placed
 * inside attributes; without that, a quote in the markdown could close the
 * attribute early and introduce arbitrary attributes (this step runs after
 * sanitization). Other characters are passed through because the input is
 * already HTML.
 *
 * A markdown link is left as-is when the document already contains an anchor
 * with exactly that `href`.
 *
 * @param html - HTML possibly containing leftover markdown syntax.
 * @returns HTML with the leftover syntax converted.
 */
function cleanupMarkdown(html: string): string {
  // Only the double quote can terminate the double-quoted attributes built below.
  const quoteSafe = (value: string): string => value.replace(/"/g, '&quot;');

  // Clean up markdown image syntax
  const withImages = html.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, alt, url) => {
    const altText = alt || '';
    return `<img src="${quoteSafe(url)}" alt="${quoteSafe(altText)}" class="max-w-[400px] object-contain my-0" />`;
  });

  // Clean up markdown link syntax
  return withImages.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
    // Skip when a real anchor for this URL is already present in the document.
    if (withImages.includes(`href="${url}"`)) {
      return _match;
    }
    return `<a href="${quoteSafe(url)}" target="_blank" rel="noreferrer noopener" class="break-words inline-flex items-baseline gap-1">${text} <svg class="size-3" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" /></svg></a>`;
  });
}
/**
 * Appends utility CSS classes to specific spans and normalizes the class list
 * of highlight.js `<pre>`/`<code>` elements.
 *
 * @param html - HTML to restyle.
 * @returns HTML with the extra classes applied.
 */
function addStylingClasses(html: string): string {
  // Each rule is [pattern, replacement]; rules are applied in the listed order.
  const rules: Array<[RegExp, string]> = [
    // strikethrough
    [/<span class="line-through">([^<]+)<\/span>/g, '<span class="line-through line-through-2">$1</span>'],
    // subscript
    [/<span class="subscript">([^<]+)<\/span>/g, '<span class="subscript text-xs align-sub">$1</span>'],
    // superscript
    [/<span class="superscript">([^<]+)<\/span>/g, '<span class="superscript text-xs align-super">$1</span>'],
    // code highlighting containers
    [/<pre class="highlightjs[^"]*">/g, '<pre class="highlightjs hljs">'],
    [/<code class="highlightjs[^"]*">/g, '<code class="highlightjs hljs">'],
  ];

  return rules.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), html);
}
/**
 * Strips elements that carry raw ToC text: headings or paragraphs containing
 * "Table of Contents" followed by a parenthesized number, and paragraphs
 * containing "Assumptions" followed by "[n=0]".
 *
 * @param html - HTML to clean.
 * @returns HTML with the matching elements removed.
 */
function hideRawTocText(html: string): string {
  const rawTocPatterns: RegExp[] = [
    /<h[1-6][^>]*>.*?Table of Contents.*?\(\d+\).*?<\/h[1-6]>/gi,
    /<p[^>]*>.*?Table of Contents.*?\(\d+\).*?<\/p>/gi,
    /<p[^>]*>.*?Assumptions.*?\[n=0\].*?<\/p>/gi,
  ];

  let result = html;
  for (const pattern of rawTocPatterns) {
    result = result.replace(pattern, '');
  }
  return result;
}

211
src/processors/html-utils.ts

@ -0,0 +1,211 @@ @@ -0,0 +1,211 @@
/**
 * Splits a table of contents out of rendered HTML.
 *
 * The TOC container is the first match of, in priority order:
 * `<div id="toc" class="toc">`, `<div id="toc">`, `<div class="toc">`, `<nav id="toc">`.
 * Its matching closing tag is found by counting nested `<div>`/`<nav>` open and
 * close tags (self-closing tags do not change the depth).
 *
 * @param html - Full rendered HTML.
 * @returns `toc`: the container's inner HTML with any `<div id="toctitle">` removed
 *   and whitespace trimmed; `contentWithoutTOC`: the input with the whole container
 *   cut out. When no container is found, or its closing tag cannot be located,
 *   `toc` is `''` and `contentWithoutTOC` is the unmodified input.
 */
export function extractTOC(html: string): { toc: string; contentWithoutTOC: string } {
  // Find the start of the TOC container - try the most specific pattern first
  const tocStartPatterns = [
    /<div\s+id=["']toc["']\s+class=["']toc["'][^>]*>/i,
    /<div\s+id=["']toc["'][^>]*>/i,
    /<div\s+class=["']toc["'][^>]*>/i,
    /<nav\s+id=["']toc["'][^>]*>/i,
  ];

  let tocStartIdx = -1;
  let tocStartTag = '';
  for (const pattern of tocStartPatterns) {
    const match = html.match(pattern);
    if (match && match.index !== undefined) {
      tocStartIdx = match.index;
      tocStartTag = match[0];
      break;
    }
  }

  if (tocStartIdx === -1) {
    // No TOC found
    return { toc: '', contentWithoutTOC: html };
  }

  // Walk every <div>/<nav> open or close tag after the container's start tag.
  // The lookahead requires whitespace, "/" or ">" after the tag name so that
  // longer tag names sharing the prefix are not counted.
  const tagPattern = /<(\/?)(?:div|nav)(?=[\s/>])[^>]*>/gi;
  const innerStartIdx = tocStartIdx + tocStartTag.length;
  tagPattern.lastIndex = innerStartIdx;

  let depth = 1;
  let closeTagStart = -1;
  let tocEndIdx = -1;
  let tag: RegExpExecArray | null;
  while ((tag = tagPattern.exec(html)) !== null) {
    if (tag[1] === '/') {
      depth--;
      if (depth === 0) {
        closeTagStart = tag.index;
        tocEndIdx = tag.index + tag[0].length;
        break;
      }
    } else if (!tag[0].endsWith('/>')) {
      // Opening tag that is not self-closing
      depth++;
    }
  }

  if (tocEndIdx === -1) {
    // Unbalanced markup: leave the document untouched
    return { toc: '', contentWithoutTOC: html };
  }

  // Inner HTML runs from the end of the start tag to the START of the closing
  // tag, so the result does not depend on the closing tag's length or spelling.
  const toc = html
    .substring(innerStartIdx, closeTagStart)
    // Remove the toctitle div if present (AsciiDoc adds "Table of Contents" title)
    .replace(/<div\s+id=["']toctitle["'][^>]*>.*?<\/div>\s*/gis, '')
    .trim();

  // Remove the TOC from the content
  const contentWithoutTOC = html.substring(0, tocStartIdx) + html.substring(tocEndIdx);

  return { toc, contentWithoutTOC };
}
/**
 * Performs basic, pattern-based HTML sanitization.
 *
 * Removes `<script>...</script>` elements, quoted inline event-handler
 * attributes (`onclick="..."` etc.), the `javascript:` scheme and
 * `data:text/html` URL prefixes.
 *
 * The removals are repeated until the text stops changing. A single pass can be
 * bypassed by nesting, because deleting the inner occurrence re-creates the
 * pattern (e.g. `javajavascript:script:` collapses to `javascript:`).
 *
 * NOTE(review): this is a best-effort filter, not a full sanitizer. It does not
 * escape markup and does not handle unquoted event-handler values; untrusted
 * input should additionally go through a dedicated HTML sanitizer.
 *
 * @param html - HTML to filter.
 * @returns The filtered HTML.
 */
export function sanitizeHTML(html: string): string {
  let current = html;
  let previous: string;
  // Every pass that changes the text makes it strictly shorter, so this terminates.
  do {
    previous = current;
    current = current
      // Remove script tags and their content
      .replace(/<script[^>]*>.*?<\/script>/gis, '')
      // Remove event handlers (onclick, onerror, etc.)
      .replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, '')
      // Remove javascript: protocol in links
      .replace(/javascript:/gi, '')
      // Remove data: URLs that could be dangerous
      .replace(/data:\s*text\/html/gi, '');
  } while (current !== previous);
  return current;
}
/**
 * Normalizes the `target`/`rel` attributes of `<a href="...">` tags.
 *
 * - Relative/local links and links whose host equals the host of `linkBaseURL`
 *   have any `target` attribute removed (they open in the same tab).
 * - Other `http://` / `https://` links that have no `target` get
 *   `target="_blank"`; `rel` is added or extended so that it contains
 *   `noopener` (and `noreferrer` when newly added).
 * - External links that already carry a `target` are left unchanged.
 *
 * New attributes are appended immediately before the tag's closing `>`, so a
 * `>` character inside the href value does not get mistaken for the tag end.
 *
 * @param html - HTML whose anchors should be processed.
 * @param linkBaseURL - Base URL of the hosting site; its hostname defines "same domain".
 * @returns The HTML with adjusted anchor tags.
 */
export function processLinks(html: string, linkBaseURL: string): string {
  // Use the URL constructor if the runtime provides one
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const URLConstructor = (globalThis as any).URL;

  // Extract domain from linkBaseURL for comparison
  let linkBaseDomain = '';
  if (linkBaseURL) {
    try {
      if (!URLConstructor) {
        throw new Error('URL not available');
      }
      linkBaseDomain = new URLConstructor(linkBaseURL).hostname;
    } catch {
      // Fallback to simple string parsing if URL constructor is missing or fails
      const parts = linkBaseURL.replace(/^https?:\/\//, '').split('/');
      if (parts.length > 0) {
        linkBaseDomain = parts[0];
      }
    }
  }

  const stripTarget = (tag: string): string =>
    tag.replace(/\s*target\s*=\s*["'][^"']*["']/gi, '');

  // The matched tag always ends with ">"; insert the attributes right before it.
  const appendAttributes = (tag: string, attributes: string): string =>
    `${tag.slice(0, -1)}${attributes}>`;

  // Regex to match <a> tags with href attributes
  const linkRegex = /<a\s+([^>]*?)href\s*=\s*["']([^"']+)["']([^>]*?)>/g;

  return html.replace(linkRegex, (match: string, _before: string, href: string) => {
    // External means an absolute http(s) URL
    const isExternal = href.startsWith('http://') || href.startsWith('https://');
    if (!isExternal) {
      // Local/relative link - ensure it opens in same tab (remove target if present)
      return stripTarget(match);
    }

    // Absolute link pointing at our own domain is treated like a local link
    if (linkBaseDomain) {
      try {
        if (!URLConstructor) {
          throw new Error('URL not available');
        }
        if (new URLConstructor(href).hostname === linkBaseDomain) {
          return stripTarget(match);
        }
      } catch {
        // If URL parsing fails, use simple string check
        if (href.includes(linkBaseDomain)) {
          return stripTarget(match);
        }
      }
    }

    // An explicit target on an external link is respected
    if (match.includes('target=')) {
      return match;
    }

    if (!match.includes('rel=')) {
      return appendAttributes(match, ' target="_blank" rel="noopener noreferrer"');
    }

    // Extend an existing rel that lacks noopener, without duplicating noreferrer
    const withRel = match.replace(/rel\s*=\s*["']([^"']*)["']/gi, (relMatch: string, relValue: string) => {
      if (relValue.includes('noopener')) {
        return relMatch;
      }
      const extra = relValue.includes('noreferrer') ? 'noopener' : 'noopener noreferrer';
      return `rel="${relValue} ${extra}"`;
    });
    return appendAttributes(withRel, ' target="_blank"');
  });
}

244
src/processors/markdown.ts

@ -1,244 +0,0 @@ @@ -1,244 +0,0 @@
import { marked } from 'marked';
// @ts-ignore - marked is ESM but we need it to work in Jest
import { ParserOptions } from '../types';
import * as emoji from 'node-emoji';
/**
 * Result of converting a Markdown document.
 */
export interface MarkdownResult {
/** Rendered HTML. */
html: string;
/** Key/value pairs parsed from a leading `---` front matter block, when one was found and non-empty. */
frontmatter?: Record<string, any>;
/** True when the source matched the LaTeX detection pattern. */
hasLaTeX: boolean;
/** True when the source contained an `abc` or `music` fenced block. */
hasMusicalNotation: boolean;
}
/**
 * Extracts a leading YAML-style front matter block from markdown content.
 *
 * The block must start on the first line with `---` and end with a line
 * consisting of `---`; the closing line may be the last line of the input
 * (no trailing newline required).
 *
 * Only a small YAML subset is understood: one `key: value` pair per line,
 * `#` comment lines, optional single/double quotes around a value, and inline
 * arrays such as `[a, "b"]`. All values are kept as strings (or string arrays).
 *
 * @param content - Full markdown source.
 * @returns `content` without the front matter block, plus `frontmatter` when at
 *   least one key was parsed. When no block is found the input is returned unchanged.
 */
function extractFrontmatter(content: string): { frontmatter?: Record<string, any>; content: string } {
  // (?:\n|$) lets the closing fence be terminated by a newline or by end of input.
  const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*(?:\n|$)/;
  const match = content.match(frontmatterRegex);
  if (!match) {
    return { content };
  }

  // Simple YAML parser for basic key-value pairs
  const frontmatter: Record<string, any> = {};
  for (const line of match[1].split('\n')) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;

    const colonIndex = trimmed.indexOf(':');
    if (colonIndex === -1) continue;

    const key = trimmed.substring(0, colonIndex).trim();
    let value = trimmed.substring(colonIndex + 1).trim();

    // Remove quotes if present
    if ((value.startsWith('"') && value.endsWith('"')) ||
        (value.startsWith("'") && value.endsWith("'"))) {
      value = value.slice(1, -1);
    }

    // Handle arrays (simple case)
    if (value.startsWith('[') && value.endsWith(']')) {
      frontmatter[key] = value
        .slice(1, -1)
        .split(',')
        .map(v => v.trim().replace(/^["']|["']$/g, ''));
    } else {
      frontmatter[key] = value;
    }
  }

  return {
    frontmatter: Object.keys(frontmatter).length > 0 ? frontmatter : undefined,
    content: content.substring(match[0].length)
  };
}
/**
 * Process Markdown content to HTML (minimal markdown support)
 *
 * Pipeline, in order:
 * 1. Strip front matter and run LaTeX / musical-notation detection.
 * 2. Expand emoji shortcodes.
 * 3. Pull footnote definitions (`[^id]: text`) out of the source and replace
 *    references (`[^id]`) that have a definition with HTML span placeholders.
 * 4. Render with `marked`.
 * 5. Convert `^text^` to `<sup>` outside of `<pre>`/`<code>` elements.
 * 6. Swap footnote placeholders for numbered links and append a footnotes section.
 * 7. Add `id` attributes to headers that lack one.
 *
 * NOTE(review): the `options` parameter is not read anywhere in this function.
 *
 * @param content - Markdown source, optionally starting with front matter.
 * @param options - Parser options (currently unused here).
 * @returns Rendered HTML, parsed front matter, and the two detection flags.
 */
export function processMarkdown(content: string, options: ParserOptions): MarkdownResult {
// Extract frontmatter
const { frontmatter, content: contentWithoutFrontmatter } = extractFrontmatter(content);
// Detect LaTeX and musical notation
// (both tests run against the full input, front matter included)
const hasLaTeX = /```latex|`\$\[|`\$\\|`\$\$|`\$\{|\$\$|\$\{|\$[^$]/.test(content);
const hasMusicalNotation = /```abc|```music/i.test(content);
// Configure marked for minimal markdown
// NOTE(review): presumably this mutates marked's shared defaults on every call - confirm
marked.setOptions({
gfm: true,
breaks: false
});
// Process emoji shortcodes before markdown processing
let processedContent = emoji.emojify(contentWithoutFrontmatter);
// Extract and process footnotes before markdown parsing
// Footnotes format: [^1] in text and [^1]: definition at end
const footnoteDefinitions: Map<string, string> = new Map();
let placeholderCounter = 0;
// First, extract footnote definitions
const lines = processedContent.split('\n');
const processedLines: string[] = [];
let i = 0;
while (i < lines.length) {
const line = lines[i];
const footnoteDefMatch = line.match(/^\[\^([^\]]+)\]:\s*(.*)$/);
if (footnoteDefMatch) {
const id = footnoteDefMatch[1];
let definition = footnoteDefMatch[2];
// Collect multi-line definition (until next definition or blank line)
i++;
while (i < lines.length) {
const nextLine = lines[i];
// Stop at the next definition, or at a blank line followed by ordinary text
if (nextLine.match(/^\[\^[^\]]+\]:/) || (nextLine.trim() === '' && i + 1 < lines.length && lines[i + 1].trim() !== '' && !lines[i + 1].match(/^\[\^[^\]]+\]:/))) {
break;
}
// Stop at a blank line that directly precedes another definition
if (nextLine.trim() === '' && i + 1 < lines.length && lines[i + 1].match(/^\[\^[^\]]+\]:/)) {
break;
}
definition += '\n' + nextLine;
i++;
}
footnoteDefinitions.set(id, definition.trim());
// Skip adding this line to processedLines (removing the definition)
continue;
}
processedLines.push(line);
i++;
}
processedContent = processedLines.join('\n');
// Now replace footnote references with placeholders before markdown parsing
// Use HTML-like placeholder that markdown will pass through as-is
const footnoteRefRegex = /\[\^([^\]]+)\]/g;
let refMatch;
while ((refMatch = footnoteRefRegex.exec(processedContent)) !== null) {
const id = refMatch[1];
// References without a matching definition are left in the text untouched
if (footnoteDefinitions.has(id)) {
const placeholder = `<span data-footnote-placeholder="${placeholderCounter++}" data-footnote-id="${id}"></span>`;
processedContent = processedContent.substring(0, refMatch.index) +
placeholder +
processedContent.substring(refMatch.index + refMatch[0].length);
// Reset regex since we modified the string
footnoteRefRegex.lastIndex = 0;
}
}
// Convert markdown to HTML
let html = marked.parse(processedContent) as string;
// Process superscripts in HTML (X^2^ syntax) - after markdown parsing to avoid conflicts
// But skip inside code blocks
const codeBlockRegex = /<(pre|code)[^>]*>[\s\S]*?<\/\1>/gi;
const codeBlocks: Array<{ start: number; end: number; content: string }> = [];
let codeMatch;
while ((codeMatch = codeBlockRegex.exec(html)) !== null) {
codeBlocks.push({
start: codeMatch.index,
end: codeMatch.index + codeMatch[0].length,
content: codeMatch[0]
});
}
// True when `index` falls inside one of the recorded <pre>/<code> ranges
function isInCodeBlock(index: number): boolean {
return codeBlocks.some(block => index >= block.start && index < block.end);
}
// Process superscripts
const superscriptRegex = /\^([^\^<>\n]+)\^/g;
const superscriptReplacements: Array<{ match: string; replacement: string; index: number }> = [];
let supMatch;
while ((supMatch = superscriptRegex.exec(html)) !== null) {
if (isInCodeBlock(supMatch.index)) continue;
superscriptReplacements.push({
match: supMatch[0],
replacement: `<sup>${supMatch[1]}</sup>`,
index: supMatch.index
});
}
// Apply superscript replacements in reverse order
// (back to front, so earlier recorded indices stay valid while the string changes)
superscriptReplacements.reverse().forEach(({ match, replacement, index }) => {
html = html.substring(0, index) + replacement + html.substring(index + match.length);
});
// Replace footnote placeholders with actual footnote HTML
let footnoteCounter = 1;
const footnoteRefs: Array<{ id: string; num: number; definition: string }> = [];
const footnoteRefMap: Map<string, number> = new Map();
// First, assign numbers to all footnote definitions
// (numbering follows the order in which definitions were collected, not reference order)
footnoteDefinitions.forEach((definition, id) => {
const num = footnoteCounter++;
footnoteRefMap.set(id, num);
footnoteRefs.push({ id, num, definition });
});
// Replace HTML span placeholders with footnote HTML
// Find all span elements with data-footnote-placeholder attribute
const placeholderRegex = /<span data-footnote-placeholder="(\d+)" data-footnote-id="([^"]+)"><\/span>/g;
html = html.replace(placeholderRegex, (match, placeholderNum, id) => {
const num = footnoteRefMap.get(id);
if (num !== undefined) {
return `<sup class="footnote"><a id="footnoteref_${num}" class="footnote" href="#footnotedef_${num}" title="View footnote.">${num}</a></sup>`;
}
return match; // Return original if no definition found
});
// Add footnotes section at the end if there are any
if (footnoteRefs.length > 0) {
let footnotesHtml = '<div id="footnotes"><hr>';
footnoteRefs.forEach(({ id, num, definition }) => {
// Process the definition through markdown again to handle formatting
const defHtml = marked.parse(definition) as string;
footnotesHtml += `<div class="footnote" id="footnotedef_${num}"><a href="#footnoteref_${num}">${num}</a>. ${defHtml}</div>`;
});
footnotesHtml += '</div>';
html += footnotesHtml;
}
// Fix anchor links - markdown headers need IDs
// Marked generates headers but may not have proper IDs for anchor links
// Process headers to add IDs based on their text content (if they don't already have one)
// (the pattern only matches headers whose content contains no nested tags)
html = html.replace(/<h([1-6])([^>]*)>([^<]+)<\/h[1-6]>/gi, (match: string, level: string, attrs: string, text: string) => {
// Skip if header already has an id attribute
if (attrs && /id=["'][^"']+["']/i.test(attrs)) {
return match;
}
// Generate ID from header text (similar to GitHub markdown)
const id = text
.toLowerCase()
.trim()
.replace(/[^\w\s-]/g, '') // Remove special chars
.replace(/\s+/g, '-') // Replace spaces with hyphens
.replace(/-+/g, '-') // Replace multiple hyphens with single
.replace(/^-|-$/g, ''); // Remove leading/trailing hyphens
// Add id attribute
const newAttrs = attrs ? `${attrs} id="${id}"` : `id="${id}"`;
return `<h${level} ${newAttrs}>${text}</h${level}>`;
});
return {
html,
frontmatter,
hasLaTeX,
hasMusicalNotation
};
}

47
src/processors/music.ts

@ -0,0 +1,47 @@ @@ -0,0 +1,47 @@
/**
 * Wraps musical notation found in HTML content in elements a client-side
 * renderer can pick up.
 *
 * Passes, in order:
 * 1. ABC blocks (starting at an `X:<number>` line) -> `<div class="abc-notation" data-abc>`
 * 2. LilyPond `\relative ... }` fragments -> `<div class="lilypond-notation" data-lilypond>`
 * 3. Inline chords such as `[C]`, `[Am]`, `[F#m7]` -> `<span class="chord" data-chord>`
 * 4. `<music>...</music>` elements -> `<div class="musicxml-notation" data-musicxml>`
 *
 * The wrappers produced by passes 1 and 2 are temporarily swapped for opaque
 * tokens while the chord pass runs. Otherwise bracketed notes inside an ABC
 * tune (e.g. `[CEG]`) would be rewritten as chord spans inside the block's text
 * and inside its already-escaped `data-abc` attribute, corrupting both.
 *
 * @param html - HTML content to scan.
 * @returns HTML with notation wrapped.
 */
export function processMusicalNotation(html: string): string {
  // Finished wrappers parked here; the token uses NUL delimiters, which are not
  // expected to occur in rendered HTML and match none of the patterns below.
  const shielded: string[] = [];
  const shield = (markup: string): string => {
    const token = `\u0000MUSIC_BLOCK_${shielded.length}\u0000`;
    shielded.push(markup);
    return token;
  };

  // Process ABC notation blocks
  const abcBlockPattern = /(X:\s*\d+[^\n]*\n(?:[^\n]+\n)*)/gs;
  html = html.replace(abcBlockPattern, (match) => {
    const abcContent = match.trim();
    return shield(`<div class="abc-notation" data-abc="${escapeForAttr(abcContent)}">${abcContent}</div>`);
  });

  // Process LilyPond notation blocks
  const lilypondPattern = /(\\relative[^}]+})/gs;
  html = html.replace(lilypondPattern, (match) => {
    const lilypondContent = match.trim();
    return shield(`<div class="lilypond-notation" data-lilypond="${escapeForAttr(lilypondContent)}">${lilypondContent}</div>`);
  });

  // Process inline chord notation: [C], [Am], [F#m7], etc.
  const chordPattern = /\[([A-G][#b]?m?[0-9]?[^\[\]]*)\]/g;
  html = html.replace(chordPattern, (match, chord) => {
    return `<span class="chord" data-chord="${escapeForAttr(chord)}">[${chord}]</span>`;
  });

  // Put the ABC / LilyPond wrappers back before the MusicXML pass, so that pass
  // sees real markup (and escapes it properly) if a wrapper sits inside <music>.
  html = html.replace(/\u0000MUSIC_BLOCK_(\d+)\u0000/g, (_token, index) => shielded[Number(index)]);

  // Process MusicXML-like notation
  const musicxmlPattern = /(<music[^>]*>.*?<\/music>)/gs;
  html = html.replace(musicxmlPattern, (match) => {
    const musicxmlContent = match.trim();
    return `<div class="musicxml-notation" data-musicxml="${escapeForAttr(musicxmlContent)}">${musicxmlContent}</div>`;
  });

  return html;
}
/**
 * Makes a string safe to embed in a quoted HTML attribute: quotes and angle
 * brackets become entities, each line feed becomes a single space, and
 * carriage returns are dropped. Ampersands are left as they are.
 *
 * @param text - Raw attribute value.
 * @returns The escaped, single-line value.
 */
function escapeForAttr(text: string): string {
  const substitutions: Record<string, string> = {
    '"': '&quot;',
    "'": '&#39;',
    '<': '&lt;',
    '>': '&gt;',
    '\n': ' ',
    '\r': '',
  };
  // None of the substitutes contains a character from the class, so one pass is enough.
  return text.replace(/["'<>\n\r]/g, (ch) => substitutions[ch]);
}

31
src/types.ts

@ -16,20 +16,6 @@ export interface ParserOptions { @@ -16,20 +16,6 @@ export interface ParserOptions {
enableMusicalNotation?: boolean;
/** Enable nostr: address processing (default: true) */
enableNostrAddresses?: boolean;
/**
* Custom URL format for wikilinks. Can be:
* - A string template with {dtag} placeholder: "/d/{dtag}" or "/events?d={dtag}"
* - A function that takes dtag and returns URL: (dtag: string) => `/d/${dtag}`
* Default: "/events?d={dtag}"
*/
wikilinkUrl?: string | ((dtag: string) => string);
/**
* Custom URL format for hashtags. Can be:
* - A string template with {topic} placeholder: "/notes?t={topic}" or "/hashtag/{topic}"
* - A function that takes topic (hashtag without #) and returns URL: (topic: string) => `/notes?t=${topic}`
* Default: undefined (hashtags rendered as non-clickable spans)
*/
hashtagUrl?: string | ((topic: string) => string);
}
/**
@ -63,8 +49,6 @@ export interface ProcessResult { @@ -63,8 +49,6 @@ export interface ProcessResult {
hasLaTeX: boolean;
/** Indicates if musical notation was found */
hasMusicalNotation: boolean;
/** Extracted YAML front matter (if present) */
frontmatter?: Record<string, any>;
/** Extracted Nostr links */
nostrLinks: NostrLink[];
/** Extracted wikilinks */
@ -72,11 +56,7 @@ export interface ProcessResult { @@ -72,11 +56,7 @@ export interface ProcessResult {
/** Extracted hashtags */
hashtags: string[];
/** Extracted regular links */
links: Array<{
url: string;
text: string;
isExternal: boolean;
}>;
links: Array<{ url: string; text: string; isExternal: boolean }>;
/** Extracted media URLs */
media: string[];
}
@ -85,8 +65,9 @@ export interface ProcessResult { @@ -85,8 +65,9 @@ export interface ProcessResult {
* Detected content format
*/
export enum ContentFormat {
Unknown = "unknown",
AsciiDoc = "asciidoc",
Markdown = "markdown",
Plain = "plain"
Unknown = 'unknown',
AsciiDoc = 'asciidoc',
Markdown = 'markdown',
Wikipedia = 'wikipedia',
Plain = 'plain'
}

20
src/types/asciidoctor.d.ts vendored

@ -0,0 +1,20 @@ @@ -0,0 +1,20 @@
/**
* Type declarations for @asciidoctor/core
* These are minimal types - the actual types should come from the package
*/
// Ambient declaration covering only the part of the API declared below:
// the default factory export and a `convert` method.
declare module '@asciidoctor/core' {
/** Options bag accepted by `convert`. Every field is optional. */
interface ConvertOptions {
/** Safe mode name (passed as a string). */
safe?: string;
/** Output backend name. */
backend?: string;
/** Document type name. */
doctype?: string;
/** Document attributes, keyed by attribute name. */
attributes?: Record<string, any>;
// NOTE(review): left untyped; presumably an Asciidoctor extension registry - confirm
extension_registry?: any;
}
/** Processor instance returned by the factory. */
interface Asciidoctor {
/** Converts `content`; the result is typed loosely as `string | any`. */
convert(content: string, options?: ConvertOptions): string | any;
}
/** Factory that creates a processor instance. */
function asciidoctor(): Asciidoctor;
export default asciidoctor;
}

1
tsconfig.json

@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
"target": "ES2020",
"module": "commonjs",
"lib": ["ES2020"],
"types": ["node"],
"outDir": "./dist",
"rootDir": "./src",
"strict": true,

Loading…
Cancel
Save