clone of repo on github
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

762 lines
24 KiB

/**
* AsciiDoc Metadata Extraction Service using Asciidoctor
*
* Thin wrapper around Asciidoctor's built-in metadata extraction capabilities.
* Leverages the existing Pharos parser to avoid duplication.
*/
// @ts-ignore
import Processor from "asciidoctor";
import type { Document } from "asciidoctor";
export interface AsciiDocMetadata {
title?: string;
authors?: string[];
version?: string;
edition?: string;
publicationDate?: string;
publisher?: string;
summary?: string;
coverImage?: string;
isbn?: string;
tags?: string[];
source?: string;
publishedBy?: string;
type?: string;
autoUpdate?: 'yes' | 'ask' | 'no';
customAttributes?: Record<string, string>;
}
export type SectionMetadata = AsciiDocMetadata;
export interface ParsedAsciiDoc {
metadata: AsciiDocMetadata;
content: string;
title: string;
sections: Array<{
metadata: SectionMetadata;
content: string;
title: string;
}>;
}
// Shared attribute mapping based on Asciidoctor standard attributes
const ATTRIBUTE_MAP: Record<string, keyof AsciiDocMetadata> = {
// Standard Asciidoctor attributes
'author': 'authors',
'description': 'summary',
'keywords': 'tags',
'revnumber': 'version',
'revdate': 'publicationDate',
'revremark': 'edition',
'title': 'title',
// Custom attributes for Alexandria
'published_by': 'publishedBy',
'publisher': 'publisher',
'summary': 'summary',
'image': 'coverImage',
'cover': 'coverImage',
'isbn': 'isbn',
'source': 'source',
'type': 'type',
'auto-update': 'autoUpdate',
'version': 'version',
'edition': 'edition',
'published_on': 'publicationDate',
'date': 'publicationDate',
'version-label': 'version',
};
/**
* Creates an Asciidoctor processor instance
*/
function createProcessor() {
return Processor();
}
/**
* Extracts tags from attributes, combining tags and keywords
*/
function extractTagsFromAttributes(attributes: Record<string, any>): string[] {
const tags: string[] = [];
const attrTags = attributes['tags'];
const attrKeywords = attributes['keywords'];
if (attrTags && typeof attrTags === 'string') {
tags.push(...attrTags.split(',').map(tag => tag.trim()));
}
if (attrKeywords && typeof attrKeywords === 'string') {
tags.push(...attrKeywords.split(',').map(tag => tag.trim()));
}
return [...new Set(tags)]; // Remove duplicates
}
/**
* Maps attributes to metadata with special handling for authors and tags
*/
function mapAttributesToMetadata(attributes: Record<string, any>, metadata: AsciiDocMetadata, isDocument: boolean = false): void {
// List of AsciiDoc system attributes to ignore
const systemAttributes = [
'attribute-undefined', 'attribute-missing', 'appendix-caption', 'appendix-refsig',
'caution-caption', 'chapter-refsig', 'example-caption', 'figure-caption',
'important-caption', 'last-update-label', 'note-caption', 'part-refsig',
'section-refsig', 'table-caption', 'tip-caption', 'toc-placement',
'toc-title', 'untitled-label', 'warning-caption', 'asciidoctor-version',
'safe-mode-name', 'backend', 'user-home', 'doctype', 'htmlsyntax',
'outfilesuffix', 'filetype', 'basebackend', 'stylesdir', 'iconsdir',
'localdate', 'localyear', 'localtime', 'localdatetime', 'docdate',
'docyear', 'doctime', 'docdatetime', 'doctitle', 'language',
'firstname', 'authorinitials', 'authors'
];
for (const [key, value] of Object.entries(attributes)) {
const metadataKey = ATTRIBUTE_MAP[key.toLowerCase()];
if (metadataKey && value && typeof value === 'string') {
if (metadataKey === 'authors' && isDocument) {
// Skip author mapping for documents since it's handled manually
continue;
} else if (metadataKey === 'authors' && !isDocument) {
// For sections, append author to existing authors array
if (!metadata.authors) {
metadata.authors = [];
}
metadata.authors.push(value);
} else if (metadataKey === 'tags') {
// Skip tags mapping since it's handled by extractTagsFromAttributes
continue;
} else {
(metadata as any)[metadataKey] = value;
}
} else if (value && typeof value === 'string' && !systemAttributes.includes(key)) {
// Handle unknown/custom attributes - but only if they're not system attributes
if (!metadata.customAttributes) {
metadata.customAttributes = {};
}
metadata.customAttributes[key] = value;
}
}
}
/**
* Extracts authors from header line (document or section)
*/
function extractAuthorsFromHeader(sourceContent: string, isSection: boolean = false): string[] {
const authors: string[] = [];
const lines = sourceContent.split(/\r?\n/);
const headerPattern = isSection ? /^==\s+/ : /^=\s+/;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
if (line.match(headerPattern)) {
// Found title line, check subsequent lines for authors
let j = i + 1;
while (j < lines.length) {
const authorLine = lines[j];
// Stop if we hit a blank line or content that's not an author
if (authorLine.trim() === '') {
break;
}
// Skip section headers at any level (they start with ==, ===, etc.)
if (authorLine.match(/^==+\s+/)) {
// This is a section header, stop looking for authors
break;
}
if (authorLine.includes('<') && !authorLine.startsWith(':')) {
// This is an author line like "John Doe <john@example.com>"
const authorName = authorLine.split('<')[0].trim();
if (authorName) {
authors.push(authorName);
}
} else if (isSection && authorLine.match(/^[A-Za-z\s]+$/) && authorLine.trim() !== '' &&
authorLine.trim().split(/\s+/).length <= 2) {
// This is a simple author name without email (for sections)
authors.push(authorLine.trim());
} else if (authorLine.startsWith(':')) {
// This is an attribute line, skip it - attributes are handled by mapAttributesToMetadata
// Don't break here, continue to next line
} else {
// Not an author line, stop looking
break;
}
j++;
}
break;
}
}
return authors;
}
/**
* Strips header and attribute lines from content
*/
function stripHeaderAndAttributes(content: string, isSection: boolean = false): string {
const lines = content.split(/\r?\n/);
let contentStart = 0;
const headerPattern = isSection ? /^==\s+/ : /^=\s+/;
// Find the first line that is actual content (not header, author, or attribute)
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
// Skip title line, author line, revision line, and attribute lines
if (!line.match(headerPattern) && !line.includes('<') && !line.match(/^.+,\s*.+:\s*.+$/) &&
!line.match(/^:[^:]+:\s*.+$/) && line.trim() !== '') {
contentStart = i;
break;
}
}
// Filter out all attribute lines and author lines from the content
const contentLines = lines.slice(contentStart);
const filteredLines = contentLines.filter(line => {
// Skip attribute lines
if (line.match(/^:[^:]+:\s*.+$/)) {
return false;
}
// Skip author lines (simple names without email)
if (isSection && line.match(/^[A-Za-z\s]+$/) && line.trim() !== '' && line.trim().split(/\s+/).length <= 2) {
return false;
}
return true;
});
// Ensure deeper headers (====) have proper newlines around them
const processedLines = [];
for (let i = 0; i < filteredLines.length; i++) {
const line = filteredLines[i];
const prevLine = i > 0 ? filteredLines[i - 1] : '';
const nextLine = i < filteredLines.length - 1 ? filteredLines[i + 1] : '';
// If this is a deeper header (====+), ensure it has newlines around it
if (line.match(/^====+\s+/)) {
// Add newline before if previous line isn't blank
if (prevLine && prevLine.trim() !== '') {
processedLines.push('');
}
processedLines.push(line);
// Add newline after if next line isn't blank and exists
if (nextLine && nextLine.trim() !== '') {
processedLines.push('');
}
} else {
processedLines.push(line);
}
}
// Remove extra blank lines and normalize newlines
return processedLines.join('\n').replace(/\n\s*\n\s*\n/g, '\n\n').trim();
}
/**
* Parses attributes from section content using simple regex
* Converts :tagname: tagvalue -> [tagname, tagvalue]
* Converts :tags: comma,separated -> [t, tag1], [t, tag2], etc.
*/
function parseSimpleAttributes(content: string): [string, string][] {
const tags: [string, string][] = [];
const lines = content.split(/\r?\n/);
for (const line of lines) {
const match = line.match(/^:([^:]+):\s*(.+)$/);
if (match) {
const [, key, value] = match;
const tagName = key.trim();
const tagValue = value.trim();
if (tagName === 'tags') {
// Special handling for :tags: - split into individual t-tags
const tags_list = tagValue.split(',').map(t => t.trim()).filter(t => t.length > 0);
tags_list.forEach(tag => {
tags.push(['t', tag]);
});
} else {
// Regular attribute -> [tagname, tagvalue]
tags.push([tagName, tagValue]);
}
}
}
return tags;
}
/**
* Extracts metadata from AsciiDoc document using Asciidoctor
*/
export function extractDocumentMetadata(inputContent: string): {
metadata: AsciiDocMetadata;
content: string;
} {
const asciidoctor = createProcessor();
const document = asciidoctor.load(inputContent, { standalone: false }) as Document;
const metadata: AsciiDocMetadata = {};
const attributes = document.getAttributes();
// Extract basic metadata
const title = document.getTitle();
if (title) metadata.title = title;
// Handle multiple authors - combine header line and attributes
const authors = extractAuthorsFromHeader(document.getSource());
// Get authors from attributes (but avoid duplicates)
const attrAuthor = attributes['author'];
if (attrAuthor && typeof attrAuthor === 'string' && !authors.includes(attrAuthor)) {
authors.push(attrAuthor);
}
if (authors.length > 0) {
metadata.authors = [...new Set(authors)]; // Remove duplicates
}
// Extract revision info (only if it looks like valid revision data)
const revisionNumber = document.getRevisionNumber();
if (revisionNumber && revisionNumber !== 'Version' && !revisionNumber.includes('==')) {
metadata.version = revisionNumber;
}
const revisionRemark = document.getRevisionRemark();
if (revisionRemark && !revisionRemark.includes('[NOTE]') && !revisionRemark.includes('==')) {
metadata.publishedBy = revisionRemark;
}
const revisionDate = document.getRevisionDate();
if (revisionDate && !revisionDate.includes('[NOTE]') && !revisionDate.includes('==')) {
metadata.publicationDate = revisionDate;
}
// Map attributes to metadata (but skip version and publishedBy if we already have them from revision)
mapAttributesToMetadata(attributes, metadata, true);
// If we got version from revision, don't override it with attribute
if (revisionNumber) {
metadata.version = revisionNumber;
}
// If we got publishedBy from revision, don't override it with attribute
if (revisionRemark) {
metadata.publishedBy = revisionRemark;
}
// Handle tags and keywords
const tags = extractTagsFromAttributes(attributes);
if (tags.length > 0) {
metadata.tags = tags;
}
const content = stripHeaderAndAttributes(document.getSource());
return { metadata, content };
}
/**
* Extracts metadata from a section using Asciidoctor
*/
export function extractSectionMetadata(inputSectionContent: string): {
metadata: SectionMetadata;
content: string;
title: string;
} {
const asciidoctor = createProcessor();
const document = asciidoctor.load(`= Temp\n\n${inputSectionContent}`, { standalone: false }) as Document;
const sections = document.getSections();
if (sections.length === 0) {
return { metadata: {}, content: inputSectionContent, title: '' };
}
const section = sections[0];
const title = section.getTitle() || '';
const metadata: SectionMetadata = { title };
// Parse attributes from the section content (no longer used - we use simple parsing in generateNostrEvents)
const attributes = {};
// Extract authors from section content
const authors = extractAuthorsFromHeader(inputSectionContent, true);
if (authors.length > 0) {
metadata.authors = authors;
}
// Map attributes to metadata (sections can have authors)
mapAttributesToMetadata(attributes, metadata, false);
// Handle tags and keywords
const tags = extractTagsFromAttributes(attributes);
if (tags.length > 0) {
metadata.tags = tags;
}
const content = stripHeaderAndAttributes(inputSectionContent, true);
return { metadata, content, title };
}
/**
* Parses AsciiDoc content into sections with metadata
*/
export function parseAsciiDocWithMetadata(content: string): ParsedAsciiDoc {
const asciidoctor = createProcessor();
const document = asciidoctor.load(content, { standalone: false }) as Document;
const { metadata: docMetadata } = extractDocumentMetadata(content);
// Parse the original content to find section attributes
const lines = content.split(/\r?\n/);
const sectionsWithMetadata: Array<{
metadata: SectionMetadata;
content: string;
title: string;
}> = [];
let currentSection: string | null = null;
let currentSectionContent: string[] = [];
for (const line of lines) {
if (line.match(/^==\s+/)) {
// Save previous section if exists
if (currentSection) {
const sectionContent = currentSectionContent.join('\n');
sectionsWithMetadata.push(extractSectionMetadata(sectionContent));
}
// Start new section
currentSection = line;
currentSectionContent = [line];
} else if (currentSection) {
currentSectionContent.push(line);
}
}
// Save the last section
if (currentSection) {
const sectionContent = currentSectionContent.join('\n');
sectionsWithMetadata.push(extractSectionMetadata(sectionContent));
}
return {
metadata: docMetadata,
content: document.getSource(),
title: docMetadata.title || '',
sections: sectionsWithMetadata
};
}
/**
* Converts metadata to Nostr event tags
*/
export function metadataToTags(metadata: AsciiDocMetadata | SectionMetadata): [string, string][] {
const tags: [string, string][] = [];
if (metadata.title) tags.push(['title', metadata.title]);
if (metadata.authors?.length) {
metadata.authors.forEach(author => tags.push(['author', author]));
}
if (metadata.version) tags.push(['version', metadata.version]);
if (metadata.edition) tags.push(['edition', metadata.edition]);
if (metadata.publicationDate) tags.push(['published_on', metadata.publicationDate]);
if (metadata.publishedBy) tags.push(['published_by', metadata.publishedBy]);
if (metadata.summary) tags.push(['summary', metadata.summary]);
if (metadata.coverImage) tags.push(['image', metadata.coverImage]);
if (metadata.isbn) tags.push(['i', metadata.isbn]);
if (metadata.source) tags.push(['source', metadata.source]);
if (metadata.type) tags.push(['type', metadata.type]);
if (metadata.autoUpdate) tags.push(['auto-update', metadata.autoUpdate]);
if (metadata.tags?.length) {
metadata.tags.forEach(tag => tags.push(['t', tag]));
}
// Add custom attributes as tags, but filter out system attributes
if (metadata.customAttributes) {
const systemAttributes = [
'attribute-undefined', 'attribute-missing', 'appendix-caption', 'appendix-refsig',
'caution-caption', 'chapter-refsig', 'example-caption', 'figure-caption',
'important-caption', 'last-update-label', 'note-caption', 'part-refsig',
'section-refsig', 'table-caption', 'tip-caption', 'toc-placement',
'toc-title', 'untitled-label', 'warning-caption', 'asciidoctor-version',
'safe-mode-name', 'backend', 'user-home', 'doctype', 'htmlsyntax',
'outfilesuffix', 'filetype', 'basebackend', 'stylesdir', 'iconsdir',
'localdate', 'localyear', 'localtime', 'localdatetime', 'docdate',
'docyear', 'doctime', 'docdatetime', 'doctitle', 'language',
'firstname', 'authorinitials', 'authors'
];
Object.entries(metadata.customAttributes).forEach(([key, value]) => {
if (!systemAttributes.includes(key)) {
tags.push([key, value]);
}
});
}
return tags;
}
/**
* Removes metadata from AsciiDoc content
*/
export function removeMetadataFromContent(content: string): string {
const { content: cleanedContent } = extractDocumentMetadata(content);
return cleanedContent;
}
/**
* Extracts metadata from content that only contains sections (no document header)
* This is useful when content flows from ZettelEditor to EventInput
*/
export function extractMetadataFromSectionsOnly(content: string): {
metadata: AsciiDocMetadata;
content: string;
} {
const lines = content.split(/\r?\n/);
const sections: Array<{
metadata: SectionMetadata;
content: string;
title: string;
}> = [];
let currentSection: string | null = null;
let currentSectionContent: string[] = [];
// Parse sections from the content
for (const line of lines) {
if (line.match(/^==\s+/)) {
// Save previous section if exists
if (currentSection) {
const sectionContent = currentSectionContent.join('\n');
sections.push(extractSectionMetadata(sectionContent));
}
// Start new section
currentSection = line;
currentSectionContent = [line];
} else if (currentSection) {
currentSectionContent.push(line);
}
}
// Save the last section
if (currentSection) {
const sectionContent = currentSectionContent.join('\n');
sections.push(extractSectionMetadata(sectionContent));
}
// For section-only content, we don't have document metadata
// Return the first section's title as the document title if available
const metadata: AsciiDocMetadata = {};
if (sections.length > 0 && sections[0].title) {
metadata.title = sections[0].title;
}
return { metadata, content };
}
/**
* Iterative AsciiDoc parsing based on specified level
* Level 2: Only == sections become events (containing all subsections)
* Level 3: == sections become indices, === sections become events
* Level 4: === sections become indices, ==== sections become events, etc.
*/
export function parseAsciiDocIterative(content: string, parseLevel: number = 2): ParsedAsciiDoc {
const asciidoctor = createProcessor();
const document = asciidoctor.load(content, { standalone: false }) as Document;
const { metadata: docMetadata } = extractDocumentMetadata(content);
const lines = content.split(/\r?\n/);
const targetHeaderPattern = new RegExp(`^${'='.repeat(parseLevel)}\\s+`);
const sections: Array<{
metadata: SectionMetadata;
content: string;
title: string;
}> = [];
let currentSection: string | null = null;
let currentSectionContent: string[] = [];
let documentContent: string[] = [];
let inDocumentHeader = true;
for (const line of lines) {
// Check if we've hit the first section at our target level
if (line.match(targetHeaderPattern)) {
inDocumentHeader = false;
// Save previous section if exists
if (currentSection) {
const sectionContent = currentSectionContent.join('\n');
sections.push(extractSectionMetadata(sectionContent));
}
// Start new section
currentSection = line;
currentSectionContent = [line];
} else if (currentSection) {
// We're in a section - add content
currentSectionContent.push(line);
} else if (inDocumentHeader) {
// We're still in document content (before first section)
documentContent.push(line);
}
}
// Save the last section
if (currentSection) {
const sectionContent = currentSectionContent.join('\n');
sections.push(extractSectionMetadata(sectionContent));
}
// Extract document content (everything before first section at target level)
// Keep the original content with attributes for simple parsing
const docContent = documentContent.join('\n');
return {
metadata: docMetadata,
content: docContent,
title: docMetadata.title || '',
sections: sections
};
}
/**
* Generates Nostr events from parsed AsciiDoc
* Based on docreference.md specifications
*/
export function generateNostrEvents(parsed: ParsedAsciiDoc, parseLevel: number = 2, pubkey?: string): {
indexEvent?: any;
contentEvents: any[];
} {
const events: any[] = [];
// Create content events for each section (30041)
const contentEvents = parsed.sections.map(section => {
const sectionId = section.title
.toLowerCase()
.replace(/[^a-z0-9\s]/g, '')
.replace(/\s+/g, '-')
.trim();
// Extract tags directly from section content using simple regex
const sectionTags = parseSimpleAttributes(section.content);
return {
id: '', // Will be generated by Nostr client
pubkey: '', // Will be set by client
created_at: Math.floor(Date.now() / 1000),
kind: 30041,
tags: [
['d', sectionId],
['title', section.title],
...sectionTags
],
content: section.content,
sig: '' // Will be generated by client
};
});
// Only create index event if we have a document title (article format)
if (parsed.title && parsed.title.trim() !== '') {
// Generate document identifier from title
const documentId = parsed.title
.toLowerCase()
.replace(/[^a-z0-9\s]/g, '')
.replace(/\s+/g, '-')
.trim();
// Extract tags directly from document content using simple regex
const documentTags = parseSimpleAttributes(parsed.content);
// Create main index event (30040)
const indexEvent = {
id: '', // Will be generated by Nostr client
pubkey: '', // Will be set by client
created_at: Math.floor(Date.now() / 1000),
kind: 30040,
tags: [
['d', documentId],
['title', parsed.title],
...documentTags,
// Add a-tags for each section
...parsed.sections.map(section => {
const sectionId = section.title
.toLowerCase()
.replace(/[^a-z0-9\s]/g, '')
.replace(/\s+/g, '-')
.trim();
const actualPubkey = pubkey || 'pubkey'; // Use actual pubkey if provided, fallback for compatibility
return ['a', `30041:${actualPubkey}:${sectionId}`, '', '']; // relay will be filled by client
})
],
content: '', // Index events have empty content
sig: '' // Will be generated by client
};
return {
indexEvent,
contentEvents
};
}
// For scattered notes, return only content events
return {
contentEvents
};
}
/**
* Detects content type for smart publishing
*/
export function detectContentType(content: string): 'article' | 'scattered-notes' | 'none' {
const hasDocTitle = content.trim().startsWith('=') && !content.trim().startsWith('==');
const hasSections = content.includes('==');
if (hasDocTitle) {
return 'article';
} else if (hasSections) {
return 'scattered-notes';
} else {
return 'none';
}
}
/**
* Smart metadata extraction that handles both document headers and section-only content
*/
export function extractSmartMetadata(content: string): {
metadata: AsciiDocMetadata;
content: string;
} {
// Check if content has a document header
const hasDocumentHeader = content.match(/^=\s+/m);
if (hasDocumentHeader) {
// Check if it's a minimal document header (just title, no other metadata)
const lines = content.split(/\r?\n/);
const titleLine = lines.find(line => line.match(/^=\s+/));
const hasOtherMetadata = lines.some(line =>
line.includes('<') || // author line
line.match(/^.+,\s*.+:\s*.+$/) // revision line
);
if (hasOtherMetadata) {
// Full document with metadata - use standard extraction
return extractDocumentMetadata(content);
} else {
// Minimal document header (just title) - preserve the title line for 30040 events
const title = titleLine?.replace(/^=\s+/, '').trim();
const metadata: AsciiDocMetadata = {};
if (title) {
metadata.title = title;
}
// Keep the title line in content for 30040 events
return { metadata, content };
}
} else {
return extractMetadataFromSectionsOnly(content);
}
}