|
|
|
|
@ -1,18 +1,14 @@
@@ -1,18 +1,14 @@
|
|
|
|
|
/** |
|
|
|
|
* AsciiDoc Metadata Extraction Service |
|
|
|
|
* AsciiDoc Metadata Extraction Service using Asciidoctor |
|
|
|
|
*
|
|
|
|
|
* Extracts metadata from AsciiDoc document headers and section headers, |
|
|
|
|
* mapping them to Nostr event tags according to NKBIP-01 specification. |
|
|
|
|
*
|
|
|
|
|
* Document header structure: |
|
|
|
|
* = Document Title |
|
|
|
|
* Author Name <email@example.com> |
|
|
|
|
* version, date, revision info |
|
|
|
|
* :attribute: value |
|
|
|
|
*
|
|
|
|
|
* The first empty line marks the end of the header and start of the document body. |
|
|
|
|
* Thin wrapper around Asciidoctor's built-in metadata extraction capabilities. |
|
|
|
|
* Leverages the existing Pharos parser to avoid duplication. |
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
// @ts-ignore
|
|
|
|
|
import Processor from "asciidoctor"; |
|
|
|
|
import type { Document } from "asciidoctor"; |
|
|
|
|
|
|
|
|
|
export interface AsciiDocMetadata { |
|
|
|
|
title?: string; |
|
|
|
|
authors?: string[]; |
|
|
|
|
@ -30,7 +26,6 @@ export interface AsciiDocMetadata {
@@ -30,7 +26,6 @@ export interface AsciiDocMetadata {
|
|
|
|
|
autoUpdate?: 'yes' | 'ask' | 'no'; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Sections use the same metadata structure as documents
|
|
|
|
|
export type SectionMetadata = AsciiDocMetadata; |
|
|
|
|
|
|
|
|
|
export interface ParsedAsciiDoc { |
|
|
|
|
@ -43,448 +38,463 @@ export interface ParsedAsciiDoc {
@@ -43,448 +38,463 @@ export interface ParsedAsciiDoc {
|
|
|
|
|
}>; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/**
 * Attribute-name to metadata-field table.
 *
 * Keys are lower-cased AsciiDoc attribute names; values are the
 * `AsciiDocMetadata` field each attribute populates. Several names may feed
 * the same field (for example `image` and `cover` both map to `coverImage`).
 */
const ATTRIBUTE_ENTRIES: Record<string, keyof AsciiDocMetadata> = {
  // Standard Asciidoctor attributes
  'author': 'authors',
  'description': 'summary',
  'keywords': 'tags',
  'revnumber': 'version',
  'revdate': 'publicationDate',
  'revremark': 'edition',
  'title': 'title',

  // Custom attributes for Alexandria
  'published_by': 'publishedBy',
  'publisher': 'publisher',
  'summary': 'summary',
  'image': 'coverImage',
  'cover': 'coverImage',
  'isbn': 'isbn',
  'source': 'source',
  'type': 'type',
  'auto-update': 'autoUpdate',
  'version': 'version',
  'edition': 'edition',
  'published_on': 'publicationDate',
  'date': 'publicationDate',
  'version-label': 'version',
};

// Shared attribute mapping based on Asciidoctor standard attributes.
// The table is copied onto a prototype-less object because it is indexed with
// attribute names read from documents: on a plain object literal a name such
// as `constructor` would resolve to an inherited (truthy) member instead of
// `undefined`.
const ATTRIBUTE_MAP: Record<string, keyof AsciiDocMetadata> = Object.assign(
  Object.create(null),
  ATTRIBUTE_ENTRIES,
);
|
|
|
|
|
|
|
|
|
/**
 * Obtains an Asciidoctor processor by calling the default export of the
 * `asciidoctor` package.
 *
 * @returns Whatever `Processor()` returns; callers in this file use its
 *          `load` method to parse AsciiDoc source.
 */
function createProcessor() {
  return Processor();
}
|
|
|
|
|
|
|
|
|
/**
 * Collects tags from the `tags` and `keywords` attributes.
 *
 * Each attribute, when present as a string, is treated as a comma-separated
 * list. Entries are trimmed, empty entries (from `a,,b` or a trailing comma)
 * are dropped, and duplicates are removed while keeping first-seen order
 * (`tags` entries before `keywords` entries).
 *
 * @param attributes Attribute name/value pairs; non-string values are ignored.
 * @returns The de-duplicated list of non-empty tags (possibly empty).
 */
function extractTagsFromAttributes(attributes: Record<string, unknown>): string[] {
  const collected = new Set<string>();

  for (const attributeName of ['tags', 'keywords']) {
    const raw = attributes[attributeName];
    if (typeof raw !== 'string') {
      continue;
    }

    for (const piece of raw.split(',')) {
      const tag = piece.trim();
      // Skip blanks so downstream tag conversion never emits an empty tag.
      if (tag !== '') {
        collected.add(tag);
      }
    }
  }

  return [...collected];
}
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Shared function to parse metadata from attribute entries |
|
|
|
|
* @param metadata The metadata object to populate |
|
|
|
|
* @param key The attribute key |
|
|
|
|
* @param value The attribute value |
|
|
|
|
* Maps attributes to metadata with special handling for authors and tags |
|
|
|
|
*/ |
|
|
|
|
function parseMetadataAttribute(metadata: AsciiDocMetadata, key: string, value: string): void { |
|
|
|
|
switch (key.toLowerCase()) { |
|
|
|
|
case 'author': |
|
|
|
|
// Accumulate multiple authors
|
|
|
|
|
function mapAttributesToMetadata(attributes: Record<string, any>, metadata: AsciiDocMetadata, isDocument: boolean = false): void { |
|
|
|
|
for (const [key, value] of Object.entries(attributes)) { |
|
|
|
|
const metadataKey = ATTRIBUTE_MAP[key.toLowerCase()]; |
|
|
|
|
if (metadataKey && value && typeof value === 'string') { |
|
|
|
|
if (metadataKey === 'authors' && isDocument) { |
|
|
|
|
// Skip author mapping for documents since it's handled manually
|
|
|
|
|
continue; |
|
|
|
|
} else if (metadataKey === 'authors' && !isDocument) { |
|
|
|
|
// For sections, append author to existing authors array
|
|
|
|
|
if (!metadata.authors) { |
|
|
|
|
metadata.authors = []; |
|
|
|
|
} |
|
|
|
|
metadata.authors.push(value); |
|
|
|
|
break; |
|
|
|
|
case 'version': |
|
|
|
|
// Only set version if not already set from revision line
|
|
|
|
|
if (!metadata.version) { |
|
|
|
|
metadata.version = value; |
|
|
|
|
} |
|
|
|
|
break; |
|
|
|
|
case 'edition': |
|
|
|
|
metadata.edition = value; |
|
|
|
|
break; |
|
|
|
|
case 'published_on': |
|
|
|
|
case 'date': |
|
|
|
|
metadata.publicationDate = value; |
|
|
|
|
break; |
|
|
|
|
case 'published_by': |
|
|
|
|
case 'publisher': |
|
|
|
|
// Only set publishedBy if not already set from revision line
|
|
|
|
|
if (!metadata.publishedBy) { |
|
|
|
|
metadata.publishedBy = value; |
|
|
|
|
} |
|
|
|
|
break; |
|
|
|
|
case 'summary': |
|
|
|
|
case 'description': |
|
|
|
|
// Accumulate multiple summaries/descriptions
|
|
|
|
|
if (!metadata.summary) { |
|
|
|
|
metadata.summary = value; |
|
|
|
|
} else if (metadataKey === 'tags') { |
|
|
|
|
// Skip tags mapping since it's handled by extractTagsFromAttributes
|
|
|
|
|
continue; |
|
|
|
|
} else { |
|
|
|
|
// If we already have a summary, append this one
|
|
|
|
|
metadata.summary = metadata.summary + ' ' + value; |
|
|
|
|
(metadata as any)[metadataKey] = value; |
|
|
|
|
} |
|
|
|
|
break; |
|
|
|
|
case 'image': |
|
|
|
|
case 'cover': |
|
|
|
|
metadata.coverImage = value; |
|
|
|
|
break; |
|
|
|
|
case 'isbn': |
|
|
|
|
metadata.isbn = value; |
|
|
|
|
break; |
|
|
|
|
case 'source': |
|
|
|
|
metadata.source = value; |
|
|
|
|
break; |
|
|
|
|
case 'type': |
|
|
|
|
metadata.type = value; |
|
|
|
|
break; |
|
|
|
|
case 'auto-update': |
|
|
|
|
if (value === 'yes' || value === 'ask' || value === 'no') { |
|
|
|
|
metadata.autoUpdate = value; |
|
|
|
|
} |
|
|
|
|
break; |
|
|
|
|
case 'tags': |
|
|
|
|
case 'keywords': |
|
|
|
|
// Accumulate multiple tag sets
|
|
|
|
|
if (!metadata.tags) { |
|
|
|
|
metadata.tags = []; |
|
|
|
|
} |
|
|
|
|
const newTags = value.split(',').map(tag => tag.trim()); |
|
|
|
|
metadata.tags.push(...newTags); |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Shared function to extract metadata from header lines |
|
|
|
|
* @param lines The lines to process |
|
|
|
|
* @param startLine The starting line index |
|
|
|
|
* @param metadata The metadata object to populate |
|
|
|
|
* @returns The index of the line after the header metadata |
|
|
|
|
* Extracts authors from header line (document or section) |
|
|
|
|
*/ |
|
|
|
|
function extractHeaderMetadata(lines: string[], startLine: number, metadata: AsciiDocMetadata): number { |
|
|
|
|
let currentLine = startLine; |
|
|
|
|
|
|
|
|
|
// Process the next two lines for author and revision info
|
|
|
|
|
let processedLines = 0; |
|
|
|
|
for (let i = 0; i < 2 && currentLine + i < lines.length; i++) { |
|
|
|
|
const line = lines[currentLine + i]; |
|
|
|
|
|
|
|
|
|
// Skip empty lines
|
|
|
|
|
if (line.trim() === '') { |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
function extractAuthorsFromHeader(sourceContent: string, isSection: boolean = false): string[] { |
|
|
|
|
const authors: string[] = []; |
|
|
|
|
const lines = sourceContent.split(/\r?\n/); |
|
|
|
|
const headerPattern = isSection ? /^==\s+/ : /^=\s+/; |
|
|
|
|
|
|
|
|
|
// Skip attribute lines (they'll be processed later)
|
|
|
|
|
if (line.startsWith(':')) { |
|
|
|
|
continue; |
|
|
|
|
for (let i = 0; i < lines.length; i++) { |
|
|
|
|
const line = lines[i]; |
|
|
|
|
if (line.match(headerPattern)) { |
|
|
|
|
// Found title line, check subsequent lines for authors
|
|
|
|
|
let j = i + 1; |
|
|
|
|
while (j < lines.length) { |
|
|
|
|
const authorLine = lines[j]; |
|
|
|
|
|
|
|
|
|
// Stop if we hit a blank line or content that's not an author
|
|
|
|
|
if (authorLine.trim() === '') { |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Check if this is an author line (contains <email>)
|
|
|
|
|
if (line.includes('<') && line.includes('>')) { |
|
|
|
|
const authorMatch = line.match(/^(.+?)\s*<(.+?)>$/); |
|
|
|
|
if (authorMatch) { |
|
|
|
|
const authorName = authorMatch[1].trim(); |
|
|
|
|
metadata.authors = [authorName]; |
|
|
|
|
processedLines++; |
|
|
|
|
continue; |
|
|
|
|
if (authorLine.includes('<') && !authorLine.startsWith(':')) { |
|
|
|
|
// This is an author line like "John Doe <john@example.com>"
|
|
|
|
|
const authorName = authorLine.split('<')[0].trim(); |
|
|
|
|
if (authorName) { |
|
|
|
|
authors.push(authorName); |
|
|
|
|
} |
|
|
|
|
} else if (isSection && authorLine.match(/^[A-Za-z\s]+$/) && authorLine.trim() !== '' && authorLine.trim().split(/\s+/).length <= 2) { |
|
|
|
|
// This is a simple author name without email (for sections)
|
|
|
|
|
authors.push(authorLine.trim()); |
|
|
|
|
} else if (authorLine.startsWith(':')) { |
|
|
|
|
// This is an attribute line, skip it - attributes are handled by mapAttributesToMetadata
|
|
|
|
|
// Don't break here, continue to next line
|
|
|
|
|
} else { |
|
|
|
|
// Not an author line, stop looking
|
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Check if this is a revision line (contains version, date, revision info)
|
|
|
|
|
const revisionMatch = line.match(/^(.+?),\s*(.+?),\s*(.+)$/); |
|
|
|
|
if (revisionMatch) { |
|
|
|
|
metadata.version = revisionMatch[1].trim(); |
|
|
|
|
metadata.publicationDate = revisionMatch[2].trim(); |
|
|
|
|
metadata.publishedBy = revisionMatch[3].trim(); |
|
|
|
|
processedLines++; |
|
|
|
|
continue; |
|
|
|
|
j++; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// If it's not author or revision, it might be a simple author name
|
|
|
|
|
if (!metadata.authors) { |
|
|
|
|
metadata.authors = [line.trim()]; |
|
|
|
|
processedLines++; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Move past the author/revision lines that were actually processed
|
|
|
|
|
currentLine += processedLines; |
|
|
|
|
return authors; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Process attribute entries (lines starting with :)
|
|
|
|
|
while (currentLine < lines.length) { |
|
|
|
|
const line = lines[currentLine]; |
|
|
|
|
/** |
|
|
|
|
* Strips header and attribute lines from content |
|
|
|
|
*/ |
|
|
|
|
function stripHeaderAndAttributes(content: string, isSection: boolean = false): string { |
|
|
|
|
const lines = content.split(/\r?\n/); |
|
|
|
|
let contentStart = 0; |
|
|
|
|
const headerPattern = isSection ? /^==\s+/ : /^=\s+/; |
|
|
|
|
|
|
|
|
|
// Empty line marks the end of the header
|
|
|
|
|
if (line.trim() === '') { |
|
|
|
|
for (let i = 0; i < lines.length; i++) { |
|
|
|
|
const line = lines[i]; |
|
|
|
|
// Skip title line, author line, revision line, and attribute lines
|
|
|
|
|
if (!line.match(headerPattern) && !line.includes('<') && !line.match(/^.+,\s*.+:\s*.+$/) &&
|
|
|
|
|
!line.match(/^:[^:]+:\s*.+$/) && line.trim() !== '') { |
|
|
|
|
contentStart = i; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Check for attribute entries
|
|
|
|
|
const attrMatch = line.match(/^:([^:]+):\s*(.+)$/); |
|
|
|
|
if (attrMatch) { |
|
|
|
|
const key = attrMatch[1].trim(); |
|
|
|
|
const value = attrMatch[2].trim(); |
|
|
|
|
parseMetadataAttribute(metadata, key, value); |
|
|
|
|
// Filter out all attribute lines and author lines from the content
|
|
|
|
|
const contentLines = lines.slice(contentStart); |
|
|
|
|
const filteredLines = contentLines.filter(line => { |
|
|
|
|
// Skip attribute lines
|
|
|
|
|
if (line.match(/^:[^:]+:\s*.+$/)) { |
|
|
|
|
return false; |
|
|
|
|
} |
|
|
|
|
// Skip author lines (simple names without email)
|
|
|
|
|
if (isSection && line.match(/^[A-Za-z\s]+$/) && line.trim() !== '' && line.trim().split(/\s+/).length <= 2) { |
|
|
|
|
return false; |
|
|
|
|
} |
|
|
|
|
return true; |
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
currentLine++; |
|
|
|
|
// Remove extra blank lines and normalize newlines
|
|
|
|
|
return filteredLines.join('\n').replace(/\n\s*\n\s*\n/g, '\n\n').replace(/\n\s*\n/g, '\n').trim(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return currentLine; |
|
|
|
|
/**
 * Parses attribute entries (`:name: value`) out of section content.
 *
 * The content is split on LF or CRLF and every line is tested on its own; a
 * line counts as an attribute entry when it starts with `:name:` followed by
 * at least one character. Names and values are trimmed. When a name occurs
 * more than once the last value wins. Entries whose name is blank after
 * trimming are ignored.
 *
 * @param sectionContent Raw AsciiDoc text of a section.
 * @returns A map from attribute name (case preserved) to its trimmed value.
 */
function parseSectionAttributes(sectionContent: string): Record<string, string> {
  const attributeEntry = /^:([^:]+):\s*(.+)$/;
  const attributes: Record<string, string> = {};

  for (const line of sectionContent.split(/\r?\n/)) {
    const match = attributeEntry.exec(line);
    if (!match) {
      continue;
    }

    const name = match[1].trim();
    // `[^:]+` also matches whitespace-only names such as `: : x`; skip those.
    if (name === '') {
      continue;
    }

    attributes[name] = match[2].trim();
  }

  return attributes;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Extracts metadata from AsciiDoc document header |
|
|
|
|
* @param content The full AsciiDoc content |
|
|
|
|
* @returns Object containing metadata and cleaned content |
|
|
|
|
* Extracts metadata from AsciiDoc document using Asciidoctor |
|
|
|
|
*/ |
|
|
|
|
export function extractDocumentMetadata(inputContent: string): { |
|
|
|
|
metadata: AsciiDocMetadata; |
|
|
|
|
content: string; |
|
|
|
|
} { |
|
|
|
|
const lines = inputContent.split(/\r?\n/); |
|
|
|
|
const asciidoctor = createProcessor(); |
|
|
|
|
const document = asciidoctor.load(inputContent, { standalone: false }) as Document; |
|
|
|
|
|
|
|
|
|
const metadata: AsciiDocMetadata = {}; |
|
|
|
|
let headerEndIndex = -1; |
|
|
|
|
let currentLine = 0; |
|
|
|
|
const attributes = document.getAttributes(); |
|
|
|
|
|
|
|
|
|
// Find the document title (first line starting with =)
|
|
|
|
|
for (let i = 0; i < lines.length; i++) { |
|
|
|
|
const line = lines[i]; |
|
|
|
|
const titleMatch = line.match(/^=\s+(.+)$/); |
|
|
|
|
if (titleMatch) { |
|
|
|
|
metadata.title = titleMatch[1].trim(); |
|
|
|
|
currentLine = i + 1; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
// Extract basic metadata
|
|
|
|
|
const title = document.getTitle(); |
|
|
|
|
if (title) metadata.title = title; |
|
|
|
|
|
|
|
|
|
// Handle multiple authors - combine header line and attributes
|
|
|
|
|
const authors = extractAuthorsFromHeader(document.getSource()); |
|
|
|
|
|
|
|
|
|
// Get authors from attributes (but avoid duplicates)
|
|
|
|
|
const attrAuthor = attributes['author']; |
|
|
|
|
if (attrAuthor && typeof attrAuthor === 'string' && !authors.includes(attrAuthor)) { |
|
|
|
|
authors.push(attrAuthor); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// If no document title found, return empty metadata
|
|
|
|
|
if (!metadata.title) { |
|
|
|
|
return { metadata: {}, content: inputContent }; |
|
|
|
|
if (authors.length > 0) { |
|
|
|
|
metadata.authors = [...new Set(authors)]; // Remove duplicates
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Check if this is an index card format (title followed immediately by "index card")
|
|
|
|
|
if (currentLine < lines.length && lines[currentLine].trim() === 'index card') { |
|
|
|
|
// This is index card format - content starts immediately after title
|
|
|
|
|
headerEndIndex = currentLine; |
|
|
|
|
} else { |
|
|
|
|
// Extract header metadata using shared function
|
|
|
|
|
currentLine = extractHeaderMetadata(lines, currentLine, metadata); |
|
|
|
|
// Extract revision info
|
|
|
|
|
const revisionNumber = document.getRevisionNumber(); |
|
|
|
|
if (revisionNumber) metadata.version = revisionNumber; |
|
|
|
|
|
|
|
|
|
// If we didn't find an empty line, the header ends at the first section
|
|
|
|
|
if (currentLine < lines.length && lines[currentLine].trim() === '') { |
|
|
|
|
headerEndIndex = currentLine + 1; // Skip the empty line
|
|
|
|
|
} else { |
|
|
|
|
for (let i = currentLine; i < lines.length; i++) { |
|
|
|
|
if (lines[i].match(/^==\s+/)) { |
|
|
|
|
headerEndIndex = i; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
// If no section found and no empty line, the header ends at the current line
|
|
|
|
|
if (headerEndIndex === -1) { |
|
|
|
|
headerEndIndex = currentLine; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
const revisionRemark = document.getRevisionRemark(); |
|
|
|
|
if (revisionRemark) metadata.publishedBy = revisionRemark; |
|
|
|
|
|
|
|
|
|
const revisionDate = document.getRevisionDate(); |
|
|
|
|
if (revisionDate) metadata.publicationDate = revisionDate; |
|
|
|
|
|
|
|
|
|
// If still no header end found, use the entire content
|
|
|
|
|
if (headerEndIndex === -1) { |
|
|
|
|
headerEndIndex = lines.length; |
|
|
|
|
// Map attributes to metadata (but skip version and publishedBy if we already have them from revision)
|
|
|
|
|
mapAttributesToMetadata(attributes, metadata, true); |
|
|
|
|
|
|
|
|
|
// If we got version from revision, don't override it with attribute
|
|
|
|
|
if (revisionNumber) { |
|
|
|
|
metadata.version = revisionNumber; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Extract the content (everything after the header)
|
|
|
|
|
let content = lines.slice(headerEndIndex).join('\n'); |
|
|
|
|
// If we got publishedBy from revision, don't override it with attribute
|
|
|
|
|
if (revisionRemark) { |
|
|
|
|
metadata.publishedBy = revisionRemark; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Remove metadata attributes from sections in the content
|
|
|
|
|
content = content.replace(/^:([^:]+):\s*(.+)$/gm, ''); |
|
|
|
|
// Handle tags and keywords
|
|
|
|
|
const tags = extractTagsFromAttributes(attributes); |
|
|
|
|
if (tags.length > 0) { |
|
|
|
|
metadata.tags = tags; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
const content = stripHeaderAndAttributes(document.getSource()); |
|
|
|
|
return { metadata, content }; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Extracts metadata from a section header |
|
|
|
|
* @param sectionContent The section content including its header |
|
|
|
|
* @returns Object containing section metadata and cleaned content |
|
|
|
|
* Extracts metadata from a section using Asciidoctor |
|
|
|
|
*/ |
|
|
|
|
export function extractSectionMetadata(inputSectionContent: string): { |
|
|
|
|
metadata: SectionMetadata; |
|
|
|
|
content: string; |
|
|
|
|
title: string; |
|
|
|
|
} { |
|
|
|
|
const lines = inputSectionContent.split(/\r?\n/); |
|
|
|
|
const metadata: SectionMetadata = {}; |
|
|
|
|
let title = ''; |
|
|
|
|
let headerEndIndex = -1; |
|
|
|
|
let currentLine = 0; |
|
|
|
|
const asciidoctor = createProcessor(); |
|
|
|
|
const document = asciidoctor.load(`= Temp\n\n${inputSectionContent}`, { standalone: false }) as Document; |
|
|
|
|
const sections = document.getSections(); |
|
|
|
|
|
|
|
|
|
// Find the section title (first line starting with ==)
|
|
|
|
|
for (let i = 0; i < lines.length; i++) { |
|
|
|
|
const line = lines[i]; |
|
|
|
|
const titleMatch = line.match(/^==\s+(.+)$/); |
|
|
|
|
if (titleMatch) { |
|
|
|
|
title = titleMatch[1].trim(); |
|
|
|
|
metadata.title = title; |
|
|
|
|
currentLine = i + 1; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// If no section title found, return empty metadata
|
|
|
|
|
if (!title) { |
|
|
|
|
if (sections.length === 0) { |
|
|
|
|
return { metadata: {}, content: inputSectionContent, title: '' }; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Extract header metadata using shared function
|
|
|
|
|
currentLine = extractHeaderMetadata(lines, currentLine, metadata); |
|
|
|
|
const section = sections[0]; |
|
|
|
|
const title = section.getTitle() || ''; |
|
|
|
|
const metadata: SectionMetadata = { title }; |
|
|
|
|
|
|
|
|
|
// If we didn't find an empty line, the header ends at the next section
|
|
|
|
|
if (currentLine < lines.length && lines[currentLine].trim() === '') { |
|
|
|
|
headerEndIndex = currentLine + 1; // Skip the empty line
|
|
|
|
|
} else { |
|
|
|
|
for (let i = currentLine; i < lines.length; i++) { |
|
|
|
|
if (lines[i].match(/^==\s+/)) { |
|
|
|
|
headerEndIndex = i; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
// Parse attributes from the section content
|
|
|
|
|
const attributes = parseSectionAttributes(inputSectionContent); |
|
|
|
|
|
|
|
|
|
// If still no header end found, use the entire content
|
|
|
|
|
if (headerEndIndex === -1) { |
|
|
|
|
headerEndIndex = lines.length; |
|
|
|
|
// Extract authors from section content
|
|
|
|
|
const authors = extractAuthorsFromHeader(inputSectionContent, true); |
|
|
|
|
if (authors.length > 0) { |
|
|
|
|
metadata.authors = authors; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Extract the content (everything after the header)
|
|
|
|
|
const content = lines.slice(headerEndIndex).join('\n'); |
|
|
|
|
// Map attributes to metadata (sections can have authors)
|
|
|
|
|
mapAttributesToMetadata(attributes, metadata, false); |
|
|
|
|
|
|
|
|
|
// Handle tags and keywords
|
|
|
|
|
const tags = extractTagsFromAttributes(attributes); |
|
|
|
|
if (tags.length > 0) { |
|
|
|
|
metadata.tags = tags; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
const content = stripHeaderAndAttributes(inputSectionContent, true); |
|
|
|
|
return { metadata, content, title }; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Splits AsciiDoc content into sections and extracts metadata from each |
|
|
|
|
* @param content The full AsciiDoc content |
|
|
|
|
* @returns Object containing document metadata and sections with their metadata |
|
|
|
|
* Parses AsciiDoc content into sections with metadata |
|
|
|
|
*/ |
|
|
|
|
export function parseAsciiDocWithMetadata(content: string): ParsedAsciiDoc { |
|
|
|
|
// First extract document metadata
|
|
|
|
|
const asciidoctor = createProcessor(); |
|
|
|
|
const document = asciidoctor.load(content, { standalone: false }) as Document; |
|
|
|
|
const { metadata: docMetadata } = extractDocumentMetadata(content); |
|
|
|
|
|
|
|
|
|
// Find the document header end to get the content after the header
|
|
|
|
|
// Parse the original content to find section attributes
|
|
|
|
|
const lines = content.split(/\r?\n/); |
|
|
|
|
let currentLine = 0; |
|
|
|
|
|
|
|
|
|
// Find the document title
|
|
|
|
|
for (let i = 0; i < lines.length; i++) { |
|
|
|
|
const line = lines[i]; |
|
|
|
|
const titleMatch = line.match(/^=\s+(.+)$/); |
|
|
|
|
if (titleMatch) { |
|
|
|
|
currentLine = i + 1; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Extract header metadata to find where content starts
|
|
|
|
|
const tempMetadata: AsciiDocMetadata = {}; |
|
|
|
|
currentLine = extractHeaderMetadata(lines, currentLine, tempMetadata); |
|
|
|
|
|
|
|
|
|
// Get the content after the header (including sections with metadata)
|
|
|
|
|
const docContent = lines.slice(currentLine).join('\n'); |
|
|
|
|
|
|
|
|
|
// Split into sections
|
|
|
|
|
const sections = splitAsciiDocSections(docContent); |
|
|
|
|
|
|
|
|
|
// Extract metadata from each section
|
|
|
|
|
const sectionsWithMetadata = sections.map(section => { |
|
|
|
|
return extractSectionMetadata(section); |
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
return { |
|
|
|
|
metadata: docMetadata, |
|
|
|
|
content: docContent, |
|
|
|
|
sections: sectionsWithMetadata |
|
|
|
|
}; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Splits AsciiDoc content into sections at each '==' header |
|
|
|
|
* @param content The AsciiDoc content (without document header) |
|
|
|
|
* @returns Array of section strings |
|
|
|
|
*/ |
|
|
|
|
function splitAsciiDocSections(content: string): string[] { |
|
|
|
|
const lines = content.split(/\r?\n/); |
|
|
|
|
const sections: string[] = []; |
|
|
|
|
let currentSection: string[] = []; |
|
|
|
|
let inSection = false; |
|
|
|
|
const sectionsWithMetadata: Array<{ |
|
|
|
|
metadata: SectionMetadata; |
|
|
|
|
content: string; |
|
|
|
|
title: string; |
|
|
|
|
}> = []; |
|
|
|
|
let currentSection: string | null = null; |
|
|
|
|
let currentSectionContent: string[] = []; |
|
|
|
|
|
|
|
|
|
for (const line of lines) { |
|
|
|
|
// Check if this is a section header
|
|
|
|
|
if (line.match(/^==\s+/)) { |
|
|
|
|
// Save the previous section if we have one
|
|
|
|
|
if (inSection && currentSection.length > 0) { |
|
|
|
|
sections.push(currentSection.join('\n').trim()); |
|
|
|
|
currentSection = []; |
|
|
|
|
// Save previous section if exists
|
|
|
|
|
if (currentSection) { |
|
|
|
|
const sectionContent = currentSectionContent.join('\n'); |
|
|
|
|
sectionsWithMetadata.push(extractSectionMetadata(sectionContent)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Start new section
|
|
|
|
|
currentSection = [line]; |
|
|
|
|
inSection = true; |
|
|
|
|
} else if (inSection) { |
|
|
|
|
// Add line to current section
|
|
|
|
|
currentSection.push(line); |
|
|
|
|
currentSection = line; |
|
|
|
|
currentSectionContent = [line]; |
|
|
|
|
} else if (currentSection) { |
|
|
|
|
currentSectionContent.push(line); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Add the last section
|
|
|
|
|
if (currentSection.length > 0) { |
|
|
|
|
sections.push(currentSection.join('\n').trim()); |
|
|
|
|
// Save the last section
|
|
|
|
|
if (currentSection) { |
|
|
|
|
const sectionContent = currentSectionContent.join('\n'); |
|
|
|
|
sectionsWithMetadata.push(extractSectionMetadata(sectionContent)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return sections; |
|
|
|
|
return { |
|
|
|
|
metadata: docMetadata, |
|
|
|
|
content: document.getSource(), |
|
|
|
|
sections: sectionsWithMetadata |
|
|
|
|
}; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Converts metadata to Nostr event tags |
|
|
|
|
* @param metadata The metadata object |
|
|
|
|
* @returns Array of [tag, value] pairs |
|
|
|
|
*/ |
|
|
|
|
export function metadataToTags(metadata: AsciiDocMetadata | SectionMetadata): [string, string][] { |
|
|
|
|
const tags: [string, string][] = []; |
|
|
|
|
|
|
|
|
|
// Don't add title to tags since it has its own dedicated field
|
|
|
|
|
// if (metadata.title) {
|
|
|
|
|
// tags.push(['title', metadata.title]);
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
if (metadata.authors && metadata.authors.length > 0) { |
|
|
|
|
metadata.authors.forEach(author => { |
|
|
|
|
tags.push(['author', author]); |
|
|
|
|
}); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.version) { |
|
|
|
|
tags.push(['version', metadata.version]); |
|
|
|
|
if (metadata.title) tags.push(['title', metadata.title]); |
|
|
|
|
if (metadata.authors?.length) { |
|
|
|
|
metadata.authors.forEach(author => tags.push(['author', author])); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.edition) { |
|
|
|
|
tags.push(['edition', metadata.edition]); |
|
|
|
|
if (metadata.version) tags.push(['version', metadata.version]); |
|
|
|
|
if (metadata.edition) tags.push(['edition', metadata.edition]); |
|
|
|
|
if (metadata.publicationDate) tags.push(['published_on', metadata.publicationDate]); |
|
|
|
|
if (metadata.publishedBy) tags.push(['published_by', metadata.publishedBy]); |
|
|
|
|
if (metadata.summary) tags.push(['summary', metadata.summary]); |
|
|
|
|
if (metadata.coverImage) tags.push(['image', metadata.coverImage]); |
|
|
|
|
if (metadata.isbn) tags.push(['i', metadata.isbn]); |
|
|
|
|
if (metadata.source) tags.push(['source', metadata.source]); |
|
|
|
|
if (metadata.type) tags.push(['type', metadata.type]); |
|
|
|
|
if (metadata.autoUpdate) tags.push(['auto-update', metadata.autoUpdate]); |
|
|
|
|
if (metadata.tags?.length) { |
|
|
|
|
metadata.tags.forEach(tag => tags.push(['t', tag])); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.publicationDate) { |
|
|
|
|
tags.push(['published_on', metadata.publicationDate]); |
|
|
|
|
return tags; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.publishedBy) { |
|
|
|
|
tags.push(['published_by', metadata.publishedBy]); |
|
|
|
|
/**
 * Removes metadata from AsciiDoc content.
 *
 * Delegates to `extractDocumentMetadata` and returns only the `content` part
 * of its result; the extracted metadata is discarded.
 *
 * @param content The AsciiDoc source to clean.
 * @returns The `content` string produced by `extractDocumentMetadata`.
 */
export function removeMetadataFromContent(content: string): string {
  return extractDocumentMetadata(content).content;
}
|
|
|
|
|
|
|
|
|
|
if (metadata.summary) { |
|
|
|
|
tags.push(['summary', metadata.summary]); |
|
|
|
|
} |
|
|
|
|
/** |
|
|
|
|
* Extracts metadata from content that only contains sections (no document header) |
|
|
|
|
* This is useful when content flows from ZettelEditor to EventInput |
|
|
|
|
*/ |
|
|
|
|
export function extractMetadataFromSectionsOnly(content: string): { |
|
|
|
|
metadata: AsciiDocMetadata; |
|
|
|
|
content: string; |
|
|
|
|
} { |
|
|
|
|
const lines = content.split(/\r?\n/); |
|
|
|
|
const sections: Array<{ |
|
|
|
|
metadata: SectionMetadata; |
|
|
|
|
content: string; |
|
|
|
|
title: string; |
|
|
|
|
}> = []; |
|
|
|
|
|
|
|
|
|
if (metadata.coverImage) { |
|
|
|
|
tags.push(['image', metadata.coverImage]); |
|
|
|
|
} |
|
|
|
|
let currentSection: string | null = null; |
|
|
|
|
let currentSectionContent: string[] = []; |
|
|
|
|
|
|
|
|
|
if (metadata.isbn) { |
|
|
|
|
tags.push(['i', metadata.isbn]); |
|
|
|
|
// Parse sections from the content
|
|
|
|
|
for (const line of lines) { |
|
|
|
|
if (line.match(/^==\s+/)) { |
|
|
|
|
// Save previous section if exists
|
|
|
|
|
if (currentSection) { |
|
|
|
|
const sectionContent = currentSectionContent.join('\n'); |
|
|
|
|
sections.push(extractSectionMetadata(sectionContent)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.source) { |
|
|
|
|
tags.push(['source', metadata.source]); |
|
|
|
|
// Start new section
|
|
|
|
|
currentSection = line; |
|
|
|
|
currentSectionContent = [line]; |
|
|
|
|
} else if (currentSection) { |
|
|
|
|
currentSectionContent.push(line); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.type) { |
|
|
|
|
tags.push(['type', metadata.type]); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.autoUpdate) { |
|
|
|
|
tags.push(['auto-update', metadata.autoUpdate]); |
|
|
|
|
// Save the last section
|
|
|
|
|
if (currentSection) { |
|
|
|
|
const sectionContent = currentSectionContent.join('\n'); |
|
|
|
|
sections.push(extractSectionMetadata(sectionContent)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (metadata.tags && metadata.tags.length > 0) { |
|
|
|
|
metadata.tags.forEach(tag => { |
|
|
|
|
tags.push(['t', tag]); |
|
|
|
|
}); |
|
|
|
|
// For section-only content, we don't have document metadata
|
|
|
|
|
// Return the first section's title as the document title if available
|
|
|
|
|
const metadata: AsciiDocMetadata = {}; |
|
|
|
|
if (sections.length > 0 && sections[0].title) { |
|
|
|
|
metadata.title = sections[0].title; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return tags; |
|
|
|
|
return { metadata, content }; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* Removes metadata from AsciiDoc content, leaving only the actual content |
|
|
|
|
* @param content The full AsciiDoc content |
|
|
|
|
* @returns Cleaned content without metadata |
|
|
|
|
* Smart metadata extraction that handles both document headers and section-only content |
|
|
|
|
*/ |
|
|
|
|
export function removeMetadataFromContent(content: string): string { |
|
|
|
|
const { content: docContent } = extractDocumentMetadata(content); |
|
|
|
|
export function extractSmartMetadata(content: string): { |
|
|
|
|
metadata: AsciiDocMetadata; |
|
|
|
|
content: string; |
|
|
|
|
} { |
|
|
|
|
// Check if content has a document header
|
|
|
|
|
const hasDocumentHeader = content.match(/^=\s+/m); |
|
|
|
|
|
|
|
|
|
// Remove metadata attributes from sections in the content
|
|
|
|
|
const cleanedContent = docContent.replace(/^:([^:]+):\s*(.+)$/gm, ''); |
|
|
|
|
if (hasDocumentHeader) { |
|
|
|
|
// Check if it's a minimal document header (just title, no other metadata)
|
|
|
|
|
const lines = content.split(/\r?\n/); |
|
|
|
|
const titleLine = lines.find(line => line.match(/^=\s+/)); |
|
|
|
|
const hasOtherMetadata = lines.some(line =>
|
|
|
|
|
line.includes('<') || // author line
|
|
|
|
|
line.match(/^.+,\s*.+:\s*.+$/) // revision line
|
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
if (hasOtherMetadata) { |
|
|
|
|
// Full document with metadata - use standard extraction
|
|
|
|
|
return extractDocumentMetadata(content); |
|
|
|
|
} else { |
|
|
|
|
// Minimal document header (just title) - preserve the title line for 30040 events
|
|
|
|
|
const title = titleLine?.replace(/^=\s+/, '').trim(); |
|
|
|
|
const metadata: AsciiDocMetadata = {}; |
|
|
|
|
if (title) { |
|
|
|
|
metadata.title = title; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return cleanedContent; |
|
|
|
|
// Keep the title line in content for 30040 events
|
|
|
|
|
return { metadata, content }; |
|
|
|
|
} |
|
|
|
|
} else { |
|
|
|
|
return extractMetadataFromSectionsOnly(content); |
|
|
|
|
} |
|
|
|
|
}
|