You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
724 lines
20 KiB
724 lines
20 KiB
/** |
|
* AsciiDoc Metadata Extraction Service using Asciidoctor |
|
* |
|
* Thin wrapper around Asciidoctor's built-in metadata extraction capabilities. |
|
*/ |
|
|
|
// @ts-ignore |
|
import Processor from "asciidoctor"; |
|
import type { Document } from "asciidoctor"; |
|
|
|
/**
 * Metadata collected from an AsciiDoc document or section.
 *
 * Every field is optional; a field is absent when no value was found for it.
 */
export interface AsciiDocMetadata {
  /** Title text of the document or section. */
  title?: string;

  /** Author names, one entry per author. */
  authors?: string[];

  /** Version string (fed by the `revnumber` / `version` attributes). */
  version?: string;

  /** Edition string (fed by the `revremark` / `edition` attributes). */
  edition?: string;

  /** Publication date (fed by `revdate`, `published_on` or `date`). */
  publicationDate?: string;

  /** Publisher name (fed by the `publisher` attribute). */
  publisher?: string;

  /** Summary text (fed by the `description` / `summary` attributes). */
  summary?: string;

  /** Cover image reference (fed by the `image` / `cover` attributes). */
  coverImage?: string;

  /** ISBN value (fed by the `isbn` attribute). */
  isbn?: string;

  /** Topic tags. */
  tags?: string[];

  /** Source reference (fed by the `source` attribute). */
  source?: string;

  /** Value of the `published_by` attribute. */
  publishedBy?: string;

  /** Value of the `type` attribute. */
  type?: string;

  /** Value of the `auto-update` attribute. */
  autoUpdate?: "yes" | "ask" | "no";

  /** Attributes without a dedicated field, keyed by attribute name. */
  customAttributes?: Record<string, string>;
}
|
|
|
/** Metadata of a single section; it has exactly the same shape as document metadata. */
export type SectionMetadata = AsciiDocMetadata;
|
|
|
// Lookup table from attribute name to the metadata field it populates.
// Lookups use the lower-cased attribute name; several names may feed one field.
const ATTRIBUTE_MAP: Record<string, keyof AsciiDocMetadata> = {
  // Title
  title: "title",

  // Authors
  author: "authors",

  // Summary
  description: "summary",
  summary: "summary",

  // Tags
  keywords: "tags",

  // Version
  revnumber: "version",
  version: "version",
  // NOTE(review): "version-label" also appears in `systemAttributes`, and
  // Asciidoctor presumably defines it by default - confirm that feeding it
  // into `version` is intended.
  "version-label": "version",

  // Edition
  revremark: "edition",
  edition: "edition",

  // Publication date
  revdate: "publicationDate",
  published_on: "publicationDate",
  date: "publicationDate",

  // Publisher information
  publisher: "publisher",
  published_by: "publishedBy",

  // Cover image
  image: "coverImage",
  cover: "coverImage",

  // Remaining one-to-one attributes
  isbn: "isbn",
  source: "source",
  type: "type",
  "auto-update": "autoUpdate",
};
|
|
|
/**
 * Builds a new Asciidoctor processor by invoking the imported factory.
 *
 * @returns Whatever the `Processor()` factory returns.
 */
function createProcessor() {
  const processor = Processor();
  return processor;
}
|
|
|
/**
 * Decodes a small, fixed set of HTML entities in a string.
 *
 * Typographic quotes are normalised to their straight ASCII forms. The text is
 * decoded in a single pass, so an escaped entity such as `&amp;lt;` becomes the
 * literal text `&lt;` rather than being decoded a second time into `<`.
 * Entities that are not in the table are left untouched.
 *
 * @param text - Text that may contain HTML entities.
 * @returns The text with all known entities replaced.
 */
function decodeHtmlEntities(text: string): string {
  const entities: Record<string, string> = {
    // Typographic quotes (numeric and named forms) -> straight quotes
    "&#8217;": "'",
    "&#8216;": "'",
    "&#8220;": '"',
    "&#8221;": '"',
    "&rsquo;": "'",
    "&lsquo;": "'",
    "&ldquo;": '"',
    "&rdquo;": '"',
    // Markup-significant characters
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    // Apostrophe variants
    "&#39;": "'",
    "&#x27;": "'",
    "&apos;": "'",
  };

  // One pass over the input: each entity-shaped token is looked up exactly
  // once, so the output of one replacement is never re-scanned.
  return text.replace(
    /&(?:#\d+|#x[0-9a-fA-F]+|[a-zA-Z]+);/g,
    (entity) => entities[entity] ?? entity,
  );
}
|
|
|
/**
 * Collects tags from the `tags` and `keywords` attributes.
 *
 * Both attributes are treated as comma-separated lists. Entries are trimmed,
 * empty entries (for example from a trailing or doubled comma) are dropped,
 * and duplicates are removed while keeping first-seen order.
 *
 * @param attributes - Attribute map; non-string values are ignored.
 * @returns The de-duplicated list of tags (`tags` entries first).
 */
function extractTagsFromAttributes(
  attributes: Record<string, unknown>,
): string[] {
  const collected: string[] = [];

  for (const attributeName of ["tags", "keywords"]) {
    const raw = attributes[attributeName];
    if (typeof raw !== "string") {
      continue;
    }
    for (const piece of raw.split(",")) {
      const tag = piece.trim();
      // Skip blanks so "a,,b" or "a, " never yields an empty tag
      if (tag.length > 0) {
        collected.push(tag);
      }
    }
  }

  return [...new Set(collected)];
}
|
|
|
/**
 * Copies attribute values into `metadata`, mutating it in place.
 *
 * Only non-empty string values are considered. The lower-cased attribute name
 * is looked up in `ATTRIBUTE_MAP`:
 * - `authors`: skipped for documents (handled by the caller); appended to
 *   `metadata.authors` for sections.
 * - `tags`: always skipped (tags are extracted separately).
 * - `summary`: appended to an existing summary, separated by a space.
 * - any other mapped field: assigned directly.
 *
 * Attributes with no mapping are stored in `metadata.customAttributes` under
 * their original name, unless the name is listed in `systemAttributes`.
 *
 * @param attributes - Attribute name/value pairs.
 * @param metadata - Metadata object to populate.
 * @param isDocument - `true` when mapping document-level attributes.
 */
function mapAttributesToMetadata(
  attributes: Record<string, any>,
  metadata: AsciiDocMetadata,
  isDocument: boolean = false,
): void {
  for (const [key, value] of Object.entries(attributes)) {
    if (!value || typeof value !== "string") {
      continue;
    }

    // Own-property check: a bare ATTRIBUTE_MAP[name] lookup would also find
    // inherited members such as "constructor" or "tostring"-like names.
    const lookupName = key.toLowerCase();
    const metadataKey = Object.prototype.hasOwnProperty.call(
      ATTRIBUTE_MAP,
      lookupName,
    )
      ? ATTRIBUTE_MAP[lookupName]
      : undefined;

    if (metadataKey === undefined) {
      // Unknown attribute: keep it as a custom attribute unless it is a
      // system attribute.
      if (!systemAttributes.includes(key)) {
        if (!metadata.customAttributes) {
          metadata.customAttributes = {};
        }
        metadata.customAttributes[key] = value;
      }
      continue;
    }

    switch (metadataKey) {
      case "authors":
        // Document authors are collected by the caller
        if (!isDocument) {
          if (!metadata.authors) {
            metadata.authors = [];
          }
          metadata.authors.push(value);
        }
        break;
      case "tags":
        // Tags are collected by extractTagsFromAttributes
        break;
      case "summary":
        metadata.summary = metadata.summary
          ? `${metadata.summary} ${value}`
          : value;
        break;
      default:
        Object.assign(metadata, { [metadataKey]: value });
    }
  }
}
|
|
|
/**
 * Extracts author names from the document header (not from sections).
 *
 * The lines directly below the first document title (`= Title`) are scanned.
 * A line containing `<` is read as an author line; several authors may share
 * one line when separated by `"; "`, and each name is the text before that
 * author's `<email>`. Attribute lines (starting with `:`) are skipped.
 * Scanning stops at the first blank line, section heading, or any other line.
 *
 * @param sourceContent - Full AsciiDoc source.
 * @returns Author names in order of appearance; empty when there is no title.
 */
function extractDocumentAuthors(sourceContent: string): string[] {
  const authors: string[] = [];
  const lines = sourceContent.split(/\r?\n/);

  const titleLineIndex = lines.findIndex((line) => /^=\s+/.test(line));
  if (titleLineIndex === -1) {
    return authors;
  }

  for (const line of lines.slice(titleLineIndex + 1)) {
    // A blank line or a section heading ends the header
    if (line.trim() === "" || /^==\s+/.test(line)) {
      break;
    }
    // Attribute entries may be interleaved with author lines
    if (line.startsWith(":")) {
      continue;
    }
    // Anything without an email marker is not an author line
    if (!line.includes("<")) {
      break;
    }
    // "Name <email>; Other Name <email>" - one entry per author
    for (const entry of line.split(/;(?: |$)/)) {
      const authorName = entry.split("<")[0].trim();
      if (authorName) {
        authors.push(authorName);
      }
    }
  }

  return authors;
}
|
|
|
/**
 * Extracts author names from the lines directly below a section heading.
 *
 * The first `== Heading` line is located and the following lines are scanned:
 * - attribute lines (starting with `:`) are skipped;
 * - a line containing `<` is read as an author line; several authors may
 *   share one line when separated by `"; "`, each name being the text before
 *   that author's `<email>`;
 * - a line made only of letters and whitespace with at most two words is
 *   taken as a plain author name.
 * Scanning stops at the first blank line, section heading, or any other line.
 *
 * @param sectionContent - AsciiDoc source of one section.
 * @returns Author names in order of appearance; empty when no heading exists.
 */
function extractSectionAuthors(sectionContent: string): string[] {
  const authors: string[] = [];
  const lines = sectionContent.split(/\r?\n/);

  const titleLineIndex = lines.findIndex((line) => /^==\s+/.test(line));
  if (titleLineIndex === -1) {
    return authors;
  }

  for (const line of lines.slice(titleLineIndex + 1)) {
    // A blank line or another section heading ends the header
    if (line.trim() === "" || /^==\s+/.test(line)) {
      break;
    }
    // Attribute entries may be interleaved with author lines
    if (line.startsWith(":")) {
      continue;
    }

    if (line.includes("<")) {
      // "Name <email>; Other Name <email>" - one entry per author
      for (const entry of line.split(/;(?: |$)/)) {
        const authorName = entry.split("<")[0].trim();
        if (authorName) {
          authors.push(authorName);
        }
      }
      continue;
    }

    // Plain "First Last" style name without an email address
    const isPlainName =
      /^[A-Za-z\s]+$/.test(line) && line.trim().split(/\s+/).length <= 2;
    if (!isPlainName) {
      break;
    }
    authors.push(line.trim());
  }

  return authors;
}
|
|
|
// Attribute names treated as system attributes: they are never stored as
// custom attributes and never emitted as custom tags.
const systemAttributes: string[] = [
  // Attribute-resolution settings
  "attribute-missing",
  "attribute-undefined",

  // Block captions
  "appendix-caption",
  "caution-caption",
  "example-caption",
  "figure-caption",
  "important-caption",
  "note-caption",
  "table-caption",
  "tip-caption",
  "warning-caption",

  // Cross-reference signifiers
  "appendix-refsig",
  "chapter-refsig",
  "part-refsig",
  "section-refsig",

  // Labels and titles
  "last-update-label",
  "manname-title",
  "preface-title",
  "toc-title",
  "untitled-label",
  "version-label",
];
|
|
|
/**
 * Removes the section header and all attribute lines from section content.
 *
 * The header is the heading line (`=`, `==`, ...) plus the unbroken run of
 * lines directly below it that look like author lines (contain `<`), revision
 * lines (`x, y: z`) or attribute entries (`:name: value`). The header ends at
 * the first blank line or the first line that matches none of these, so body
 * text after the blank separator is never mistaken for header material, and a
 * header-only section yields an empty string.
 *
 * In the remaining content, attribute entries are dropped and runs of blank
 * lines are collapsed to a single blank line.
 *
 * @param sectionContent - AsciiDoc source of one section.
 * @returns The trimmed section body.
 */
function stripSectionHeader(sectionContent: string): string {
  const lines = sectionContent.split(/\r?\n/);
  const attributeEntry = /^:[^:]+:\s*.+$/;
  const isBlank = (line: string): boolean => line.trim() === "";
  const isHeaderDetail = (line: string): boolean =>
    line.includes("<") ||
    /^.+,\s*.+:\s*.+$/.test(line) ||
    attributeEntry.test(line);

  // Skip anything that precedes the heading (blank lines, attribute entries)
  let contentStart = 0;
  while (
    contentStart < lines.length &&
    (isBlank(lines[contentStart]) || attributeEntry.test(lines[contentStart]))
  ) {
    contentStart++;
  }

  // Consume the heading and the header lines attached to it
  if (contentStart < lines.length && /^=+\s+/.test(lines[contentStart])) {
    contentStart++;
    while (
      contentStart < lines.length &&
      !isBlank(lines[contentStart]) &&
      isHeaderDetail(lines[contentStart])
    ) {
      contentStart++;
    }
  }

  const processedLines: string[] = [];
  let lastWasEmpty = false;

  for (const line of lines.slice(contentStart)) {
    // Attribute entries inside the body are metadata, not content
    if (attributeEntry.test(line)) {
      continue;
    }

    if (isBlank(line)) {
      // Keep at most one blank line in a row
      if (!lastWasEmpty) {
        processedLines.push("");
      }
      lastWasEmpty = true;
    } else {
      processedLines.push(line);
      lastWasEmpty = false;
    }
  }

  return processedLines
    .join("\n")
    .replace(/\n\s*\n\s*\n/g, "\n\n")
    .trim();
}
|
|
|
/**
 * Removes the document header and all attribute lines from document content.
 *
 * The header is the document title line (`= Title`) plus the unbroken run of
 * lines directly below it that look like author lines (contain `<`), revision
 * lines (`x, y: z`) or attribute entries (`:name: value`). The header ends at
 * the first blank line or the first line that matches none of these, so body
 * text after the blank separator is never mistaken for header material, and a
 * header-only document yields an empty string.
 *
 * In the remaining content, attribute entries are dropped, headings of level
 * `====` and deeper are surrounded by blank lines, and runs of blank lines are
 * collapsed.
 *
 * @param content - Full AsciiDoc source.
 * @returns The trimmed document body.
 */
function stripDocumentHeader(content: string): string {
  const lines = content.split(/\r?\n/);
  const attributeEntry = /^:[^:]+:\s*.+$/;
  const isBlank = (line: string): boolean => line.trim() === "";
  const isHeaderDetail = (line: string): boolean =>
    line.includes("<") ||
    /^.+,\s*.+:\s*.+$/.test(line) ||
    attributeEntry.test(line);

  // Skip anything that precedes the title (blank lines, attribute entries)
  let contentStart = 0;
  while (
    contentStart < lines.length &&
    (isBlank(lines[contentStart]) || attributeEntry.test(lines[contentStart]))
  ) {
    contentStart++;
  }

  // Consume the title and the header lines attached to it
  if (contentStart < lines.length && /^=\s+/.test(lines[contentStart])) {
    contentStart++;
    while (
      contentStart < lines.length &&
      !isBlank(lines[contentStart]) &&
      isHeaderDetail(lines[contentStart])
    ) {
      contentStart++;
    }
  }

  // Attribute entries inside the body are metadata, not content
  const filteredLines = lines
    .slice(contentStart)
    .filter((line) => !attributeEntry.test(line));

  // Make sure deep headings (====+) are separated from neighbouring text
  const processedLines: string[] = [];
  filteredLines.forEach((line, index) => {
    if (!/^====+\s+/.test(line)) {
      processedLines.push(line);
      return;
    }

    const prevLine = index > 0 ? filteredLines[index - 1] : "";
    const nextLine =
      index < filteredLines.length - 1 ? filteredLines[index + 1] : "";

    if (prevLine && prevLine.trim() !== "") {
      processedLines.push("");
    }
    processedLines.push(line);
    if (nextLine && nextLine.trim() !== "") {
      processedLines.push("");
    }
  });

  return processedLines
    .join("\n")
    .replace(/\n\s*\n\s*\n/g, "\n\n")
    .trim();
}
|
|
|
/**
 * Reads `:name: value` attribute lines out of AsciiDoc content.
 *
 * Each matching line becomes a `[name, value]` pair (both trimmed). The
 * `tags` attribute is special: its value is split on commas and every
 * non-empty entry becomes its own `["t", entry]` pair.
 *
 * @param content - AsciiDoc text to scan line by line.
 * @returns The pairs in the order the lines appear.
 */
export function parseSimpleAttributes(content: string): [string, string][] {
  const attributePattern = /^:([^:]+):\s*(.+)$/;
  const pairs: [string, string][] = [];

  content.split(/\r?\n/).forEach((line) => {
    const found = attributePattern.exec(line);
    if (found === null) {
      return;
    }

    const name = found[1].trim();
    const value = found[2].trim();

    if (name !== "tags") {
      // Ordinary attribute: one pair
      pairs.push([name, value]);
      return;
    }

    // :tags: fans out into one "t" pair per non-empty entry
    for (const piece of value.split(",")) {
      const entry = piece.trim();
      if (entry.length > 0) {
        pairs.push(["t", entry]);
      }
    }
  });

  return pairs;
}
|
|
|
/**
 * Extracts metadata from an AsciiDoc document using Asciidoctor.
 *
 * Steps:
 * 1. Load the document and read its title (HTML entities decoded).
 * 2. Collect authors from the header author line and from `:author:` lines
 *    that appear before the first `==` section heading.
 * 3. Validate the revision number, remark and date reported by Asciidoctor.
 *    A value is rejected when it contains `==`, when the number is the literal
 *    `"Version"`, or when the remark/date contains `[NOTE]`.
 * 4. Map the remaining attributes. Rejected revision values are withheld from
 *    the mapping so they cannot re-enter through the `revnumber`, `revdate`
 *    or `revremark` attributes; accepted revision values take precedence over
 *    mapped attributes for `version` and `publishedBy`.
 * 5. Collect tags and strip the header from the content.
 *
 * @param inputContent - Full AsciiDoc source.
 * @returns The extracted metadata and the content without its header.
 */
export function extractDocumentMetadata(inputContent: string): {
  metadata: AsciiDocMetadata;
  content: string;
} {
  const asciidoctor = createProcessor();
  const document = asciidoctor.load(inputContent, {
    standalone: false,
  }) as Document;

  const metadata: AsciiDocMetadata = {};
  const attributes = document.getAttributes();
  const source = document.getSource();

  // Title
  const title = document.getTitle();
  if (title) metadata.title = decodeHtmlEntities(title);

  // Authors: header author line first, then :author: entries in the header
  const authors = extractDocumentAuthors(source);
  for (const line of source.split(/\r?\n/)) {
    // The document header ends at the first section heading
    if (/^==\s+/.test(line)) {
      break;
    }
    const match = line.match(/^:author:\s*(.+)$/);
    if (!match) {
      continue;
    }
    const authorName = match[1].trim();
    if (authorName && !authors.includes(authorName)) {
      authors.push(authorName);
    }
  }
  if (authors.length > 0) {
    metadata.authors = [...new Set(authors)];
  }

  // Revision info, kept only when it looks like real revision data
  const revisionNumber = document.getRevisionNumber();
  const version =
    revisionNumber &&
    revisionNumber !== "Version" &&
    !revisionNumber.includes("==")
      ? revisionNumber
      : undefined;

  const revisionRemark = document.getRevisionRemark();
  const publishedBy =
    revisionRemark &&
    !revisionRemark.includes("[NOTE]") &&
    !revisionRemark.includes("==")
      ? revisionRemark
      : undefined;

  const revisionDate = document.getRevisionDate();
  const publicationDate =
    revisionDate &&
    !revisionDate.includes("[NOTE]") &&
    !revisionDate.includes("==")
      ? revisionDate
      : undefined;

  if (version) metadata.version = version;
  if (publishedBy) metadata.publishedBy = publishedBy;
  if (publicationDate) metadata.publicationDate = publicationDate;

  // Withhold rejected revision values from the generic attribute mapping
  const mappableAttributes: Record<string, any> = { ...attributes };
  if (!version) delete mappableAttributes["revnumber"];
  if (!publishedBy) delete mappableAttributes["revremark"];
  if (!publicationDate) delete mappableAttributes["revdate"];

  mapAttributesToMetadata(mappableAttributes, metadata, true);

  // Validated revision data wins over whatever the mapping assigned
  if (version) metadata.version = version;
  if (publishedBy) metadata.publishedBy = publishedBy;

  // Tags and keywords
  const tags = extractTagsFromAttributes(attributes);
  if (tags.length > 0) {
    metadata.tags = tags;
  }

  return { metadata, content: stripDocumentHeader(source) };
}
|
|
|
/**
 * Extracts metadata from a single section by scanning its text directly.
 *
 * - The title is the text of the first heading line (any `=` level).
 * - Authors come from the section header lines plus every `:author:` line.
 * - Tags come from the `:tags:` attribute entries.
 *
 * @param inputSectionContent - AsciiDoc source of one section.
 * @returns The metadata, the section body without its header, and the title
 *   (an empty string when no heading line is found).
 */
export function extractSectionMetadata(inputSectionContent: string): {
  metadata: SectionMetadata;
  content: string;
  title: string;
} {
  // Title: first heading line in the section
  const headingMatch = /^(=+)\s+(.+)$/m.exec(inputSectionContent);
  const title = headingMatch ? headingMatch[2].trim() : "";

  const metadata: SectionMetadata = { title };

  // Authors: header lines first, then any :author: entry not seen yet
  const authors = extractSectionAuthors(inputSectionContent);
  inputSectionContent.split(/\r?\n/).forEach((line) => {
    const authorEntry = /^:author:\s*(.+)$/.exec(line);
    if (authorEntry === null) {
      return;
    }
    const authorName = authorEntry[1].trim();
    if (authorName && !authors.includes(authorName)) {
      authors.push(authorName);
    }
  });
  if (authors.length > 0) {
    metadata.authors = authors;
  }

  // Tags: the "t" pairs produced by parseSimpleAttributes
  const tags = parseSimpleAttributes(inputSectionContent)
    .filter(([name]) => name === "t")
    .map(([, value]) => value);
  if (tags.length > 0) {
    metadata.tags = tags;
  }

  return {
    metadata,
    content: stripSectionHeader(inputSectionContent),
    title,
  };
}
|
|
|
/**
 * Turns metadata into a list of `[name, value]` event tags.
 *
 * Tags are emitted in a fixed order: title, one `author` tag per author, the
 * single-valued fields, one `t` tag per topic tag, and finally the custom
 * attributes whose names are not listed in `systemAttributes`. Empty or
 * missing single-valued fields produce no tag.
 *
 * @param metadata - Document or section metadata.
 * @returns The tag list; empty when the metadata has no values.
 */
export function metadataToTags(
  metadata: AsciiDocMetadata | SectionMetadata,
): [string, string][] {
  const tags: [string, string][] = [];
  const addIfPresent = (name: string, value: string | undefined): void => {
    if (value) {
      tags.push([name, value]);
    }
  };

  addIfPresent("title", metadata.title);

  for (const author of metadata.authors ?? []) {
    tags.push(["author", author]);
  }

  addIfPresent("version", metadata.version);
  addIfPresent("edition", metadata.edition);
  addIfPresent("published_on", metadata.publicationDate);
  addIfPresent("published_by", metadata.publishedBy);
  addIfPresent("summary", metadata.summary);
  addIfPresent("image", metadata.coverImage);
  addIfPresent("i", metadata.isbn);
  addIfPresent("source", metadata.source);
  addIfPresent("type", metadata.type);
  addIfPresent("auto-update", metadata.autoUpdate);

  for (const tag of metadata.tags ?? []) {
    tags.push(["t", tag]);
  }

  // Custom attributes become tags too, except system attributes
  const custom = metadata.customAttributes;
  if (custom) {
    for (const [key, value] of Object.entries(custom)) {
      if (systemAttributes.includes(key)) {
        continue;
      }
      tags.push([key, value]);
    }
  }

  return tags;
}
|
|
|
/**
 * Returns AsciiDoc content with its document metadata removed.
 *
 * This runs the document metadata extraction and keeps only the content part
 * of its result.
 *
 * @param content - Full AsciiDoc source.
 * @returns The content as returned by `extractDocumentMetadata`.
 */
export function removeMetadataFromContent(content: string): string {
  return extractDocumentMetadata(content).content;
}
|
|
|
/**
 * Extracts metadata from content that consists of sections only (no document
 * header).
 *
 * Such content has no document-level metadata, so the only field produced is
 * `title`, taken from the first `==` section. Only that first section is
 * parsed; later sections do not influence the result. The content is returned
 * unchanged.
 *
 * @param content - AsciiDoc source without a document header.
 * @returns Metadata holding at most a title, plus the unmodified content.
 */
export function extractMetadataFromSectionsOnly(content: string): {
  metadata: AsciiDocMetadata;
  content: string;
} {
  const lines = content.split(/\r?\n/);
  const isSectionHeading = (line: string): boolean => /^==\s+/.test(line);

  const metadata: AsciiDocMetadata = {};

  const firstStart = lines.findIndex(isSectionHeading);
  if (firstStart !== -1) {
    // The first section runs up to (not including) the next section heading
    const nextOffset = lines.slice(firstStart + 1).findIndex(isSectionHeading);
    const firstEnd =
      nextOffset === -1 ? lines.length : firstStart + 1 + nextOffset;

    const firstSection = lines.slice(firstStart, firstEnd).join("\n");
    const { title } = extractSectionMetadata(firstSection);
    if (title) {
      metadata.title = title;
    }
  }

  return { metadata, content };
}
|
|
|
/**
 * Picks the right extraction strategy for the given content.
 *
 * - No document title line (`= Title`): the content is handled as
 *   sections-only.
 * - A title line plus at least one line that contains `<` or looks like a
 *   revision line (`x, y: z`): full document extraction.
 * - Otherwise: only the title is read and the content is returned unchanged,
 *   title line included.
 *
 * NOTE(review): the metadata check looks at every line of the content, not
 * just the header, and does not count attribute entries - confirm this is the
 * intended definition of a "minimal" header.
 *
 * @param content - AsciiDoc source.
 * @returns The extracted metadata and the resulting content.
 */
export function extractSmartMetadata(content: string): {
  metadata: AsciiDocMetadata;
  content: string;
} {
  // Without a document title line there is no document header at all
  if (!content.match(/^=\s+/m)) {
    return extractMetadataFromSectionsOnly(content);
  }

  const lines = content.split(/\r?\n/);
  const looksLikeMetadata = (line: string): boolean =>
    line.includes("<") || // author line
    /^.+,\s*.+:\s*.+$/.test(line); // revision line

  if (lines.some(looksLikeMetadata)) {
    // Full document with metadata - use standard extraction
    return extractDocumentMetadata(content);
  }

  // Minimal header: take the title and keep the title line in the content
  const titleLine = lines.find((line) => /^=\s+/.test(line));
  const title = titleLine?.replace(/^=\s+/, "").trim();

  const metadata: AsciiDocMetadata = {};
  if (title) {
    metadata.title = title;
  }

  return { metadata, content };
}
|
|
|