// Source: patch 846147bef3f9689a9f643ba64b7bbfb3231f9a99 "add asciidoc_parser"
// (limina1, Tue, 9 Sep 2025 11:38:17 -0400) - new file src/lib/utils/asciidoc_parser.ts

/**
 * AsciiDoc Content Parsing Service
 *
 * Handles parsing AsciiDoc content into hierarchical structures for publication.
 * Separated from metadata extraction to maintain single responsibility principle.
 */

// @ts-ignore
import Processor from "asciidoctor";
import type { Document } from "asciidoctor";
import {
  parseSimpleAttributes,
  extractDocumentMetadata,
  extractSectionMetadata,
} from "./asciidoc_metadata.ts";

/**
 * Metadata fields shared by the document header and by every section.
 * All fields are optional.
 */
export interface AsciiDocMetadata {
  title?: string;
  authors?: string[];
  version?: string;
  edition?: string;
  publicationDate?: string;
  publisher?: string;
  summary?: string;
  coverImage?: string;
  isbn?: string;
  tags?: string[];
  source?: string;
  publishedBy?: string;
  type?: string;
  autoUpdate?: "yes" | "ask" | "no";
  // NOTE(review): the type arguments were missing in the reviewed text
  // (`Record;`); string-to-string is assumed - confirm against the
  // metadata extraction module.
  customAttributes?: Record<string, string>;
}

/**
 * Result of parsing an AsciiDoc document: document-level metadata, content
 * and title, plus a flat list of sections that each carry their own
 * metadata, content and title.
 */
export interface ParsedAsciiDoc {
  metadata: AsciiDocMetadata;
  content: string;
  title: string;
  sections: Array<{
    metadata: AsciiDocMetadata;
    content: string;
    title: string;
  }>;
}

/**
 * Creates an Asciidoctor processor instance
 */
function createProcessor() {
  return Processor();
}

/**
 * Helper function to determine the header level of a section.
 *
 * Returns the number of leading `=` characters of the first header line
 * found in the section content, or 0 when the content has no header line.
 */
function getSectionLevel(sectionContent: string): number {
  // A header line is one or more "=" followed by whitespace; its level is
  // the number of "=" characters.
  const headerPattern = /^(=+)\s+/;
  for (const line of sectionContent.split(/\r?\n/)) {
    const match = headerPattern.exec(line);
    if (match) {
      return match[1].length;
    }
  }
  return 0;
}

/** One entry of `ParsedAsciiDoc.sections`. */
type ParsedSection = ParsedAsciiDoc["sections"][number];

/** A parsed section annotated with its header level. */
type LeveledSection = ParsedSection & { level: number };

/** Raw text of a section together with the line index of its header. */
interface RawSection {
  startLine: number;
  text: string;
}

/**
 * Helper function to extract just the intro content of a section: the text
 * between the section's own header and the next header line.
 *
 * @param sectionContent - Raw section text, normally starting with its header.
 * @param currentLevel - Header level (number of `=`) of the section itself.
 * @returns The trimmed intro text; empty when there is none or when the
 *   section header is not found before a deeper header.
 */
function extractIntroContent(
  sectionContent: string,
  currentLevel: number,
): string {
  const headerPattern = /^(=+)\s+/;
  const introLines: string[] = [];
  let foundHeader = false;

  for (const line of sectionContent.split(/\r?\n/)) {
    const headerMatch = headerPattern.exec(line);

    if (!headerMatch) {
      // Plain text only counts once we are past the section header.
      if (foundHeader) {
        introLines.push(line);
      }
      continue;
    }

    const level = headerMatch[1].length;
    if (!foundHeader && level === currentLevel) {
      // This is the section header itself; it is not part of the intro.
      foundHeader = true;
      continue;
    }

    // Any header after the section header ends the intro. Previously only
    // deeper headers did, so text following a sibling or shallower header
    // leaked into the intro. A deeper header before the section header
    // still aborts, as before.
    if (foundHeader || level > currentLevel) {
      break;
    }
    // Shallower header before the section header: ignore and keep looking.
  }

  return introLines.join("\n").trim();
}

/**
 * Splits lines into the text before the first header of exactly `level`
 * and the sections started by each such header.
 *
 * NOTE(review): a section runs until the next header of the SAME level, so
 * a following shallower header (and its text) stays inside the preceding
 * section. This mirrors the pre-existing behaviour - confirm it is intended.
 */
function splitAtHeaderLevel(
  lines: readonly string[],
  level: number,
): { preamble: string[]; sections: RawSection[] } {
  const headerPattern = new RegExp(`^${"=".repeat(level)}\\s+`);
  const preamble: string[] = [];
  const sections: RawSection[] = [];
  let startLine = -1;
  let buffer: string[] = [];

  for (const [index, line] of lines.entries()) {
    if (headerPattern.test(line)) {
      // Flush the section that was being collected, if any.
      if (startLine >= 0) {
        sections.push({ startLine, text: buffer.join("\n") });
      }
      startLine = index;
      buffer = [line];
    } else if (startLine >= 0) {
      buffer.push(line);
    } else {
      preamble.push(line);
    }
  }

  if (startLine >= 0) {
    sections.push({ startLine, text: buffer.join("\n") });
  }

  return { preamble, sections };
}

/**
 * Parses AsciiDoc content into `==` (level 2) sections with metadata.
 *
 * Each section's raw text (header line included) is passed to
 * `extractSectionMetadata` and its result is used as the section entry.
 * The returned `content` is the source as reported by Asciidoctor.
 */
export function parseAsciiDocWithMetadata(content: string): ParsedAsciiDoc {
  const asciidoctor = createProcessor();
  const document = asciidoctor.load(content, { standalone: false }) as Document;
  const { metadata: docMetadata } = extractDocumentMetadata(content);

  const { sections: rawSections } = splitAtHeaderLevel(
    content.split(/\r?\n/),
    2,
  );
  const sectionsWithMetadata: ParsedSection[] = rawSections.map((raw) =>
    extractSectionMetadata(raw.text),
  );

  return {
    metadata: docMetadata,
    content: document.getSource(),
    title: docMetadata.title ?? "",
    sections: sectionsWithMetadata,
  };
}

/**
 * Iterative AsciiDoc parsing based on specified level
 * Level 2: Only == sections become content events (containing all subsections)
 * Level 3: == sections become indices + content events, === sections become content events
 * Level 4: === sections become indices + content events, ==== sections become content events, etc.
 *
 * @param content - Raw AsciiDoc source.
 * @param parseLevel - Header level whose sections carry full content.
 * @returns Document metadata, the text preceding the first `parseLevel`
 *   header as `content`, and the sections in document order. For
 *   `parseLevel` other than 2 every section also carries a `level` field,
 *   and headers of levels 2..parseLevel-1 appear as header-only entries.
 */
export function parseAsciiDocIterative(
  content: string,
  parseLevel: number = 2,
): ParsedAsciiDoc {
  // Extract document metadata using the metadata extraction functions
  const { metadata: docMetadata } = extractDocumentMetadata(content);

  const lines = content.split(/\r?\n/);
  const { preamble, sections: rawSections } = splitAtHeaderLevel(
    lines,
    parseLevel,
  );
  const docContent = preamble.join("\n");

  if (parseLevel === 2) {
    // Level 2: only == sections become entries; keep the full text,
    // header included, as the section content.
    const sections: ParsedSection[] = rawSections.map((raw) => ({
      ...extractSectionMetadata(raw.text),
      content: raw.text,
    }));

    return {
      metadata: docMetadata,
      content: docContent,
      title: docMetadata.title ?? "",
      sections,
    };
  }

  // Level 3+: headers of levels 2..parseLevel-1 are indices (header only),
  // headers of parseLevel are content sections (full text).
  const positioned: Array<{ line: number; section: LeveledSection }> =
    rawSections.map((raw) => ({
      line: raw.startLine,
      section: {
        ...extractSectionMetadata(raw.text),
        content: raw.text,
        level: getSectionLevel(raw.text),
      },
    }));

  // Detect index headers line by line so a header can never span lines.
  const anyHeaderPattern = /^(={2,})\s+(.+)$/;
  for (const [index, line] of lines.entries()) {
    const match = anyHeaderPattern.exec(line);
    if (!match) {
      continue;
    }
    const level = match[1].length;
    if (level >= parseLevel) {
      continue;
    }
    const title = match[2].trim();
    positioned.push({
      line: index,
      section: {
        metadata: { title },
        content: `${"=".repeat(level)} ${title}`, // Just the header line for index sections
        title,
        level,
      },
    });
  }

  // Order by the line each header was found on. Searching the source for
  // the header text instead breaks on duplicate titles and on headers that
  // are substrings of deeper ones ("== A" inside "=== A").
  positioned.sort((a, b) => a.line - b.line);

  return {
    metadata: docMetadata,
    content: docContent,
    title: docMetadata.title ?? "",
    sections: positioned.map((entry) => entry.section),
  };
}

/**
 * Generates Nostr events from parsed AsciiDoc with proper hierarchical structure
 * Based on docreference.md specifications
 *
 * Sections are arranged into a tree by header level. A node shallower than
 * `parseLevel` that has a child at or above `parseLevel` depth becomes a
 * kind 30040 index event (preceded by a kind 30041 event for its intro text
 * when it has any); every other node becomes a kind 30041 content event.
 * When the document has a title, a top-level kind 30040 index referencing
 * the root sections is returned as `indexEvent`.
 *
 * @param parsed - Output of one of the parse functions in this module.
 * @param parseLevel - Header level whose sections carry full content.
 * @param pubkey - Public key used in `a` tag coordinates; the literal
 *   "pubkey" is used when omitted or empty.
 * @param maxDepth - Currently unused; kept for interface compatibility.
 * @returns Unsigned event templates (`id`, `pubkey` and `sig` are empty).
 */
export function generateNostrEvents(
  parsed: ParsedAsciiDoc,
  parseLevel: number = 2,
  pubkey?: string,
  maxDepth: number = 6,
): {
  indexEvent?: any;
  contentEvents: any[];
} {
  const allEvents: any[] = [];
  const actualPubkey = pubkey || "pubkey";
  // One timestamp for the whole batch so all events of a call agree.
  const createdAt = Math.floor(Date.now() / 1000);

  // Helper function to generate section ID: lower-case, runs of characters
  // that are neither letters nor digits collapsed to "-", no edge dashes.
  const generateSectionId = (title: string): string => {
    return title
      .toLowerCase()
      .replace(/[^\p{L}\p{N}]/gu, "-")
      .replace(/-+/g, "-")
      .replace(/^-|-$/g, "");
  };

  // Builds an unsigned event template with the given kind, tags and content.
  const makeEvent = (kind: number, tags: string[][], content: string) => ({
    id: "",
    pubkey: "",
    created_at: createdAt,
    kind,
    tags,
    content,
    sig: "",
  });

  // Builds an "a" tag pointing at `kind:pubkey:identifier`.
  const makeATag = (kind: number, identifier: string): string[] => [
    "a",
    `${kind}:${actualPubkey}:${identifier}`,
    "",
    "",
  ];

  // Build hierarchical tree structure
  interface TreeNode {
    section: {
      metadata: any;
      content: string;
      title: string;
    };
    level: number;
    sectionId: string;
    tags: [string, string][];
    children: TreeNode[];
    parent?: TreeNode;
  }

  // A node is an index when it is shallower than the content level and has
  // at least one child at or above the content level's depth.
  const isIndexNode = (node: TreeNode): boolean =>
    node.level < parseLevel &&
    node.children.some((child) => child.level <= parseLevel);

  // Convert flat sections to tree structure
  const buildTree = (): TreeNode[] => {
    const roots: TreeNode[] = [];
    const stack: TreeNode[] = [];

    for (const section of parsed.sections) {
      const node: TreeNode = {
        section,
        level: getSectionLevel(section.content),
        sectionId: generateSectionId(section.title),
        tags: parseSimpleAttributes(section.content),
        children: [],
      };

      // Drop stack entries that cannot be ancestors of this node.
      while (stack.length > 0 && stack[stack.length - 1].level >= node.level) {
        stack.pop();
      }

      const parent = stack[stack.length - 1];
      if (parent === undefined) {
        // This is a root level section
        roots.push(node);
      } else {
        parent.children.push(node);
        node.parent = parent;
      }

      stack.push(node);
    }

    return roots;
  };

  const tree = buildTree();

  // Recursively create events from tree
  const createEventsFromNode = (node: TreeNode): void => {
    const { section, level, sectionId, tags, children } = node;

    if (isIndexNode(node)) {
      const childATags: string[][] = [];

      // Intro text of an index gets its own content event (30041),
      // referenced first from the index.
      const introContent = extractIntroContent(section.content, level);
      if (introContent.trim()) {
        allEvents.push(
          makeEvent(
            30041,
            [["d", `${sectionId}-content`], ["title", section.title], ...tags],
            introContent,
          ),
        );
        childATags.push(makeATag(30041, `${sectionId}-content`));
      }

      // Add a-tags for direct children
      for (const child of children) {
        childATags.push(
          makeATag(isIndexNode(child) ? 30040 : 30041, child.sectionId),
        );
      }

      // Create index event (30040)
      allEvents.push(
        makeEvent(
          30040,
          [["d", sectionId], ["title", section.title], ...tags, ...childATags],
          "",
        ),
      );
    } else {
      // Create regular content event (30041)
      allEvents.push(
        makeEvent(
          30041,
          [["d", sectionId], ["title", section.title], ...tags],
          section.content,
        ),
      );
    }

    // Recursively process children
    for (const child of children) {
      createEventsFromNode(child);
    }
  };

  // Process all root level sections
  for (const rootNode of tree) {
    createEventsFromNode(rootNode);
  }

  // For scattered notes (no document title), return only content events
  if (!parsed.title || parsed.title.trim() === "") {
    return {
      contentEvents: allEvents,
    };
  }

  // Create main document index since we have a document title (article format)
  const documentId = generateSectionId(parsed.title);
  const documentTags = parseSimpleAttributes(parsed.content);

  // Create a-tags for all root level sections
  const mainIndexATags = tree.map((rootNode) =>
    makeATag(isIndexNode(rootNode) ? 30040 : 30041, rootNode.sectionId),
  );

  const mainIndexEvent = makeEvent(
    30040,
    [
      ["d", documentId],
      ["title", parsed.title],
      ...documentTags,
      ...mainIndexATags,
    ],
    "",
  );

  return {
    indexEvent: mainIndexEvent,
    contentEvents: allEvents,
  };
}

/**
 * Detects content type for smart publishing
 *
 * - "article": the content (ignoring leading whitespace) starts with a
 *   level-1 header, i.e. a single `=` followed by whitespace and a title.
 * - "scattered-notes": no document title, but at least one line is a
 *   header of level 2 or deeper.
 * - "none": neither of the above.
 */
export function detectContentType(
  content: string,
): "article" | "scattered-notes" | "none" {
  // Use the header grammar instead of raw substring checks so that text
  // such as "a == b" or "=value" is not mistaken for a header.
  const hasDocTitle = /^=[ \t]+\S/.test(content.trim());
  const hasSections = /^={2,}[ \t]+\S/m.test(content);

  if (hasDocTitle) {
    return "article";
  }
  if (hasSections) {
    return "scattered-notes";
  }
  return "none";
}