add asciidoc_parser

8 months ago · 846147bef3
1 changed files with 577 additions and 0 deletions
--- a/src/lib/utils/asciidoc_parser.ts
+++ b/src/lib/utils/asciidoc_parser.ts
@@ -0,0 +1,577 @@
+/**
+ * AsciiDoc Content Parsing Service
+ *
+ * Handles parsing AsciiDoc content into hierarchical structures for publication.
+ * Separated from metadata extraction to maintain single responsibility principle.
+ */
+
+// @ts-ignore
+import Processor from "asciidoctor";
+import type { Document } from "asciidoctor";
+import {
+  parseSimpleAttributes,
+  extractDocumentMetadata,
+  extractSectionMetadata,
+} from "./asciidoc_metadata.ts";
+
+/**
+ * Optional metadata fields that can be attached either to the whole document
+ * or to an individual section. The same shape is used in both places, so it
+ * is declared once here instead of being repeated.
+ */
+export interface ParsedAsciiDocMetadata {
+  title?: string;
+  authors?: string[];
+  version?: string;
+  edition?: string;
+  publicationDate?: string;
+  publisher?: string;
+  summary?: string;
+  coverImage?: string;
+  isbn?: string;
+  tags?: string[];
+  source?: string;
+  publishedBy?: string;
+  type?: string;
+  autoUpdate?: "yes" | "ask" | "no";
+  customAttributes?: Record<string, string>;
+}
+
+/**
+ * Result of parsing an AsciiDoc source: document-level metadata, content and
+ * title, plus a flat list of sections that each carry their own metadata,
+ * content and title.
+ */
+export interface ParsedAsciiDoc {
+  /** Metadata extracted for the document as a whole. */
+  metadata: ParsedAsciiDocMetadata;
+  /** Document-level text; exactly what it holds depends on the parser used. */
+  content: string;
+  /** Document title, or an empty string when none was found. */
+  title: string;
+  /** Sections in the order the parser produced them. */
+  sections: Array<{
+    metadata: ParsedAsciiDocMetadata;
+    content: string;
+    title: string;
+  }>;
+}
+
+/**
+ * Builds a fresh Asciidoctor processor by invoking the imported factory.
+ *
+ * @returns whatever the `asciidoctor` default export returns when called
+ */
+function createProcessor() {
+  const processor = Processor();
+  return processor;
+}
+
+/**
+ * Reports the level of the first header line found in a chunk of AsciiDoc.
+ *
+ * A header line is one that starts with one or more `=` characters followed
+ * by whitespace; its level is the number of `=` characters.
+ *
+ * @param sectionContent text to scan, line by line
+ * @returns the level of the first header line, or 0 when there is none
+ */
+function getSectionLevel(sectionContent: string): number {
+  const headerPattern = /^(=+)\s+/;
+  const firstHeader = sectionContent
+    .split(/\r?\n/)
+    .map((text) => headerPattern.exec(text))
+    .find((found) => found !== null);
+
+  return firstHeader ? firstHeader[1].length : 0;
+}
+
+/**
+ * Returns the text that sits between a section's own header and its first
+ * subsection.
+ *
+ * Scanning rules, applied line by line:
+ * - the first header whose level equals `currentLevel` marks the start and is
+ *   itself left out of the result;
+ * - any header deeper than `currentLevel` ends the scan, even if the start
+ *   has not been seen yet;
+ * - every other header line is dropped without ending the scan;
+ * - non-header lines are kept only once the start has been seen.
+ *
+ * @param sectionContent raw section text, header included
+ * @param currentLevel header level (number of `=`) of the section itself
+ * @returns the collected lines joined with `\n` and trimmed
+ */
+function extractIntroContent(
+  sectionContent: string,
+  currentLevel: number,
+): string {
+  const headerPattern = /^(=+)\s+/;
+  const collected: string[] = [];
+  let insideSection = false;
+
+  for (const rawLine of sectionContent.split(/\r?\n/)) {
+    const header = headerPattern.exec(rawLine);
+
+    if (header === null) {
+      // Plain line: only relevant once the section header has been passed.
+      if (insideSection) {
+        collected.push(rawLine);
+      }
+      continue;
+    }
+
+    const depth = header[1].length;
+    if (depth > currentLevel) {
+      // First subsection reached; the intro is complete.
+      break;
+    }
+    if (depth === currentLevel && !insideSection) {
+      // The section's own header; skip it and start collecting.
+      insideSection = true;
+    }
+  }
+
+  return collected.join("\n").trim();
+}
+
+/**
+ * Parses AsciiDoc source into document metadata plus one entry per `==`
+ * section.
+ *
+ * Each section spans from a line starting with `==` followed by whitespace
+ * up to (not including) the next such line, and is handed to
+ * `extractSectionMetadata`. Lines before the first `==` header belong to no
+ * section. The returned `content` is the source reported by the loaded
+ * Asciidoctor document.
+ *
+ * @param content raw AsciiDoc text
+ * @returns document metadata, source, title (empty string when missing) and
+ *          the sections in source order
+ */
+export function parseAsciiDocWithMetadata(content: string): ParsedAsciiDoc {
+  const loaded = createProcessor().load(content, {
+    standalone: false,
+  }) as Document;
+  const documentMetadata = extractDocumentMetadata(content).metadata;
+
+  // Locate every level-2 header so sections can be cut out by index range.
+  const sourceLines = content.split(/\r?\n/);
+  const sectionHeader = /^==\s+/;
+  const headerIndexes: number[] = [];
+  sourceLines.forEach((text, index) => {
+    if (sectionHeader.test(text)) {
+      headerIndexes.push(index);
+    }
+  });
+
+  const parsedSections: ParsedAsciiDoc["sections"] = headerIndexes.map(
+    (start, position) => {
+      const isLast = position + 1 >= headerIndexes.length;
+      const end = isLast ? sourceLines.length : headerIndexes[position + 1];
+      return extractSectionMetadata(sourceLines.slice(start, end).join("\n"));
+    },
+  );
+
+  return {
+    metadata: documentMetadata,
+    content: loaded.getSource(),
+    title: documentMetadata.title || "",
+    sections: parsedSections,
+  };
+}
+
+/**
+ * Splits `lines` at every header made of exactly `level` `=` characters
+ * followed by whitespace.
+ *
+ * @param lines source split into lines
+ * @param level number of `=` characters that introduce a section
+ * @returns `preamble`: the lines before the first matching header;
+ *          `sections`: one entry per matching header with the index of its
+ *          header line and its text (header line through the line before the
+ *          next matching header, joined with `\n`)
+ */
+function collectSectionsAtLevel(
+  lines: string[],
+  level: number,
+): {
+  preamble: string[];
+  sections: Array<{ startLine: number; text: string }>;
+} {
+  const headerPattern = new RegExp(`^${"=".repeat(level)}\\s+`);
+  const preamble: string[] = [];
+  const collected: Array<{ startLine: number; body: string[] }> = [];
+  let current: { startLine: number; body: string[] } | undefined;
+
+  lines.forEach((line, index) => {
+    if (headerPattern.test(line)) {
+      current = { startLine: index, body: [line] };
+      collected.push(current);
+    } else if (current) {
+      current.body.push(line);
+    } else {
+      preamble.push(line);
+    }
+  });
+
+  return {
+    preamble,
+    sections: collected.map(({ startLine, body }) => ({
+      startLine,
+      text: body.join("\n"),
+    })),
+  };
+}
+
+/**
+ * Iterative AsciiDoc parsing based on specified level
+ * Level 2: Only == sections become content events (containing all subsections)
+ * Level 3: == sections become indices + content events, === sections become content events
+ * Level 4: === sections become indices + content events, ==== sections become content events, etc.
+ *
+ * Content sections are the sections whose header has exactly `parseLevel`
+ * `=` characters; their `content` is the full text including the header.
+ * For `parseLevel` other than 2, every header of level 2 up to
+ * `parseLevel - 1` additionally yields an index section whose content is
+ * just the normalized header line, and each returned section carries a
+ * `level` field. Sections are returned in source order, determined by the
+ * line on which each header appears.
+ *
+ * NOTE(review): a content section runs until the next header of the same
+ * level, so at level 3+ it also contains any shallower header lines (and the
+ * text under them) that appear before the next content-level header.
+ *
+ * @param content raw AsciiDoc text
+ * @param parseLevel header level that produces content sections (default 2)
+ * @returns document metadata, the lines preceding the first content-level
+ *          header as `content`, the title (empty string when missing) and
+ *          the ordered sections
+ */
+export function parseAsciiDocIterative(
+  content: string,
+  parseLevel: number = 2,
+): ParsedAsciiDoc {
+  type Section = ParsedAsciiDoc["sections"][number];
+
+  // Extract document metadata using the metadata extraction functions
+  const { metadata: docMetadata } = extractDocumentMetadata(content);
+
+  const lines = content.split(/\r?\n/);
+  const { preamble, sections: rawSections } = collectSectionsAtLevel(
+    lines,
+    parseLevel,
+  );
+
+  // Keep the full text (header included) as the section content.
+  const contentSections = rawSections.map(({ startLine, text }) => {
+    const section: Section = { ...extractSectionMetadata(text), content: text };
+    return { startLine, section };
+  });
+
+  const docContent = preamble.join("\n");
+
+  if (parseLevel === 2) {
+    // Level 2: only == sections become events; there are no index sections.
+    return {
+      metadata: docMetadata,
+      content: docContent,
+      title: docMetadata.title || "",
+      sections: contentSections.map((entry) => entry.section),
+    };
+  }
+
+  // Level 3+: headers of level 2 .. parseLevel-1 become index sections
+  // (header line only); headers at parseLevel are the content sections.
+  const ordered: Array<{
+    startLine: number;
+    section: Section & { level: number };
+  }> = [];
+
+  const anyHeaderPattern = /^(=+)\s+(.+)$/;
+  lines.forEach((line, index) => {
+    const match = anyHeaderPattern.exec(line);
+    if (!match) {
+      return;
+    }
+    const level = match[1].length;
+    if (level < 2 || level >= parseLevel) {
+      return;
+    }
+    const title = match[2].trim();
+    ordered.push({
+      startLine: index,
+      section: {
+        metadata: { title },
+        content: `${"=".repeat(level)} ${title}`, // Just the header line for index sections
+        title,
+        level,
+      },
+    });
+  });
+
+  for (const { startLine, section } of contentSections) {
+    ordered.push({
+      startLine,
+      section: { ...section, level: getSectionLevel(section.content) },
+    });
+  }
+
+  // Order by the line each header sits on. Searching the source for the
+  // header text instead would misplace repeated titles and headers whose
+  // text also occurs inside a deeper header (e.g. "== X" within "=== X").
+  ordered.sort((a, b) => a.startLine - b.startLine);
+
+  return {
+    metadata: docMetadata,
+    content: docContent,
+    title: docMetadata.title || "",
+    sections: ordered.map((entry) => entry.section),
+  };
+}
+
+/**
+ * Generates Nostr events from parsed AsciiDoc with proper hierarchical structure
+ * Based on docreference.md specifications
+ *
+ * The flat section list is turned into a tree using each section's header
+ * level. A node becomes an index event (kind 30040) when its level is below
+ * `parseLevel` and it has at least one child whose level is at most
+ * `parseLevel`; otherwise it becomes a content event (kind 30041) holding
+ * the section content. An index node with non-empty intro text also gets a
+ * companion 30041 event with d-tag `<sectionId>-content`. When the parsed
+ * document has a non-blank title, a top-level 30040 index referencing every
+ * root section is returned as `indexEvent`.
+ *
+ * All events are unsigned templates: `id`, `pubkey` and `sig` are empty
+ * strings. Every event produced by one call shares the same `created_at`.
+ *
+ * @param parsed parsed document whose `sections` are in source order
+ * @param parseLevel header level that produces content events (default 2)
+ * @param pubkey author key used inside `a` tag addresses; the literal
+ *               "pubkey" is used when omitted or empty
+ * @param maxDepth accepted for interface compatibility; not referenced by
+ *                 this function
+ * @returns `contentEvents` (all section events, parents before children) and,
+ *          for titled documents, the document-level `indexEvent`
+ */
+export function generateNostrEvents(
+  parsed: ParsedAsciiDoc,
+  parseLevel: number = 2,
+  pubkey?: string,
+  maxDepth: number = 6,
+): {
+  indexEvent?: any;
+  contentEvents: any[];
+} {
+  const KIND_INDEX = 30040;
+  const KIND_CONTENT = 30041;
+
+  interface UnsignedEvent {
+    id: string;
+    pubkey: string;
+    created_at: number;
+    kind: number;
+    tags: string[][];
+    content: string;
+    sig: string;
+  }
+
+  // Build hierarchical tree structure
+  interface TreeNode {
+    section: {
+      metadata: any;
+      content: string;
+      title: string;
+    };
+    level: number;
+    sectionId: string;
+    tags: [string, string][];
+    children: TreeNode[];
+    parent?: TreeNode;
+  }
+
+  const allEvents: UnsignedEvent[] = [];
+  const actualPubkey = pubkey || "pubkey";
+  const createdAt = Math.floor(Date.now() / 1000);
+
+  // Helper function to generate section ID
+  const generateSectionId = (title: string): string => {
+    return title
+      .toLowerCase()
+      .replace(/[^\p{L}\p{N}]/gu, "-")
+      .replace(/-+/g, "-")
+      .replace(/^-|-$/g, "");
+  };
+
+  // Builds an unsigned event template with the fields in a fixed order.
+  const makeEvent = (
+    kind: number,
+    tags: string[][],
+    content: string,
+  ): UnsignedEvent => ({
+    id: "",
+    pubkey: "",
+    created_at: createdAt,
+    kind,
+    tags,
+    content,
+    sig: "",
+  });
+
+  // Single source of truth for the index-vs-content decision, so the kind
+  // used in a parent's `a` tag always matches the event emitted for the node.
+  const isIndexNode = (node: TreeNode): boolean =>
+    node.level < parseLevel &&
+    node.children.some((child) => child.level <= parseLevel);
+
+  // Address tag pointing at the event that will be emitted for `node`.
+  const aTagFor = (node: TreeNode): string[] => {
+    const kind = isIndexNode(node) ? KIND_INDEX : KIND_CONTENT;
+    return ["a", `${kind}:${actualPubkey}:${node.sectionId}`, "", ""];
+  };
+
+  // Convert flat sections to tree structure
+  const buildTree = (): TreeNode[] => {
+    const roots: TreeNode[] = [];
+    const stack: TreeNode[] = [];
+
+    for (const section of parsed.sections) {
+      const level = getSectionLevel(section.content);
+      const node: TreeNode = {
+        section,
+        level,
+        sectionId: generateSectionId(section.title),
+        tags: parseSimpleAttributes(section.content),
+        children: [],
+      };
+
+      // Find the correct parent based on header hierarchy
+      while (stack.length > 0 && stack[stack.length - 1].level >= level) {
+        stack.pop();
+      }
+
+      if (stack.length === 0) {
+        // This is a root level section
+        roots.push(node);
+      } else {
+        // This is a child of the last item in stack
+        const parent = stack[stack.length - 1];
+        parent.children.push(node);
+        node.parent = parent;
+      }
+
+      stack.push(node);
+    }
+
+    return roots;
+  };
+
+  // Recursively create events from tree (parent events before children)
+  const createEventsFromNode = (node: TreeNode): void => {
+    const { section, level, sectionId, tags, children } = node;
+
+    if (isIndexNode(node)) {
+      const childATags: string[][] = [];
+
+      // Intro text between the header and the first subsection gets its own
+      // content event, referenced first from the index.
+      const introContent = extractIntroContent(section.content, level);
+      if (introContent.trim()) {
+        allEvents.push(
+          makeEvent(
+            KIND_CONTENT,
+            [["d", `${sectionId}-content`], ["title", section.title], ...tags],
+            introContent,
+          ),
+        );
+        childATags.push([
+          "a",
+          `${KIND_CONTENT}:${actualPubkey}:${sectionId}-content`,
+          "",
+          "",
+        ]);
+      }
+
+      // Add a-tags for direct children
+      for (const child of children) {
+        childATags.push(aTagFor(child));
+      }
+
+      allEvents.push(
+        makeEvent(
+          KIND_INDEX,
+          [["d", sectionId], ["title", section.title], ...tags, ...childATags],
+          "",
+        ),
+      );
+    } else {
+      // Create regular content event (30041)
+      allEvents.push(
+        makeEvent(
+          KIND_CONTENT,
+          [["d", sectionId], ["title", section.title], ...tags],
+          section.content,
+        ),
+      );
+    }
+
+    // Recursively process children
+    for (const child of children) {
+      createEventsFromNode(child);
+    }
+  };
+
+  const tree = buildTree();
+
+  // Process all root level sections
+  for (const rootNode of tree) {
+    createEventsFromNode(rootNode);
+  }
+
+  // For scattered notes (no document title), return only content events
+  if (!parsed.title || parsed.title.trim() === "") {
+    return {
+      contentEvents: allEvents,
+    };
+  }
+
+  // Create main document index (article format)
+  const documentId = generateSectionId(parsed.title);
+  const documentTags = parseSimpleAttributes(parsed.content);
+  const mainIndexEvent = makeEvent(
+    KIND_INDEX,
+    [
+      ["d", documentId],
+      ["title", parsed.title],
+      ...documentTags,
+      ...tree.map(aTagFor),
+    ],
+    "",
+  );
+
+  return {
+    indexEvent: mainIndexEvent,
+    contentEvents: allEvents,
+  };
+}
+
+/**
+ * Detects content type for smart publishing
+ *
+ * - "article": after trimming, the text starts with a document title line,
+ *   i.e. a single `=` followed by a space or tab and some text.
+ * - "scattered-notes": no document title, but at least one line starts with
+ *   two or more `=` followed by a space or tab and some text.
+ * - "none": neither of the above.
+ *
+ * Headers are matched at the start of a line only, so `==` appearing inside
+ * running text or code (for example `a == b` or `===`) is not mistaken for a
+ * section.
+ *
+ * @param content raw AsciiDoc text
+ * @returns the detected content type
+ */
+export function detectContentType(
+  content: string,
+): "article" | "scattered-notes" | "none" {
+  // A document title is exactly one "=" and must be the first thing in the text.
+  const hasDocTitle = /^=[ \t]+\S/.test(content.trim());
+  if (hasDocTitle) {
+    return "article";
+  }
+
+  // Section headers may appear on any line; [ \t] keeps the match on one line.
+  const hasSections = /^={2,}[ \t]+\S/m.test(content);
+  return hasSections ? "scattered-notes" : "none";
+}