add asciidoc_parser

8 months ago · 846147bef3
1 changed file with 577 additions and 0 deletions
--- a/src/lib/utils/asciidoc_parser.ts
+++ b/src/lib/utils/asciidoc_parser.ts
@@ -0,0 +1,577 @@
 /**
 * AsciiDoc Content Parsing Service
 *
 * Handles parsing AsciiDoc content into hierarchical structures for publication.
 * Separated from metadata extraction to maintain single responsibility principle.
 */
 // @ts-ignore
 import Processor from "asciidoctor";
 import type { Document } from "asciidoctor";
 import {
  parseSimpleAttributes,
  extractDocumentMetadata,
  extractSectionMetadata,
 } from "./asciidoc_metadata.ts";
 /**
  * Shape returned by the parsers in this module: document-level metadata,
  * document content, the document title, and the list of parsed sections.
  */
 export interface ParsedAsciiDoc {
   /** Document-level metadata; every field is optional. */
   metadata: {
     title?: string;
     authors?: string[];
     version?: string;
     edition?: string;
     publicationDate?: string;
     publisher?: string;
     summary?: string;
     coverImage?: string;
     isbn?: string;
     tags?: string[];
     source?: string;
     publishedBy?: string;
     type?: string;
     autoUpdate?: "yes" | "ask" | "no";
     customAttributes?: Record<string, string>;
   };
   content: string;
   title: string;
   /** Parsed sections; each one carries the same metadata shape as the document. */
   sections: {
     metadata: ParsedAsciiDoc["metadata"];
     content: string;
     title: string;
   }[];
 }
 /**
  * Builds the Asciidoctor processor used by the parsers in this module.
  *
  * @returns Whatever the imported `Processor` factory produces.
  */
 function createProcessor() {
   const processor = Processor();
   return processor;
 }
 /**
  * Reports the heading level of the first header line found in a section.
  *
  * A header line starts with one or more `=` characters followed by
  * whitespace; its level is the number of `=` characters.
  *
  * @param sectionContent - Raw section text with LF or CRLF line endings.
  * @returns The `=` count of the first header line, or 0 when there is none.
  */
 function getSectionLevel(sectionContent: string): number {
   const headerPrefix = /^(=+)\s+/;
   const firstHeader = sectionContent
     .split(/\r?\n/)
     .find((text) => headerPrefix.test(text));
   if (firstHeader === undefined) {
     return 0;
   }
   // The header starts with a run of `=` followed by whitespace, so the index
   // of the first non-`=` character equals the length of that run.
   return firstHeader.search(/[^=]/);
 }
 /**
  * Collects the introductory text of a section: the non-header lines that
  * follow the section's own header, up to the first deeper header.
  *
  * @param sectionContent - Raw section text with LF or CRLF line endings.
  * @param currentLevel - Heading level (number of `=`) of the section itself.
  * @returns The collected lines joined with "\n" and trimmed; an empty string
  *   when no header of `currentLevel` is seen before a deeper one.
  */
 function extractIntroContent(
   sectionContent: string,
   currentLevel: number,
 ): string {
   const collected: string[] = [];
   let headerSeen = false;

   for (const text of sectionContent.split(/\r?\n/)) {
     const marker = /^(=+)\s+/.exec(text);

     if (marker === null) {
       // Plain text only counts once the section's own header has been passed.
       if (headerSeen) {
         collected.push(text);
       }
       continue;
     }

     const depth = marker[1].length;
     if (depth > currentLevel) {
       // First subsection: the intro ends here.
       break;
     }
     if (depth === currentLevel) {
       headerSeen = true;
     }
     // Header lines themselves (same or shallower level) are never collected.
   }

   return collected.join("\n").trim();
 }
 /**
  * Splits AsciiDoc content into its `==` sections and pairs each one with the
  * metadata extracted from it.
  *
  * Lines before the first `==` header belong to no section. Deeper headers
  * (`===` and below) stay inside the enclosing `==` section.
  *
  * @param content - Raw AsciiDoc source.
  * @returns Document metadata, the source as reported by the loaded
  *   Asciidoctor document, the document title (empty string when the metadata
  *   has none), and one entry per `==` section in source order.
  */
 export function parseAsciiDocWithMetadata(content: string): ParsedAsciiDoc {
   const loaded = createProcessor().load(content, {
     standalone: false,
   }) as Document;
   const docMetadata = extractDocumentMetadata(content).metadata;

   // Group the raw lines into one bucket per `==` header.
   const buckets: string[][] = [];
   let openBucket: string[] | undefined;
   for (const text of content.split(/\r?\n/)) {
     if (/^==\s+/.test(text)) {
       openBucket = [text];
       buckets.push(openBucket);
     } else if (openBucket) {
       openBucket.push(text);
     }
   }

   const sections: ParsedAsciiDoc["sections"] = buckets.map((bucket) =>
     extractSectionMetadata(bucket.join("\n")),
   );

   return {
     metadata: docMetadata,
     content: loaded.getSource(),
     title: docMetadata.title || "",
     sections,
   };
 }
 /**
  * Splits AsciiDoc content into sections at a chosen heading depth.
  *
  * - `parseLevel` 2: every `==` heading starts a content section that runs to
  *   the next `==` heading, so deeper headings stay inside it.
  * - Any other `parseLevel`: headings with exactly `parseLevel` `=` characters
  *   start content sections. Headings from level 2 up to `parseLevel - 1` are
  *   additionally reported as index sections whose content is only the
  *   normalized heading line. Every returned section then also carries a
  *   numeric `level`.
  *
  * A content section runs until the next heading of exactly `parseLevel`;
  * shallower headings that occur in between remain part of its content. Lines
  * before the first content-level heading form the returned `content`.
  *
  * Sections are returned in source order. The order is taken from the line on
  * which each heading occurs, so repeated titles and titles that are
  * substrings of other headings keep their true position.
  *
  * @param content - Raw AsciiDoc source with LF or CRLF line endings.
  * @param parseLevel - Heading depth (number of `=`) at which content sections
  *   start. Defaults to 2.
  * @returns Document metadata, the text preceding the first content section,
  *   the document title (empty string when the metadata has none), and the
  *   ordered sections.
  */
 export function parseAsciiDocIterative(
   content: string,
   parseLevel: number = 2,
 ): ParsedAsciiDoc {
   const { metadata: docMetadata } = extractDocumentMetadata(content);
   const lines = content.split(/\r?\n/);

   const contentHeading = new RegExp(`^${"=".repeat(parseLevel)}\\s+`);
   const anyHeading = /^(=+)\s+(.+)$/;

   const documentLines: string[] = [];
   const contentChunks: Array<{ lineNo: number; lines: string[] }> = [];
   const indexHeadings: Array<{
     lineNo: number;
     level: number;
     title: string;
   }> = [];
   let openChunk: { lineNo: number; lines: string[] } | undefined;

   // Single pass: bucket lines into content sections and note every heading
   // that sits above the content level, remembering where each one starts.
   for (const [lineNo, line] of lines.entries()) {
     if (contentHeading.test(line)) {
       openChunk = { lineNo, lines: [line] };
       contentChunks.push(openChunk);
     } else if (openChunk) {
       openChunk.lines.push(line);
     } else {
       documentLines.push(line);
     }

     // Matching per line keeps `\s+` from reaching across a line break and
     // picking up the following line as the title of an empty heading.
     const heading = anyHeading.exec(line);
     if (heading) {
       const level = heading[1].length;
       if (level >= 2 && level < parseLevel) {
         indexHeadings.push({ lineNo, level, title: heading[2].trim() });
       }
     }
   }

   const contentSections = contentChunks.map((chunk) => {
     const sectionContent = chunk.lines.join("\n");
     return {
       lineNo: chunk.lineNo,
       section: {
         ...extractSectionMetadata(sectionContent),
         // Override the extractor's `content` with the full section text,
         // heading line included.
         content: sectionContent,
       },
     };
   });

   const base = {
     metadata: docMetadata,
     content: documentLines.join("\n"),
     title: docMetadata.title || "",
   };

   if (parseLevel === 2) {
     // No index sections exist at this depth; sections carry no `level`.
     return {
       ...base,
       sections: contentSections.map((entry) => entry.section),
     };
   }

   const ordered = [
     ...indexHeadings.map(({ lineNo, level, title }) => ({
       lineNo,
       section: {
         metadata: { title },
         // Index sections hold just their heading line.
         content: `${"=".repeat(level)} ${title}`,
         title,
         level,
       },
     })),
     ...contentSections.map(({ lineNo, section }) => ({
       lineNo,
       section: { ...section, level: getSectionLevel(section.content) },
     })),
   ].sort((a, b) => a.lineNo - b.lineNo);

   return {
     ...base,
     sections: ordered.map((entry) => entry.section),
   };
 }
 /**
  * Turns parsed AsciiDoc sections into unsigned Nostr event templates with a
  * hierarchical structure: kind 30040 for index events and kind 30041 for
  * content events.
  *
  * Sections are arranged into a tree by heading level. A section becomes an
  * index when its level is below `parseLevel` and it has at least one direct
  * child whose level is at most `parseLevel`; otherwise it becomes a content
  * event holding the section's full content. An index section with non-empty
  * intro text also gets a content event whose `d` tag is `<sectionId>-content`.
  *
  * Every template has empty `id`, `pubkey` and `sig` fields, and all templates
  * produced by one call share the same `created_at`.
  *
  * NOTE(review): the original header says the layout follows docreference.md,
  * which is not visible from this file — confirm against that document.
  *
  * @param parsed - Parsed document whose `sections` are in source order.
  * @param parseLevel - Heading depth at which sections are content events.
  *   Defaults to 2.
  * @param pubkey - Author key used inside `a` tag coordinates. When omitted or
  *   empty, the literal placeholder "pubkey" is used.
  * @param maxDepth - Not consulted by this function; kept so the signature
  *   stays unchanged.
  * @returns `contentEvents` holds every section-level event (content events
  *   and section index events) in depth-first source order. `indexEvent` is
  *   the document-level index and is present only when `parsed.title` is
  *   non-blank.
  */
 export function generateNostrEvents(
   parsed: ParsedAsciiDoc,
   parseLevel: number = 2,
   pubkey?: string,
   maxDepth: number = 6,
 ): {
   indexEvent?: any;
   contentEvents: any[];
 } {
   const actualPubkey = pubkey || "pubkey";
   // One timestamp for the whole batch so all events of a publication agree.
   const createdAt = Math.floor(Date.now() / 1000);

   type Section = ParsedAsciiDoc["sections"][number];

   interface EventTemplate {
     id: string;
     pubkey: string;
     created_at: number;
     kind: number;
     tags: string[][];
     content: string;
     sig: string;
   }

   interface TreeNode {
     section: Section;
     level: number;
     sectionId: string;
     tags: [string, string][];
     children: TreeNode[];
   }

   // Lower-cases the title and collapses every run of characters that are not
   // letters or digits into a single "-", trimming one from each end.
   const toSectionId = (title: string): string =>
     title
       .toLowerCase()
       .replace(/[^\p{L}\p{N}]/gu, "-")
       .replace(/-+/g, "-")
       .replace(/^-|-$/g, "");

   const makeEvent = (
     kind: number,
     dTag: string,
     title: string,
     extraTags: string[][],
     content: string,
   ): EventTemplate => ({
     id: "",
     pubkey: "",
     created_at: createdAt,
     kind,
     tags: [["d", dTag], ["title", title], ...extraTags],
     content,
     sig: "",
   });

   // Single source of truth for "index or content", so the kind a section is
   // emitted with always matches the kind used in a-tags that point at it.
   const isIndexNode = (node: TreeNode): boolean =>
     node.level < parseLevel &&
     node.children.some((child) => child.level <= parseLevel);

   const aTagFor = (node: TreeNode): string[] => [
     "a",
     `${isIndexNode(node) ? 30040 : 30041}:${actualPubkey}:${node.sectionId}`,
     "",
     "",
   ];

   // Build the tree from the flat, ordered section list: the stack holds the
   // chain of currently open ancestors, shallowest first.
   const tree: TreeNode[] = [];
   const openAncestors: TreeNode[] = [];
   for (const section of parsed.sections) {
     const node: TreeNode = {
       section,
       level: getSectionLevel(section.content),
       sectionId: toSectionId(section.title),
       tags: parseSimpleAttributes(section.content),
       children: [],
     };

     // Close every open section that is at the same depth or deeper.
     while (
       openAncestors.length > 0 &&
       openAncestors[openAncestors.length - 1].level >= node.level
     ) {
       openAncestors.pop();
     }

     if (openAncestors.length === 0) {
       tree.push(node);
     } else {
       openAncestors[openAncestors.length - 1].children.push(node);
     }
     openAncestors.push(node);
   }

   const contentEvents: EventTemplate[] = [];

   // Depth-first emission: a node's own events first, then its children.
   const emit = (node: TreeNode): void => {
     const { section, sectionId, tags, children } = node;

     if (isIndexNode(node)) {
       const childRefs: string[][] = [];

       const intro = extractIntroContent(section.content, node.level);
       if (intro.trim()) {
         contentEvents.push(
           makeEvent(30041, `${sectionId}-content`, section.title, tags, intro),
         );
         // The intro is referenced first, ahead of the child sections.
         childRefs.push([
           "a",
           `30041:${actualPubkey}:${sectionId}-content`,
           "",
           "",
         ]);
       }

       childRefs.push(...children.map((child) => aTagFor(child)));
       contentEvents.push(
         makeEvent(30040, sectionId, section.title, [...tags, ...childRefs], ""),
       );
     } else {
       contentEvents.push(
         makeEvent(30041, sectionId, section.title, tags, section.content),
       );
     }

     for (const child of children) {
       emit(child);
     }
   };

   for (const rootNode of tree) {
     emit(rootNode);
   }

   // Without a document title there is no document-level index.
   if (!parsed.title || parsed.title.trim() === "") {
     return { contentEvents };
   }

   const documentId = toSectionId(parsed.title);
   const documentTags = parseSimpleAttributes(parsed.content);
   const mainIndexEvent = makeEvent(
     30040,
     documentId,
     parsed.title,
     [...documentTags, ...tree.map((rootNode) => aTagFor(rootNode))],
     "",
   );

   return {
     indexEvent: mainIndexEvent,
     contentEvents,
   };
 }
 /**
  * Classifies AsciiDoc content for smart publishing.
  *
  * - "article": the first non-blank line is a document title, i.e. a single
  *   `=` followed by whitespace and text.
  * - "scattered-notes": there is no document title, but some line is a
  *   section heading (two or more `=`, then whitespace and text).
  * - "none": neither of the above.
  *
  * Only real heading lines count. A bare `==` inside running text or a `====`
  * block delimiter does not make the content "scattered-notes".
  *
  * @param content - Raw text to classify; surrounding whitespace is ignored.
  * @returns The detected content type.
  */
 export function detectContentType(
   content: string,
 ): "article" | "scattered-notes" | "none" {
   const text = content.trim();

   // `[ \t]+` rather than `\s+` so the match cannot run onto the next line.
   if (/^=[ \t]+\S/.test(text)) {
     return "article";
   }
   if (/^={2,}[ \t]+\S/m.test(text)) {
     return "scattered-notes";
   }
   return "none";
 }