add repetition test

3 months ago · 23bd727ccd
3 changed files with 186 additions and 3 deletions
--- a/src/utils/report-generator.ts
+++ b/src/utils/report-generator.ts
@ -340,7 +340,7 @@ export function generateHTMLReport(data: ReportData): string {
      <div id="md-rendered" class="tab-content">
        <h3>Rendered HTML Output</h3>
        <div class="rendered-output">
-          ${markdown.result.content}
+          ${cleanHtmlContent(markdown.result.content)}
        </div>
        <details style="margin-top: 15px;">
          <summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
@ -476,7 +476,7 @@ export function generateHTMLReport(data: ReportData): string {
      <div id="ad-rendered" class="tab-content">
        <h3>Rendered HTML Output</h3>
        <div class="rendered-output">
-          ${asciidoc.result.content}
+          ${cleanHtmlContent(asciidoc.result.content)}
        </div>
        <details style="margin-top: 15px;">
          <summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
@ -571,6 +571,71 @@ export function generateHTMLReport(data: ReportData): string {
 </html>`;
 }
 /**
  * Reduce an HTML string to the markup that can be embedded inside a container element.
  *
  * When the input contains a `<body>` element, only the text between the first
  * opening `<body ...>` tag and the last `</body>` tag is kept. Any remaining
  * `<!DOCTYPE>`, `<html>`, `<head>...</head>` and `<body>` markup (for example
  * from a nested document) is then stripped. All tag matching is
  * case-insensitive and anchored on a word boundary, so elements whose names
  * merely start with a stripped name (such as `<header>`) are left untouched.
  *
  * @param html - HTML fragment or complete HTML document.
  * @returns The trimmed inner content, or `''` when `html` is empty or not a string.
  */
 function cleanHtmlContent(html: string): string {
   if (!html || typeof html !== 'string') {
     return '';
   }

   // Upper bound on stripping passes so that pathological input cannot loop forever.
   const maxPasses = 10;
   let cleaned = html.trim();

   // Keep only what lies between the first <body ...> and the last </body>.
   // Regex search (rather than indexOf) keeps this case-insensitive.
   const openBody = /<body\b[^>]*>/i.exec(cleaned);
   if (openBody !== null) {
     const bodyStart = openBody.index + openBody[0].length;
     const closeBodyPattern = /<\/body>/gi;
     let bodyEnd = -1;
     for (
       let closeMatch = closeBodyPattern.exec(cleaned);
       closeMatch !== null;
       closeMatch = closeBodyPattern.exec(cleaned)
     ) {
       // Remember the last closing tag so nested documents stay inside the slice.
       bodyEnd = closeMatch.index;
     }
     // An empty or malformed body (close tag not after the open tag) is left
     // for the stripping passes below.
     if (bodyEnd > bodyStart) {
       cleaned = cleaned.substring(bodyStart, bodyEnd).trim();
     }
   }

   // Strip leftover document-level markup. Removing one tag can expose another,
   // so repeat until the length stops changing or the pass limit is reached.
   let previousLength = 0;
   for (let pass = 0; pass < maxPasses && cleaned.length !== previousLength; pass++) {
     previousLength = cleaned.length;
     cleaned = cleaned
       .replace(/<!DOCTYPE[^>]*>/gi, '')
       .replace(/<html\b[^>]*>/gi, '')
       .replace(/<\/html>/gi, '')
       // `\b` prevents `<header ...>` from being treated as the start of `<head>`.
       .replace(/<head\b[^>]*>[\s\S]*?<\/head>/gi, '')
       .replace(/<body\b[^>]*>/gi, '')
       .replace(/<\/body>/gi, '')
       .trim();
   }

   return cleaned;
 }
 /**
 * Escape HTML special characters
 */
--- a/test-parser-report.test.ts
+++ b/test-parser-report.test.ts
@ -264,5 +264,123 @@ describe('Parser Test Report', () => {
    if (markdownResult.hasMusicalNotation) {
      expect(htmlReport).toMatch(/<div class="number">Yes<\/div>.*Has Music/i);
    }
    // ============================================
    // Test for Content Repetition (Doom Loop Fix)
    // ============================================
    // Extract rendered output sections from the HTML report
    const renderedOutputRegex = /<div class="rendered-output">([\s\S]*?)<\/div>/gi;
    const renderedOutputs: string[] = [];
    let match;
    while ((match = renderedOutputRegex.exec(htmlReport)) !== null) {
      renderedOutputs.push(match[1]);
    }
    // Test that we have rendered output sections
    expect(renderedOutputs.length).toBeGreaterThan(0);
    // Test each rendered output section for content repetition
    renderedOutputs.forEach((output, index) => {
      // Check for specific content that should only appear once
      const testPhrases = [
        '# Markdown Test Document',
        '## Bullet list',
        'This is a test unordered list with mixed bullets:',
        '## Headers',
        '## Media and Links',
        '### Nostr address',
        '## Tables',
        '## Code blocks',
        '## LateX',
      ];
      testPhrases.forEach(phrase => {
        // Count occurrences of the phrase in this output section
        const occurrences = (output.match(new RegExp(escapeRegex(phrase), 'gi')) || []).length;
        // Each phrase should appear at most once (or a few times if it's in different contexts)
        // But if it appears many times, that indicates a repetition loop
        if (occurrences > 5) {
          throw new Error(
            `Content repetition detected in rendered output section ${index + 1}: ` +
            `"${phrase}" appears ${occurrences} times (expected ≤5). ` +
            `This indicates a doom-loop in content generation.`
          );
        }
      });
      // Check for duplicate document structure
      // If the entire document structure repeats, we'll see multiple instances of key sections
      const sectionHeaders = output.match(/##\s+[^\n]+/g) || [];
      const uniqueHeaders = new Set(sectionHeaders.map(h => h.trim()));
      // If we have many more headers than unique ones, content is repeating
      if (sectionHeaders.length > uniqueHeaders.size * 2) {
        throw new Error(
          `Content repetition detected in rendered output section ${index + 1}: ` +
          `Found ${sectionHeaders.length} section headers but only ${uniqueHeaders.size} unique ones. ` +
          `This indicates the entire document is repeating.`
        );
      }
      // Check for repeated code block placeholders (should only appear once per code block)
      const codeBlockPlaceholders: string[] = (output.match(/__CODEBLOCK_\d+__/g) || []);
      const uniquePlaceholders = new Set(codeBlockPlaceholders);
      // Each placeholder should appear only once
      if (codeBlockPlaceholders.length !== uniquePlaceholders.size) {
        const duplicates = codeBlockPlaceholders.filter((p, i) => codeBlockPlaceholders.indexOf(p) !== i);
        throw new Error(
          `Content repetition detected in rendered output section ${index + 1}: ` +
          `Found duplicate code block placeholders: ${Array.from(new Set(duplicates)).join(', ')}. ` +
          `Each placeholder should appear only once.`
        );
      }
      // Check overall content length - if it's unreasonably long, content might be repeating
      // A typical test document should be under 50KB in the rendered output
      if (output.length > 100000) {
        console.warn(
          `⚠️  Rendered output section ${index + 1} is very long (${output.length} chars). ` +
          `This might indicate content repetition.`
        );
      }
    });
    // Test that the markdown content appears only once in the markdown rendered section
    const markdownRenderedMatch = htmlReport.match(
      /<div id="md-rendered"[\s\S]*?<div class="rendered-output">([\s\S]*?)<\/div>/
    );
    if (markdownRenderedMatch) {
      const markdownRendered = markdownRenderedMatch[1];
      // Count how many times the document title appears
      const titleCount = (markdownRendered.match(/# Markdown Test Document/gi) || []).length;
      expect(titleCount).toBeLessThanOrEqual(1);
      // Count how many times a unique section appears
      const uniqueSection = 'Ordered list that is wrongly numbered:';
      const uniqueSectionCount = (markdownRendered.match(new RegExp(escapeRegex(uniqueSection), 'gi')) || []).length;
      expect(uniqueSectionCount).toBeLessThanOrEqual(1);
    }
    // Test that the asciidoc content appears only once in the asciidoc rendered section
    const asciidocRenderedMatch = htmlReport.match(
      /<div id="ad-rendered"[\s\S]*?<div class="rendered-output">([\s\S]*?)<\/div>/
    );
    if (asciidocRenderedMatch) {
      const asciidocRendered = asciidocRenderedMatch[1];
      // Count how many times the document title appears
      const titleCount = (asciidocRendered.match(/== Bullet list/gi) || []).length;
      expect(titleCount).toBeLessThanOrEqual(1);
    }
    console.log('✅ Content repetition check passed - no doom-loop detected');
  });
 });
 /**
  * Make a literal string safe to embed in a `RegExp` source.
  *
  * Every character that has special meaning in a regular expression is
  * prefixed with a backslash, so the resulting pattern matches `str` verbatim.
  *
  * @param str - Text to be matched literally.
  * @returns `str` with all regex metacharacters backslash-escaped.
  */
 function escapeRegex(str: string): string {
   // Character class listing every regex metacharacter.
   const metaCharacters = /[.*+?^${}()|[\]\\]/g;
   // `$&` re-inserts the matched character after the added backslash.
   const escaped = str.replace(metaCharacters, '\\$&');
   return escaped;
 }
--- a/test-report.html
+++ b/test-report.html
@ -247,7 +247,7 @@
 <body>
  <div class="container">
    <h1>GC Parser Test Report</h1>
-    <p class="subtitle">Generated: 4.3.2026, 13:04:08</p>
+    <p class="subtitle">Generated: 4.3.2026, 13:12:35</p>
    <!-- Markdown Section -->
    <div class="section">