diff --git a/src/utils/report-generator.ts b/src/utils/report-generator.ts index 9fce9a9..52c7530 100644 --- a/src/utils/report-generator.ts +++ b/src/utils/report-generator.ts @@ -340,7 +340,7 @@ export function generateHTMLReport(data: ReportData): string {

Rendered HTML Output

- ${markdown.result.content} + ${cleanHtmlContent(markdown.result.content)}
View Raw HTML @@ -476,7 +476,7 @@ export function generateHTMLReport(data: ReportData): string {

Rendered HTML Output

- ${asciidoc.result.content} + ${cleanHtmlContent(asciidoc.result.content)}
View Raw HTML @@ -571,6 +571,71 @@ export function generateHTMLReport(data: ReportData): string { `; } +/** + * Clean HTML content to extract only the body content + * Removes full HTML document structure if present + * Prevents infinite loops by ensuring we only extract once and handle nested structures + */ +function cleanHtmlContent(html: string): string { + if (!html || typeof html !== 'string') { + return ''; + } + + let cleaned = html.trim(); + + // Count occurrences to detect nested structures + const htmlTagCount = (cleaned.match(/]*>/gi) || []).length; + const bodyTagCount = (cleaned.match(/]*>/gi) || []).length; + const bodyCloseCount = (cleaned.match(/<\/body>/gi) || []).length; + + // If we have multiple body tags, there might be nested structures + // Extract only the outermost body content + if (bodyTagCount > 0 && bodyCloseCount > 0) { + // Find the first tag + const firstBodyIndex = cleaned.indexOf(' of the first body tag + const bodyTagEnd = cleaned.indexOf('>', firstBodyIndex); + if (bodyTagEnd !== -1) { + const bodyStart = bodyTagEnd + 1; + // Find the last tag (to handle nested structures) + const bodyEnd = cleaned.lastIndexOf(''); + + if (bodyEnd > bodyStart) { + cleaned = cleaned.substring(bodyStart, bodyEnd).trim(); + + // Recursively clean if there are still nested structures + // But limit recursion to prevent infinite loops + const remainingBodyTags = (cleaned.match(/]*>/gi) || []).length; + if (remainingBodyTags > 0 && remainingBodyTags < bodyTagCount) { + // There are still nested body tags, clean again but only once more + cleaned = cleaned.replace(/]*>/gi, ''); + cleaned = cleaned.replace(/<\/body>/gi, ''); + } + } + } + } + } + + // Remove any remaining DOCTYPE, html, head, or body tags that might be left + // Do this in a way that doesn't create nested matches + let previousLength = 0; + let iterations = 0; + while (iterations < 10 && cleaned.length !== previousLength) { + previousLength = cleaned.length; + cleaned = cleaned.replace(/]*>/gi, ''); + cleaned = cleaned.replace(/]*>/gi, ''); + cleaned = cleaned.replace(/<\/html>/gi, ''); + cleaned = cleaned.replace(/]*>[\s\S]*?<\/head>/gi, ''); + cleaned = cleaned.replace(/]*>/gi, ''); + cleaned = cleaned.replace(/<\/body>/gi, ''); + cleaned = cleaned.trim(); + iterations++; + } + + return cleaned; +} + /** * Escape HTML special characters */ diff --git a/test-parser-report.test.ts b/test-parser-report.test.ts index 1dd87a5..69ebcb3 100644 --- a/test-parser-report.test.ts +++ b/test-parser-report.test.ts @@ -264,5 +264,123 @@ describe('Parser Test Report', () => { if (markdownResult.hasMusicalNotation) { expect(htmlReport).toMatch(/
Yes<\/div>.*Has Music/i); } + + // ============================================ + // Test for Content Repetition (Doom Loop Fix) + // ============================================ + // Extract rendered output sections from the HTML report + const renderedOutputRegex = /
([\s\S]*?)<\/div>/gi; + const renderedOutputs: string[] = []; + let match; + while ((match = renderedOutputRegex.exec(htmlReport)) !== null) { + renderedOutputs.push(match[1]); + } + + // Test that we have rendered output sections + expect(renderedOutputs.length).toBeGreaterThan(0); + + // Test each rendered output section for content repetition + renderedOutputs.forEach((output, index) => { + // Check for specific content that should only appear once + const testPhrases = [ + '# Markdown Test Document', + '## Bullet list', + 'This is a test unordered list with mixed bullets:', + '## Headers', + '## Media and Links', + '### Nostr address', + '## Tables', + '## Code blocks', + '## LateX', + ]; + + testPhrases.forEach(phrase => { + // Count occurrences of the phrase in this output section + const occurrences = (output.match(new RegExp(escapeRegex(phrase), 'gi')) || []).length; + + // Each phrase should appear at most once (or a few times if it's in different contexts) + // But if it appears many times, that indicates a repetition loop + if (occurrences > 5) { + throw new Error( + `Content repetition detected in rendered output section ${index + 1}: ` + + `"${phrase}" appears ${occurrences} times (expected ≤5). ` + + `This indicates a doom-loop in content generation.` + ); + } + }); + + // Check for duplicate document structure + // If the entire document structure repeats, we'll see multiple instances of key sections + const sectionHeaders = output.match(/##\s+[^\n]+/g) || []; + const uniqueHeaders = new Set(sectionHeaders.map(h => h.trim())); + + // If we have many more headers than unique ones, content is repeating + if (sectionHeaders.length > uniqueHeaders.size * 2) { + throw new Error( + `Content repetition detected in rendered output section ${index + 1}: ` + + `Found ${sectionHeaders.length} section headers but only ${uniqueHeaders.size} unique ones. ` + + `This indicates the entire document is repeating.` + ); + } + + // Check for repeated code block placeholders (should only appear once per code block) + const codeBlockPlaceholders: string[] = (output.match(/__CODEBLOCK_\d+__/g) || []); + const uniquePlaceholders = new Set(codeBlockPlaceholders); + + // Each placeholder should appear only once + if (codeBlockPlaceholders.length !== uniquePlaceholders.size) { + const duplicates = codeBlockPlaceholders.filter((p, i) => codeBlockPlaceholders.indexOf(p) !== i); + throw new Error( + `Content repetition detected in rendered output section ${index + 1}: ` + + `Found duplicate code block placeholders: ${Array.from(new Set(duplicates)).join(', ')}. ` + + `Each placeholder should appear only once.` + ); + } + + // Check overall content length - if it's unreasonably long, content might be repeating + // A typical test document should be under 50KB in the rendered output + if (output.length > 100000) { + console.warn( + `⚠️ Rendered output section ${index + 1} is very long (${output.length} chars). ` + + `This might indicate content repetition.` + ); + } + }); + + // Test that the markdown content appears only once in the markdown rendered section + const markdownRenderedMatch = htmlReport.match( + /
([\s\S]*?)<\/div>/ + ); + if (markdownRenderedMatch) { + const markdownRendered = markdownRenderedMatch[1]; + // Count how many times the document title appears + const titleCount = (markdownRendered.match(/# Markdown Test Document/gi) || []).length; + expect(titleCount).toBeLessThanOrEqual(1); + + // Count how many times a unique section appears + const uniqueSection = 'Ordered list that is wrongly numbered:'; + const uniqueSectionCount = (markdownRendered.match(new RegExp(escapeRegex(uniqueSection), 'gi')) || []).length; + expect(uniqueSectionCount).toBeLessThanOrEqual(1); + } + + // Test that the asciidoc content appears only once in the asciidoc rendered section + const asciidocRenderedMatch = htmlReport.match( + /
([\s\S]*?)<\/div>/ + ); + if (asciidocRenderedMatch) { + const asciidocRendered = asciidocRenderedMatch[1]; + // Count how many times the document title appears + const titleCount = (asciidocRendered.match(/== Bullet list/gi) || []).length; + expect(titleCount).toBeLessThanOrEqual(1); + } + + console.log('✅ Content repetition check passed - no doom-loop detected'); }); }); + +/** + * Escape special regex characters in a string + */ +function escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} diff --git a/test-report.html b/test-report.html index 16d7a60..2586faf 100644 --- a/test-report.html +++ b/test-report.html @@ -247,7 +247,7 @@

GC Parser Test Report

-

Generated: 4.3.2026, 13:04:08

+

Generated: 4.3.2026, 13:12:35