Browse Source

add repetition test

master
Silberengel 2 weeks ago
parent
commit
23bd727ccd
  1. 69
      src/utils/report-generator.ts
  2. 118
      test-parser-report.test.ts
  3. 2
      test-report.html

69
src/utils/report-generator.ts

@ -340,7 +340,7 @@ export function generateHTMLReport(data: ReportData): string {
<div id="md-rendered" class="tab-content"> <div id="md-rendered" class="tab-content">
<h3>Rendered HTML Output</h3> <h3>Rendered HTML Output</h3>
<div class="rendered-output"> <div class="rendered-output">
${markdown.result.content} ${cleanHtmlContent(markdown.result.content)}
</div> </div>
<details style="margin-top: 15px;"> <details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary> <summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
@ -476,7 +476,7 @@ export function generateHTMLReport(data: ReportData): string {
<div id="ad-rendered" class="tab-content"> <div id="ad-rendered" class="tab-content">
<h3>Rendered HTML Output</h3> <h3>Rendered HTML Output</h3>
<div class="rendered-output"> <div class="rendered-output">
${asciidoc.result.content} ${cleanHtmlContent(asciidoc.result.content)}
</div> </div>
<details style="margin-top: 15px;"> <details style="margin-top: 15px;">
<summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary> <summary style="cursor: pointer; color: #3498db; font-weight: 500;">View Raw HTML</summary>
@ -571,6 +571,71 @@ export function generateHTMLReport(data: ReportData): string {
</html>`; </html>`;
} }
/**
 * Reduce an HTML string to the markup that belongs inside a page body.
 *
 * If the input contains a `<body ...>` start tag followed later by a
 * `</body>` end tag, only the text between the first start tag and the last
 * end tag is kept. Any DOCTYPE declaration, `<html>`/`<body>` tags and whole
 * `<head>...</head>` blocks that remain are then removed.
 *
 * Tag names are matched case-insensitively and must end at a tag-name
 * boundary, so elements such as `<header>` are left intact.
 *
 * @param html - Markup that may be a fragment or a complete HTML document.
 * @returns The trimmed body markup, or an empty string when `html` is empty
 *          or not a string.
 */
function cleanHtmlContent(html: string): string {
  if (!html || typeof html !== 'string') {
    return '';
  }

  // Upper bound on strip passes; normally the loop stops after two.
  const maxStripPasses = 10;

  // `(?=[\s>\/])` pins the end of the tag name so `<head` does not match
  // `<header>`, `<body` does not match `<bodyguard>`, and so on.
  const structuralPatterns: readonly RegExp[] = [
    /<!DOCTYPE[^>]*>/gi,
    /<html(?=[\s>\/])[^>]*>/gi,
    /<\/html\s*>/gi,
    /<head(?=[\s>\/])[^>]*>[\s\S]*?<\/head\s*>/gi,
    /<body(?=[\s>\/])[^>]*>/gi,
    /<\/body\s*>/gi,
  ];

  let cleaned = html.trim();

  // Keep only what lies between the first <body ...> and the last </body>,
  // which also covers documents that were nested inside one another.
  const openBody = /<body(?=[\s>\/])[^>]*>/i.exec(cleaned);
  if (openBody !== null) {
    const bodyStart = openBody.index + openBody[0].length;

    let bodyEnd = -1;
    const closeBody = /<\/body\s*>/gi;
    for (let m = closeBody.exec(cleaned); m !== null; m = closeBody.exec(cleaned)) {
      bodyEnd = m.index;
    }

    if (bodyEnd > bodyStart) {
      cleaned = cleaned.substring(bodyStart, bodyEnd).trim();
    }
  }

  // Removing one tag can expose another (e.g. "<bo<body>dy>"), so repeat
  // until the text stops shrinking or the pass limit is reached.
  let previousLength = -1;
  for (
    let pass = 0;
    pass < maxStripPasses && cleaned.length !== previousLength;
    pass++
  ) {
    previousLength = cleaned.length;
    for (const pattern of structuralPatterns) {
      cleaned = cleaned.replace(pattern, '');
    }
    cleaned = cleaned.trim();
  }

  return cleaned;
}
/** /**
* Escape HTML special characters * Escape HTML special characters
*/ */

118
test-parser-report.test.ts

@ -264,5 +264,123 @@ describe('Parser Test Report', () => {
if (markdownResult.hasMusicalNotation) { if (markdownResult.hasMusicalNotation) {
expect(htmlReport).toMatch(/<div class="number">Yes<\/div>.*Has Music/i); expect(htmlReport).toMatch(/<div class="number">Yes<\/div>.*Has Music/i);
} }
// ============================================
// Test for Content Repetition (Doom Loop Fix)
// ============================================
// Extract rendered output sections from the HTML report
const renderedOutputRegex = /<div class="rendered-output">([\s\S]*?)<\/div>/gi;
const renderedOutputs: string[] = [];
let match;
while ((match = renderedOutputRegex.exec(htmlReport)) !== null) {
renderedOutputs.push(match[1]);
}
// Test that we have rendered output sections
expect(renderedOutputs.length).toBeGreaterThan(0);
// Test each rendered output section for content repetition
renderedOutputs.forEach((output, index) => {
// Check for specific content that should only appear once
const testPhrases = [
'# Markdown Test Document',
'## Bullet list',
'This is a test unordered list with mixed bullets:',
'## Headers',
'## Media and Links',
'### Nostr address',
'## Tables',
'## Code blocks',
'## LateX',
];
testPhrases.forEach(phrase => {
// Count occurrences of the phrase in this output section
const occurrences = (output.match(new RegExp(escapeRegex(phrase), 'gi')) || []).length;
// Each phrase should appear at most once (or a few times if it's in different contexts)
// But if it appears many times, that indicates a repetition loop
if (occurrences > 5) {
throw new Error(
`Content repetition detected in rendered output section ${index + 1}: ` +
`"${phrase}" appears ${occurrences} times (expected ≤5). ` +
`This indicates a doom-loop in content generation.`
);
}
}); });
// Check for duplicate document structure
// If the entire document structure repeats, we'll see multiple instances of key sections
const sectionHeaders = output.match(/##\s+[^\n]+/g) || [];
const uniqueHeaders = new Set(sectionHeaders.map(h => h.trim()));
// If we have many more headers than unique ones, content is repeating
if (sectionHeaders.length > uniqueHeaders.size * 2) {
throw new Error(
`Content repetition detected in rendered output section ${index + 1}: ` +
`Found ${sectionHeaders.length} section headers but only ${uniqueHeaders.size} unique ones. ` +
`This indicates the entire document is repeating.`
);
}
// Check for repeated code block placeholders (should only appear once per code block)
const codeBlockPlaceholders: string[] = (output.match(/__CODEBLOCK_\d+__/g) || []);
const uniquePlaceholders = new Set(codeBlockPlaceholders);
// Each placeholder should appear only once
if (codeBlockPlaceholders.length !== uniquePlaceholders.size) {
const duplicates = codeBlockPlaceholders.filter((p, i) => codeBlockPlaceholders.indexOf(p) !== i);
throw new Error(
`Content repetition detected in rendered output section ${index + 1}: ` +
`Found duplicate code block placeholders: ${Array.from(new Set(duplicates)).join(', ')}. ` +
`Each placeholder should appear only once.`
);
}
// Check overall content length - if it's unreasonably long, content might be repeating
// A typical test document should be under 50KB in the rendered output
if (output.length > 100000) {
console.warn(
` Rendered output section ${index + 1} is very long (${output.length} chars). ` +
`This might indicate content repetition.`
);
}
});
// Test that the markdown content appears only once in the markdown rendered section
const markdownRenderedMatch = htmlReport.match(
/<div id="md-rendered"[\s\S]*?<div class="rendered-output">([\s\S]*?)<\/div>/
);
if (markdownRenderedMatch) {
const markdownRendered = markdownRenderedMatch[1];
// Count how many times the document title appears
const titleCount = (markdownRendered.match(/# Markdown Test Document/gi) || []).length;
expect(titleCount).toBeLessThanOrEqual(1);
// Count how many times a unique section appears
const uniqueSection = 'Ordered list that is wrongly numbered:';
const uniqueSectionCount = (markdownRendered.match(new RegExp(escapeRegex(uniqueSection), 'gi')) || []).length;
expect(uniqueSectionCount).toBeLessThanOrEqual(1);
}
// Test that the asciidoc content appears only once in the asciidoc rendered section
const asciidocRenderedMatch = htmlReport.match(
/<div id="ad-rendered"[\s\S]*?<div class="rendered-output">([\s\S]*?)<\/div>/
);
if (asciidocRenderedMatch) {
const asciidocRendered = asciidocRenderedMatch[1];
// Count how many times the document title appears
const titleCount = (asciidocRendered.match(/== Bullet list/gi) || []).length;
expect(titleCount).toBeLessThanOrEqual(1);
}
console.log('✅ Content repetition check passed - no doom-loop detected');
}); });
});
/**
 * Make a string safe to embed in a `RegExp` source so it matches literally.
 *
 * @param str - Text that may contain regex metacharacters.
 * @returns `str` with every metacharacter preceded by a backslash.
 */
function escapeRegex(str: string): string {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metaCharacters, (ch) => '\\' + ch);
}

2
test-report.html

@ -247,7 +247,7 @@
<body> <body>
<div class="container"> <div class="container">
<h1>GC Parser Test Report</h1> <h1>GC Parser Test Report</h1>
<p class="subtitle">Generated: 4.3.2026, 13:04:08</p> <p class="subtitle">Generated: 4.3.2026, 13:12:35</p>
<!-- Markdown Section --> <!-- Markdown Section -->
<div class="section"> <div class="section">

Loading…
Cancel
Save