/**
* @module pipeline/validation
* @description Post-generation HTML structure validation.
*
* Validates that generated article HTML meets minimum structural requirements
* before the file is written to disk. Failures are non-fatal by default —
* the orchestrator collects validation warnings and continues.
*
* @author Hack23 AB
* @license Apache-2.0
*/
// ---------------------------------------------------------------------------
// Validation result types
// ---------------------------------------------------------------------------
/**
 * Outcome of running the structural checks against one article's HTML.
 */
export interface ArticleValidationResult {
  /** `true` only when no check produced an error. */
  passed: boolean;

  /** One human-readable message per check that succeeded. */
  passedChecks: string[];

  /** One message per failed check; any entry here implies `passed === false`. */
  errors: string[];

  /** Advisory messages; these never influence `passed`. */
  warnings: string[];
}
/**
 * Switches and thresholds that decide which structural checks are enforced.
 * Every field is optional; an omitted field falls back to its documented default.
 */
export interface ValidationOptions {
  /** Fail when no `<h1>` element is found. Default: `true`. */
  requireH1?: boolean;

  /** Fail when the document has no `<h2>` section at all. Default: `true`. */
  requireSections?: boolean;

  /** Fail when the sources footer block cannot be detected. Default: `true`. */
  requireSources?: boolean;

  /** Smallest acceptable number of words in the article. Default: `50`. */
  minWordCount?: number;

  /** Fail when `<html>` lacks a valid `lang="…"` attribute. Default: `true`. */
  requireLangAttr?: boolean;

  /** Fail when the `<!DOCTYPE html>` declaration is missing. Default: `true`. */
  requireDoctype?: boolean;
}
// ---------------------------------------------------------------------------
// Default options
// ---------------------------------------------------------------------------
/** Values used for every option the caller omits or leaves `undefined`. */
const DEFAULT_OPTIONS: Readonly<Required<ValidationOptions>> = {
  requireH1: true,
  requireSections: true,
  requireSources: true,
  minWordCount: 50,
  requireLangAttr: true,
  requireDoctype: true,
};

// ---------------------------------------------------------------------------
// HTML structure validation
// ---------------------------------------------------------------------------

/** Matches the HTML5 doctype, tolerating whitespace before the closing `>`. */
const DOCTYPE_PATTERN = /<!DOCTYPE\s+html\s*>/i;

/**
 * Matches a quoted `lang` attribute on the `<html>` tag whose value is shaped
 * like a BCP 47 tag: a 2–8 letter primary subtag followed by optional
 * `-subtag` parts (`sv`, `sv-SE`, `zh-Hans`). The `\s` before `lang` keeps
 * look-alike attributes such as `data-lang` or `xml:lang` from matching.
 */
const HTML_LANG_PATTERN =
  /<html\b[^>]*?\slang\s*=\s*(["'])[a-z]{2,8}(?:-[a-z0-9]{1,8})*\1/i;

/**
 * Overlay caller overrides on the defaults.
 *
 * Resolved key by key with `??` rather than object spread, so an override
 * that is present but `undefined` keeps the default instead of replacing it.
 */
function resolveValidationOptions(opts: ValidationOptions): Required<ValidationOptions> {
  // Untyped callers may pass `null`; treat it like "no overrides".
  const overrides: ValidationOptions = opts ?? {};
  return {
    requireH1: overrides.requireH1 ?? DEFAULT_OPTIONS.requireH1,
    requireSections: overrides.requireSections ?? DEFAULT_OPTIONS.requireSections,
    requireSources: overrides.requireSources ?? DEFAULT_OPTIONS.requireSources,
    minWordCount: overrides.minWordCount ?? DEFAULT_OPTIONS.minWordCount,
    requireLangAttr: overrides.requireLangAttr ?? DEFAULT_OPTIONS.requireLangAttr,
    requireDoctype: overrides.requireDoctype ?? DEFAULT_OPTIONS.requireDoctype,
  };
}

/**
 * Count whitespace-separated words in the visible part of the document:
 * `<head>`, `<script>` and `<style>` blocks and HTML comments are dropped,
 * the remaining tags are replaced by spaces, and whitespace is collapsed.
 */
function countBodyWords(html: string): number {
  const text = html
    // `\b` stops `<head` from also matching `<header …>`.
    .replace(/<head\b[\s\S]*?<\/head>/gi, '')
    .replace(/<script\b[\s\S]*?<\/script>/gi, '')
    .replace(/<style\b[\s\S]*?<\/style>/gi, '')
    // Remove whole comments first; otherwise a `>` inside one would end the
    // generic tag match early and leak comment text into the count.
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return text === '' ? 0 : text.split(' ').length;
}

/**
 * Validate the structure of a generated article HTML string.
 *
 * This is a lightweight regex / string-based check, not a full DOM parse, and
 * it uses no external dependencies. Each enabled check adds one message to
 * either `passedChecks` or `errors`; the word-count check always runs.
 * `warnings` is returned for interface completeness but no check currently
 * writes to it.
 *
 * @param html - The complete HTML string to validate.
 * @param opts - Optional overrides; omitted or `undefined` fields use the defaults.
 * @returns Structured validation result; `passed` is `true` when `errors` is empty.
 */
export function validateArticleHTML(
  html: string,
  opts: ValidationOptions = {},
): ArticleValidationResult {
  const options = resolveValidationOptions(opts);
  const errors: string[] = [];
  const warnings: string[] = [];
  const passedChecks: string[] = [];

  // Runtime guard for untyped callers: nothing else can be checked without a string.
  if (!html || typeof html !== 'string') {
    return {
      passed: false,
      passedChecks,
      errors: ['HTML is empty or not a string'],
      warnings,
    };
  }

  /** File the outcome of one check under the matching list. */
  const record = (ok: boolean, passMessage: string, failMessage: string): void => {
    if (ok) {
      passedChecks.push(passMessage);
    } else {
      errors.push(failMessage);
    }
  };

  // --- DOCTYPE ---
  if (options.requireDoctype) {
    record(
      DOCTYPE_PATTERN.test(html),
      'DOCTYPE present',
      'Missing <!DOCTYPE html> declaration',
    );
  }

  // --- lang attribute ---
  if (options.requireLangAttr) {
    record(
      HTML_LANG_PATTERN.test(html),
      'lang attribute present',
      'Missing valid lang attribute on <html> element',
    );
  }

  // --- H1 ---
  if (options.requireH1) {
    record(
      /<h1[^>]*>[\s\S]+?<\/h1>/i.test(html),
      'H1 heading present',
      'Missing <h1> heading',
    );
  }

  // --- Sections (H2) ---
  if (options.requireSections) {
    const h2Count = html.match(/<h2[^>]*>/gi)?.length ?? 0;
    record(
      h2Count >= 1,
      `${h2Count} <h2> section(s) present`,
      'No <h2> sections found — article content may be missing',
    );
  }

  // --- Sources footer ---
  if (options.requireSources) {
    // Any of these marker strings anywhere in the document counts as a sources block.
    record(
      /article-sources|data-sources|riksdag-regering-mcp/i.test(html),
      'Sources block present',
      'Sources footer block not detected — article may lack attribution',
    );
  }

  // --- Word count (article body only) ---
  const wordCount = countBodyWords(html);
  record(
    wordCount >= options.minWordCount,
    `Word count ${wordCount} meets minimum ${options.minWordCount}`,
    `Word count ${wordCount} is below minimum ${options.minWordCount} — article content may be too thin`,
  );

  return {
    passed: errors.length === 0,
    passedChecks,
    errors,
    warnings,
  };
}
/**
 * Run {@link validateArticleHTML} over every article in a batch.
 *
 * The same `opts` are applied to each article, and the output keeps the input
 * order: one entry per article, carrying the validation result plus the
 * article's `filename`.
 *
 * @param articles - Array of `{ filename, html }` objects.
 * @param opts - Optional validation configuration shared by the whole batch.
 * @returns Array of per-article validation results, each tagged with its filename.
 */
export function validateArticleBatch(
  articles: ReadonlyArray<{ filename: string; html: string }>,
  opts: ValidationOptions = {},
): Array<ArticleValidationResult & { filename: string }> {
  return articles.map((article) => {
    const { filename, html } = article;
    const outcome = validateArticleHTML(html, opts);
    // `filename` goes first so the result's own fields follow it, as before.
    return { filename, ...outcome };
  });
}