// Coverage summary for scripts/check-analysis-language.ts:
//   Statements 62.16% (46/74) · Branches 60.52% (23/38) · Functions 72.72% (8/11) · Lines 62.12% (41/66)
/**
 * Analysis Language Checker — English-only enforcement for analysis artifacts
 *
 * Enforces English-only prose in all analysis artifacts under analysis/daily by scanning
 * for Swedish function words and political-vocabulary tokens. Exempts translation outputs
 * (executive-brief_<lang>.md), article.<lang>.md, Pass-1 snapshots (pass1/), raw source
 * material (full-text/), data-download-manifest.md, and per-folder README.md files.
 *
 * @module scripts/check-analysis-language
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { readdirSync, readFileSync, statSync } from 'node:fs';
import { join, relative } from 'node:path';
import { pathToFileURL } from 'node:url';
 
/**
 * Swedish function words (conjunctions, auxiliaries, pronouns, adverbs) used
 * as language markers.
 *
 * NOTE(review): `den` is also an English noun; a single hit cannot trigger a
 * violation because the gate additionally requires a minimum marker count.
 */
const SWEDISH_FUNCTION_WORDS: readonly string[] = [
  'och', 'att', 'för', 'inte', 'är', 'den', 'det',
  'har', 'hade', 'kommer', 'skall', 'måste', 'enligt', 'samt',
  'därför', 'därmed', 'genom', 'vidare', 'följande', 'sina',
  'sitt', 'vilket', 'vilken', 'något', 'några', 'denna', 'dessa',
  'varje', 'övriga', 'övrig', 'tillika', 'därutöver', 'härmed', 'härav',
];

/**
 * Swedish political vocabulary in common-noun / verb form only.
 *
 * Proper nouns such as Riksdagen / Regeringen are allowed verbatim in English
 * prose and are therefore deliberately left out to avoid false positives.
 */
const SWEDISH_POLITICAL_TERMS: readonly string[] = [
  'propositionen', 'utskottet',
  'föreslår', 'föreslagit', 'införande', 'införa', 'införs', 'införts',
  'säkerhetshot', 'utvisning', 'utvisa', 'beslutet',
];

/**
 * Lowercase tokens treated as evidence of Swedish prose. Membership is tested
 * against lowercased word tokens when computing Swedish-marker density.
 */
const SWEDISH_MARKERS = new Set<string>([
  ...SWEDISH_FUNCTION_WORDS,
  ...SWEDISH_POLITICAL_TERMS,
]);

/** Marker density (markers / total words) that must be exceeded for a violation: 0.05 = 5 %. */
const SWEDISH_DENSITY_THRESHOLD = 0.05;

/** Smallest absolute marker count that can count as a violation; guards short snippets against false positives. */
const MIN_SWEDISH_MARKERS = 5;
 
/**
 * Reduce a Markdown document to the analytical prose that should be
 * language-checked.
 *
 * Removed, in this order:
 *   1. A YAML frontmatter block at the very start of the file (an optional
 *      leading UTF-8 BOM is tolerated and removed with it).
 *   2. Fenced code blocks (```...```).
 *   3. Inline code spans (`...`). A span may wrap across a single line break
 *      but never across a blank line.
 *   4. Markdown blockquote lines (`>`) — attributed source quotations.
 *   5. Source-attribution lines labelled `Source title:`, `Källa:`,
 *      `Källtitel:` or `Original title:` (optionally bulleted / bolded).
 *
 * Items 4 and 5 are the two contexts in which the prompt contract (see
 * `.github/prompts/04-analysis-pipeline.md` and
 * `analysis/methodologies/per-artifact-methodologies.md`) permits Swedish
 * text, so a long Swedish quote cannot fail the gate when the surrounding
 * prose is English.
 *
 * @param content - Raw Markdown file content.
 * @returns The content with the ranges above replaced by empty strings; line
 *          breaks around removed single-line ranges are kept.
 */
export function stripMarkdownCodeAndFrontmatter(content: string): string {
  let body = content;

  // Frontmatter: anchored at the start of the file only. Do NOT add the `m`
  // flag — `^---` would then also match a thematic break later in the body
  // and strip everything between two `---` rules, hiding Swedish prose.
  // `\uFEFF?` lets a file saved with a UTF-8 BOM still match the anchor.
  body = body.replace(/^\uFEFF?---\r?\n[\s\S]*?\r?\n---\r?\n/, '');

  // Fenced code blocks.
  body = body.replace(/```[\s\S]*?```/g, '');

  // Inline code spans. The span body may contain a line break only when the
  // following line is not blank, so two stray backticks in different
  // paragraphs cannot pair up and swallow the prose between them.
  body = body.replace(/`(?:[^`\n]|\n(?![ \t]*\r?\n))+`/g, '');

  // Blockquote lines (attributed source quotations may remain in Swedish).
  body = body.replace(/^[ \t]*>[^\n]*$/gm, '');

  // Verbatim source-title lines. Only the explicit source-attribution labels
  // are exempt — a bare `Title:` label is NOT, otherwise Swedish prose on a
  // `Title:` line would silently pass the gate.
  body = body.replace(
    /^[ \t]*(?:[-*+][ \t]+)?\**(?:Source title|Källa|Källtitel|Original title)\**[ \t]*:[^\n]*$/gim,
    ''
  );

  return body;
}
 
/**
 * Split text into lowercase word tokens.
 *
 * A token is a maximal run of Latin letters: A-Z, a-z and the accented
 * letters U+00C0–U+017E (which includes Swedish å ä ö / Å Ä Ö). The
 * multiplication sign (U+00D7) and division sign (U+00F7) sit inside that
 * code-point range but are not letters, so they are excluded.
 *
 * The input is normalised to NFC first so that decomposed sequences such as
 * `o` + U+0308 are seen as the single letter `ö`; without this, decomposed
 * text would be split at the combining mark and never match a marker word.
 *
 * @param text - Prose to tokenize.
 * @returns Lowercased tokens in document order; an empty array when the text
 *          contains no letters.
 */
export function tokenizeWords(text: string): string[] {
  const matches = text.normalize('NFC').match(/[A-Za-zÀ-ÖØ-öø-ž]+/g);
  return matches ? matches.map(w => w.toLowerCase()) : [];
}
 
/**
 * Measure how much of a Markdown file's prose consists of Swedish markers.
 *
 * The file is read as UTF-8, reduced to prose with
 * `stripMarkdownCodeAndFrontmatter`, tokenized with `tokenizeWords`, and each
 * token is looked up in `SWEDISH_MARKERS`.
 *
 * @param filepath - Path of the Markdown file to read.
 * @returns `totalWords` (token count), `swedishMarkerCount` (tokens found in
 *          the marker set) and `density` (their ratio, or 0 when the file has
 *          no word tokens).
 */
export function calculateSwedishDensity(filepath: string): {
  totalWords: number;
  swedishMarkerCount: number;
  density: number;
} {
  const tokens = tokenizeWords(
    stripMarkdownCodeAndFrontmatter(readFileSync(filepath, 'utf-8'))
  );

  let swedishMarkerCount = 0;
  for (const token of tokens) {
    if (SWEDISH_MARKERS.has(token)) swedishMarkerCount += 1;
  }

  const totalWords = tokens.length;
  // Guard the division so an empty document reports density 0 rather than NaN.
  if (totalWords === 0) {
    return { totalWords, swedishMarkerCount, density: 0 };
  }
  return { totalWords, swedishMarkerCount, density: swedishMarkerCount / totalWords };
}
 
/**
 * Recursively collect the `.md` files under `dir` that are subject to the
 * English-only check.
 *
 * Skipped directories:
 * - `pass1/` (Pass-1 snapshots)
 * - `full-text/` (raw downloaded Swedish source material — not generated
 *   analysis; failing it on Swedish density would be incorrect)
 *
 * Skipped files:
 * - `executive-brief_<lang>.md` (translation outputs)
 * - `article.<lang>.md` (forbidden — caught by validate-file-ownership)
 * - `data-download-manifest.md` (exempt — heavy Swedish source titles)
 * - `README.md` (per-folder index, not aggregated into article.md)
 *
 * `<lang>` is matched as exactly two lowercase ASCII letters.
 *
 * @param dir - Root directory to walk.
 * @returns Paths of the matching files, each built by joining `dir` with the
 *          relative location, in directory-listing order.
 * @throws Whatever `readdirSync` throws when a directory cannot be read.
 */
export function findAnalysisMarkdownFiles(dir: string): string[] {
  const files: string[] = [];

  function walk(currentDir: string): void {
    const entries = readdirSync(currentDir, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = join(currentDir, entry.name);

      if (entry.isDirectory()) {
        // Skip pass1/ snapshots and full-text/ raw source material.
        if (entry.name === 'pass1' || entry.name === 'full-text') continue;
        walk(fullPath);
      } else if (entry.isFile() && entry.name.endsWith('.md')) {
        // Skip executive-brief_<lang>.md (translation outputs)
        if (/^executive-brief_[a-z]{2}\.md$/.test(entry.name)) continue;

        // Skip article.<lang>.md (now forbidden — caught by ownership validator)
        if (/^article\.[a-z]{2}\.md$/.test(entry.name)) continue;

        // Skip data-download-manifest.md (exempt — heavy Swedish source titles)
        if (entry.name === 'data-download-manifest.md') continue;

        // Skip per-folder README.md (index file, not aggregated into article.md)
        if (entry.name === 'README.md') continue;

        files.push(fullPath);
      }
    }
  }

  walk(dir);
  return files;
}
 
/**
 * One analysis file that failed the English-only check, as reported by
 * `validateAnalysisLanguage`.
 */
export interface LanguageViolation {
  /** Path of the offending file as produced by the directory walk. */
  filepath: string;
  /** `filepath` expressed relative to the current working directory (used for display). */
  relpath: string;
  /** Number of word tokens in the file's prose after stripping exempt ranges. */
  totalWords: number;
  /** Number of those tokens that are Swedish markers. */
  swedishMarkerCount: number;
  /** `swedishMarkerCount / totalWords`. */
  density: number;
}
 
/**
 * Check every eligible Markdown file under `analysisDir` for Swedish prose.
 *
 * A file is a violation only when BOTH conditions hold: its Swedish-marker
 * density is strictly greater than `SWEDISH_DENSITY_THRESHOLD`, and it
 * contains at least `MIN_SWEDISH_MARKERS` marker tokens.
 *
 * @param analysisDir - Root directory whose analysis artifacts are checked.
 * @returns One `LanguageViolation` per offending file, in the order the files
 *          were discovered; an empty array when everything passes.
 */
export function validateAnalysisLanguage(analysisDir: string): LanguageViolation[] {
  return findAnalysisMarkdownFiles(analysisDir).flatMap((filepath): LanguageViolation[] => {
    const stats = calculateSwedishDensity(filepath);

    const enoughMarkers = stats.swedishMarkerCount >= MIN_SWEDISH_MARKERS;
    const tooDense = stats.density > SWEDISH_DENSITY_THRESHOLD;
    if (!(tooDense && enoughMarkers)) return [];

    return [
      {
        filepath,
        relpath: relative(process.cwd(), filepath),
        totalWords: stats.totalWords,
        swedishMarkerCount: stats.swedishMarkerCount,
        density: stats.density,
      },
    ];
  });
}
 
/**
 * Render violations as a Markdown-style table for console output.
 *
 * @param violations - Violations to list, one table row each.
 * @returns A header line, an alignment line and one row per violation joined
 *          with `\n` (no trailing newline); the empty string when there are
 *          no violations. Density is printed with three decimals.
 */
export function formatViolationTable(violations: LanguageViolation[]): string {
  if (violations.length === 0) return '';

  const lines: string[] = [
    '| File | Words | Swedish | Density |',
    '|------|------:|--------:|--------:|',
  ];

  for (const { relpath, totalWords, swedishMarkerCount, density } of violations) {
    lines.push(`| ${relpath} | ${totalWords} | ${swedishMarkerCount} | ${density.toFixed(3)} |`);
  }

  return lines.join('\n');
}
 
/**
 * CLI entry point: run the English-only check on one directory and exit.
 *
 * Usage:
 *   npx tsx scripts/check-analysis-language.ts [analysis-dir]
 *   npm run check:analysis-language -- [analysis-dir]
 *
 * The directory is taken from the first CLI argument. When it is missing or
 * blank, `analysis/daily` is scanned so that a repo-wide audit needs no
 * arguments. The analysis gate (`05-analysis-gate.md`) always passes the
 * per-run `$ANALYSIS_DIR`.
 *
 * Exit codes: 1 when the path cannot be stat'ed, is not a directory, or at
 * least one violation is found (details go to stderr); 0 otherwise, after
 * printing the number of files checked to stdout.
 */
export async function main() {
  const [dirArg] = process.argv.slice(2);
  const analysisDir = dirArg && dirArg.trim().length > 0 ? dirArg : 'analysis/daily';

  // The target must exist and be a directory; any statSync failure is
  // reported as "does not exist".
  try {
    if (!statSync(analysisDir).isDirectory()) {
      console.error(`❌ analysis-language: ${analysisDir} is not a directory`);
      process.exit(1);
    }
  } catch {
    console.error(`❌ analysis-language: ${analysisDir} does not exist`);
    process.exit(1);
  }

  const violations = validateAnalysisLanguage(analysisDir);
  const hasViolations = violations.length > 0;

  if (hasViolations) {
    console.error(`❌ analysis-language: ${violations.length} violation(s) detected (Swedish density > ${SWEDISH_DENSITY_THRESHOLD})\n`);
    console.error(formatViolationTable(violations));
    process.exit(1);
  }

  // The validator returns only violations, so the tree is walked once more
  // to report how many files were checked.
  const checkedFiles = findAnalysisMarkdownFiles(analysisDir);
  console.log(`✅ analysis-language: 0 violations across ${checkedFiles.length} files (English-only)`);
  process.exit(0);
}
 
// Run the CLI only when this module is the process entry point (not when it
// is imported, e.g. by tests). `pathToFileURL` turns the script path into a
// file URL with the same encoding rules as `import.meta.url`, so the check
// also holds for paths containing spaces, non-ASCII characters, or Windows
// drive letters. `process.argv[1]` may be absent, in which case nothing runs.
if (process.argv[1] !== undefined && import.meta.url === pathToFileURL(process.argv[1]).href) {
  void main();
}