All files / scripts/news-types/weekly-review data-loader.ts

85.93% Statements 110/128
69.4% Branches 93/134
93.75% Functions 15/16
87.82% Lines 101/115

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344                                              8x     31x                                         21x 21x   21x                         84x       84x 84x 84x       84x     84x                                       21x     21x 21x 21x 273x 273x       21x 21x 21x 840x 840x   168x 840x 840x 840x 840x   840x                                       21x     21x 21x 168x 84x 21x 21x 21x     21x 21x 21x 756x 756x 63x 63x     21x     21x   21x                 21x 105x                 21x 21x 21x 21x 10500x   21x     21x           21x                         21x 21x   21x 32x   32x 74x     74x   74x 74x 74x                       440x 74x 292x 292x   74x   74x 74x         74x       74x 74x 50x   74x 74x 74x             32x 11x       32x 
                  21x   1x 1x 2x 2x                 1x 3x 3x 3x                                   21x 21x     21x   5x 16x   6x     10x     21x 15x                  
/**
 * @module news-types/weekly-review/data-loader
 * @description Data loading and processing utilities for weekly-review articles.
 * Handles CIA context loading, CSV parsing, document full-text enrichment,
 * and speech attachment.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { readFileSync, existsSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import Papa from 'papaparse';
import { MCPClient } from '../../mcp-client.js';
import {
  isPersonProfileText,
  type RawDocument,
  type CIAContext,
} from '../../data-transformers.js';
import type { MCPCallRecord } from '../../types/article.js';
 
/**
 * Party codes of the current Riksdag parties (2022 election onwards).
 * loadCIAContext uses this set as an allow-list: performance-metric rows whose
 * `party` column is not one of these codes are skipped.
 */
const RIKSDAG_PARTIES = new Set(['M', 'SD', 'KD', 'L', 'C', 'S', 'V', 'MP']);
 
/**
 * Format a date as the calendar-day part of its UTC ISO-8601 timestamp
 * (everything before the `T` separator, e.g. `2024-03-05`).
 *
 * @param date - Date to format; defaults to the current time.
 * @returns The date portion of `date.toISOString()`.
 * @throws RangeError when `date` is an invalid Date (raised by `toISOString`).
 */
export function formatDateForSlug(date: Date = new Date()): string {
  const isoTimestamp = date.toISOString();
  const separatorAt = isoTimestamp.indexOf('T');
  // No separator means there is nothing to cut off; return the text unchanged.
  return separatorAt < 0 ? isoTimestamp : isoTimestamp.slice(0, separatorAt);
}
 
/**
 * Locate the repository's `data` directory.
 *
 * The directory is normally derived from this module's own location
 * (scripts/news-types/weekly-review/ → three levels up → data/). When the
 * module URL cannot be turned into a file path, the current working
 * directory is used as the base instead.
 *
 * @returns Path to the `data` directory.
 */
export function repoDataDir(): string {
  let moduleDir: string | undefined;
  try {
    moduleDir = dirname(fileURLToPath(import.meta.url));
  } catch {
    // Module URL unavailable or not a file URL: fall back to the working directory.
    moduleDir = undefined;
  }

  if (moduleDir === undefined) {
    return join(process.cwd(), 'data');
  }
  // From scripts/news-types/weekly-review/ → up three levels to repo root → data/
  return join(moduleDir, '..', '..', '..', 'data');
}
 
/**
 * Locate the `cia-data` directory at the repository root.
 *
 * Resolved relative to this module's file (three directory levels up); if the
 * module URL cannot be converted to a file path, the current working
 * directory is used as the root instead.
 *
 * @returns Path to the `cia-data` directory.
 */
function resolveCIADataDir(): string {
  const leaf = 'cia-data';
  try {
    const thisFile = fileURLToPath(import.meta.url);
    // From scripts/news-types/weekly-review/ → up three levels to repo root → cia-data/
    return join(dirname(thisFile), '..', '..', '..', leaf);
  } catch {
    return join(process.cwd(), leaf);
  }
}
 
/**
 * Parse a CSV file into an array of row objects keyed by header names.
 * Uses PapaParse for correct RFC 4180 handling (escaped quotes, embedded
 * commas, multi-line fields).
 *
 * Best-effort by design: a missing file is reported with a warning, a read or
 * parse exception with an error, and both yield an empty array. Row-level
 * parser errors are only logged as warnings; the rows PapaParse did produce
 * are still returned.
 *
 * @param filePath - Path of the CSV file to read (decoded as UTF-8).
 * @returns The parsed rows, or `[]` when the file is missing or unreadable.
 */
function parseCsvFile(filePath: string): Array<Record<string, string>> {
  if (!existsSync(filePath)) {
    console.warn(`CIA data file not found: ${filePath}`);
    return [];
  }
  try {
    const text = readFileSync(filePath, 'utf-8');
    const result = Papa.parse<Record<string, string>>(text, {
      header: true,
      skipEmptyLines: true,
    });
    if (result.errors.length > 0) {
      console.warn(`CSV parsing warnings for ${filePath}:`, result.errors);
    }
    return result.data;
  } catch (err) {
    console.error(`Failed to parse CSV ${filePath}:`, err);
    return [];
  }
}
 
// RIKSDAG_PARTIES is defined at the top of this file
 
/**
 * Load CIA intelligence context from real CSV files in cia-data/.
 * Sources:
 *   • cia-data/party/view_party_performance_metrics_sample.csv   – win rates, documents, rebel rate
 *   • cia-data/view_riksdagen_party_summary_sample.csv           – current seat counts
 *   • cia-data/party/distribution_coalition_alignment.csv        – inter-party alignment
 *   • cia-data/view_riksdagen_committee_decisions.csv            – committee decision outcomes
 *
 * Missing files simply contribute no rows, and missing or non-numeric cells
 * count as 0, so the function always returns a context. With no data at all
 * the built-in defaults apply (stability score 75, motion denial rate 96).
 *
 * Side effect: logs a one-line summary of the loaded context to the console.
 *
 * @returns A populated CIAContext or a minimal fallback when files are missing.
 */
export function loadCIAContext(): CIAContext {
  const ciaDir = resolveCIADataDir();

  // Missing or unparseable cells are treated as 0.
  const toFloat = (cell: string | undefined): number => parseFloat(cell ?? '0') || 0;
  const toInt = (cell: string | undefined): number => parseInt(cell ?? '0', 10) || 0;

  // ── 1. Seat counts from view_riksdagen_party_summary_sample.csv ──────────
  const seatMap = new Map<string, number>();
  const partySummaryRows = parseCsvFile(join(ciaDir, 'view_riksdagen_party_summary_sample.csv'));
  for (const row of partySummaryRows) {
    const party = row['party']?.trim();
    if (party) seatMap.set(party, toInt(row['total_active_parliament']));
  }

  // ── 2. Party performance from view_party_performance_metrics_sample.csv ──
  const partyPerformance: CIAContext['partyPerformance'] = [];
  const partyMetricsRows = parseCsvFile(join(ciaDir, 'party', 'view_party_performance_metrics_sample.csv'));
  for (const row of partyMetricsRows) {
    const id = row['party']?.trim() ?? '';
    // Only parties currently seated in the Riksdag are reported.
    if (!RIKSDAG_PARTIES.has(id)) continue;

    const avgWinRate   = toFloat(row['avg_win_rate']);
    const avgRebelRate = toFloat(row['avg_rebel_rate']);
    const docsLastYear = toInt(row['documents_last_year']);
    const ministers    = toInt(row['current_ministers']);
    const perfLevel    = row['performance_level']?.trim() ?? '';

    partyPerformance.push({
      id,
      partyName: row['party_name']?.trim() ?? id,
      metrics: {
        seats:             seatMap.get(id) ?? 0,
        // avg_win_rate is 0-100 percentage (e.g. M=86.49, S=43.40)
        successRate:       avgWinRate,
        motionsSubmitted:  docsLastYear,
        motionsPassed:     Math.round(avgWinRate * docsLastYear / 100),
        // avg_rebel_rate is a 0-1 decimal ratio (e.g. S=0.06 → 6% rebel rate)
        cohesionScore:     Math.round((1 - avgRebelRate) * 100),
      },
      trends: {
        // Parties holding ministers are reported as stable regardless of win rate.
        supportTrend:  ministers > 0 ? 'stable' : (avgWinRate < 50 ? 'declining' : 'stable'),
        activityTrend: perfLevel === 'EXCELLENT' ? 'increasing' : perfLevel === 'BELOW_AVERAGE' ? 'declining' : 'stable',
      },
    });
  }

  // ── 3. Coalition stability from distribution_coalition_alignment.csv ─────
  const coalignRows = parseCsvFile(join(ciaDir, 'party', 'distribution_coalition_alignment.csv'));

  // Government bloc: M + KD + L + SD (SD provides confidence-and-supply support)
  const GOV_PARTIES = new Set(['M', 'KD', 'L', 'SD']);
  const govSeats = partyPerformance
    .filter(p => GOV_PARTIES.has(p.id))
    .reduce((s, p) => s + p.metrics.seats, 0);
  const totalSeats = 349;
  const majorityNeeded = Math.floor(totalSeats / 2) + 1; // 175
  const majorityMargin = govSeats - majorityNeeded;

  // Average alignment among the three formal government parties (M, KD, L).
  // Both orderings of each pair are listed because the CSV may name either party first.
  const coreGovPairs = new Set(['M-KD', 'M-L', 'KD-L', 'KD-M', 'L-M', 'L-KD']);
  let alignmentSum = 0;
  let alignmentCount = 0;
  for (const row of coalignRows) {
    const pair = `${row['party1']?.trim() ?? ''}-${row['party2']?.trim() ?? ''}`;
    if (coreGovPairs.has(pair)) {
      alignmentSum += toFloat(row['alignment_rate']);
      alignmentCount++;
    }
  }
  // Default of 75 applies when no core-government pair is present in the data.
  const stabilityScore = alignmentCount > 0
    ? Math.round((alignmentSum / alignmentCount) * 100)
    : 75;
  const riskLevel = majorityMargin <= 0 ? 'high' : majorityMargin <= 2 ? 'moderate' : 'low';

  const coalitionStability: CIAContext['coalitionStability'] = {
    stabilityScore,
    riskLevel,
    // Base 20% defection probability, reduced 3% per seat of margin, minimum 5%
    defectionProbability: Math.max(5, Math.round(20 - majorityMargin * 3)),
    majorityMargin: Math.max(0, majorityMargin),
  };

  // ── 4. Voting patterns from coalition alignment (top 5 party pairs) ───────
  const votingPatterns: CIAContext['votingPatterns'] = {
    keyIssues: coalignRows.slice(0, 5).map(row => {
      const alignmentRate = toFloat(row['alignment_rate']);
      return {
        topic:                `${row['party1']?.trim() ?? ''}-${row['party2']?.trim() ?? ''} alignment`,
        coalitionAlignment:   Math.round(alignmentRate * 100),
        oppositionAlignment:  Math.round((1 - alignmentRate) * 100),
        crossPartyVotes:      toInt(row['aligned_votes']),
      };
    }),
  };

  // ── 5. Motion denial rate from committee decision outcomes ────────────────
  let overallMotionDenialRate = 96; // historical baseline from CIA data
  const decisionsRows = parseCsvFile(join(ciaDir, 'view_riksdagen_committee_decisions.csv'));
  if (decisionsRows.length > 0) {
    // Share of decisions where the committee ("utskottet") was the winner.
    const committeeWins = decisionsRows.filter(r =>
      r['winner']?.trim().toLowerCase() === 'utskottet'
    ).length;
    overallMotionDenialRate = Math.round((committeeWins / decisionsRows.length) * 100);
  }

  console.log(
    `  📊 CIA CSV context: ${partyPerformance.length} parties, ` +
    `gov seats ${govSeats}/${totalSeats} (margin ${majorityMargin}), ` +
    `stability ${stabilityScore}/100, denial rate ${overallMotionDenialRate}%`
  );

  return { partyPerformance, coalitionStability, votingPatterns, overallMotionDenialRate };
}
 
/**
 * Enrich a flat list of documents with full text via get_dokument_innehall.
 * Mutates each document in place; never throws — failures are logged and skipped.
 *
 * Documents are fetched in batches of `concurrency`, with a 300 ms pause
 * between batches. Each successful fetch is appended to `mcpCalls` and sets
 * `fullText`, `fullContent` and `contentFetched` on the document; `summary`
 * and `notis` are only filled in when the document does not already have them.
 * Documents without any id (`dok_id`, `dokumentnamn`, `id`) are skipped.
 *
 * @param client - MCP client used to fetch document details.
 * @param documents - Documents to enrich (mutated in place).
 * @param mcpCalls - Call log; one record is pushed per successful fetch.
 * @param concurrency - Documents fetched in parallel per batch. Values below 1
 *   or NaN are treated as 1 so the batch loop always advances.
 */
export async function enrichWithFullText(
  client: MCPClient,
  documents: RawDocument[],
  mcpCalls: MCPCallRecord[],
  concurrency = 3,
): Promise<void> {
  // A step of 0, a negative step or NaN would stall the loop below or skip
  // documents, so clamp to a whole batch size of at least 1.
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;

  console.log(`  📖 Enriching ${documents.length} documents with full text (concurrency ${batchSize})...`);
  let enriched = 0;

  for (let i = 0; i < documents.length; i += batchSize) {
    const batch = documents.slice(i, i + batchSize);

    await Promise.allSettled(batch.map(async (doc) => {
      const dokId = (doc as Record<string, string>).dok_id
        ?? (doc as Record<string, string>).dokumentnamn
        ?? (doc as Record<string, string>).id;
      if (!dokId) return;

      try {
        const details = await client.fetchDocumentDetails(dokId, true);
        mcpCalls.push({ tool: 'get_dokument_innehall', result: details });

        // Merge full text fields into document.
        // get_dokument_innehall returns: { text, snippet, fulltext_available, ... }
        //   details['text']    → raw Riksdag dump (metadata + embedded HTML) — use as fullContent
        //   details['snippet'] → 400-char excerpt — use as summary fallback
        // Legacy fields (fullText, html, summary, notis) are NOT returned by the
        // current MCP server but kept as fallbacks for compatibility.
        // Also: some documents return politician profile text (MP status like
        // "Tjänstgörande riksdagsledamot..." or "Avliden YYYY-MM-DD...") in their
        // notis/summary/fullText fields — discard these to prevent them from
        // appearing as article content.
        const str = (v: unknown): string => typeof v === 'string' ? v : '';
        const sanitize = (s: unknown): string => {
          const val = str(s).trim();
          return isPersonProfileText(val) ? '' : val;
        };
        const d = doc as Record<string, unknown>;
        // Primary: MCP returns 'text' (raw dump with embedded HTML from Riksdag)
        const rawText = str(details['text']).trim();
        d['fullText'] = sanitize(details['fullText'])
          || sanitize(details['summary'])
          || sanitize(details['notis'])
          || '';
        // Use raw 'text' as fullContent if it's substantial; fallback to legacy 'html'
        d['fullContent'] = rawText.length > 100
          ? rawText
          : str(details['html']);
        // Propagate summary: prefer MCP 'snippet', fall back to legacy fields
        const snippet = sanitize(details['snippet']);
        if (!d['summary']) {
          d['summary'] = snippet || sanitize(details['summary']) || '';
        }
        if (!d['notis'] && details['notis']) d['notis'] = sanitize(details['notis']);
        d['contentFetched'] = true;
        enriched++;
      } catch (err: unknown) {
        // Anything can be thrown; only Error instances carry a message.
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`  ⚠ Failed to fetch full text for ${dokId}:`, reason);
      }
    }));

    // Small delay between batches to avoid rate limiting
    if (i + batchSize < documents.length) {
      await new Promise<void>(r => setTimeout(r, 300));
    }
  }

  console.log(`  ✅ Enriched ${enriched}/${documents.length} documents with full text`);
}
 
/**
 * Attach related speeches to documents that share the same dokId.
 *
 * Each speech is indexed under every reference it carries (`dok_id`,
 * `rel_dok_id`, `intressent_id`), and a document receives, as `speeches`, all
 * speeches indexed under its own `dok_id`. Indexing under every reference,
 * rather than only the first one present, ensures a speech that also carries
 * an `intressent_id` is still found through its document ids.
 *
 * Mutates the matching documents in place; documents without matching
 * speeches are left untouched. Speech text is truncated to 300 characters.
 *
 * @param documents - Documents to annotate (mutated in place).
 * @param speeches - Raw speech records.
 */
export function attachSpeechesToDocuments(
  documents: RawDocument[],
  speeches: Array<Record<string, unknown>>,
): void {
  if (speeches.length === 0) return;

  type SpeechSummary = { talare?: string; parti?: string; text?: string; anforande_nummer?: string };
  const referenceFields = ['dok_id', 'rel_dok_id', 'intressent_id'] as const;

  // Build a loose index: reference id → speeches
  const speechIndex = new Map<string, SpeechSummary[]>();
  for (const s of speeches) {
    // Collect the distinct, non-empty references of this speech.
    const refs = new Set<string>();
    for (const field of referenceFields) {
      const raw = s[field];
      if (typeof raw !== 'string' && typeof raw !== 'number') continue;
      const ref = String(raw);
      if (ref) refs.add(ref);
    }
    if (refs.size === 0) continue;

    const rawText = s['anforande_text'];
    const summary: SpeechSummary = {
      talare: s['talare'] as string | undefined,
      parti: s['parti'] as string | undefined,
      // Only strings can be sliced; anything else is dropped instead of throwing.
      text: typeof rawText === 'string' ? rawText.slice(0, 300) : undefined,
      anforande_nummer: s['anforande_nummer'] as string | undefined,
    };

    for (const ref of refs) {
      const bucket = speechIndex.get(ref);
      // Store a copy per bucket so documents never share one summary object.
      if (bucket) {
        bucket.push({ ...summary });
      } else {
        speechIndex.set(ref, [{ ...summary }]);
      }
    }
  }

  for (const doc of documents) {
    const dokId = (doc as Record<string, string>).dok_id ?? '';
    const related = speechIndex.get(dokId);
    if (related && related.length > 0) {
      (doc as Record<string, unknown>).speeches = related;
    }
  }
}
 
/**
 * Normalize CIAContext so defectionProbability is in [0, 1].
 *
 * risk-analysis.ts multiplies it by 100, so out-of-range values can
 * explode scores. Expected input formats:
 * - (0, 1] — already a proper probability fraction; kept as-is.
 *            Note: exactly 1.0 is treated as 100% (not as 1% whole-percent).
 * - (1, ∞) — treated as a whole-percent (loadCIAContext returns min 5,
 *             e.g. 50 means 50% → normalized to 0.5); clamped to 1.
 * - Non-finite or ≤ 0 — coerced to 0 (no defection risk).
 *
 * The input is never mutated. When nothing needs to change (value already
 * normalized, or no numeric defectionProbability present) the same `ctx`
 * object is returned; otherwise a shallow copy with the new value is returned.
 *
 * @param ctx - Context to normalize.
 * @returns `ctx` itself, or a copy with a normalized defectionProbability.
 */
export function normalizedCIAContext(ctx: CIAContext): CIAContext {
  const stability = ctx.coalitionStability;
  if (!stability) return ctx;

  const defProb = stability.defectionProbability;
  if (typeof defProb !== 'number') return ctx;

  let normalized: number;
  if (!Number.isFinite(defProb) || defProb <= 0) {
    // Non-finite or non-positive: no defection risk.
    normalized = 0;
  } else if (defProb <= 1) {
    // Already a fraction in (0, 1]: keep as-is (1.0 = 100% probability).
    normalized = defProb;
  } else {
    // Whole-percent value (e.g. loadCIAContext min 5): convert to fraction and clamp.
    normalized = Math.min(1, defProb / 100);
  }

  // Unchanged value: hand back the original object so callers can compare by identity.
  if (normalized === defProb) return ctx;
  return {
    ...ctx,
    coalitionStability: {
      ...stability,
      defectionProbability: normalized,
    },
  };
}