weekly-review data-loader.ts

85.93% Statements 110/128
68.65% Branches 92/134
93.75% Functions 15/16
87.82% Lines 101/115
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8x
 
 
31x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21x
21x
 
21x
 
 
 
 
 
 
 
 
 
 
 
 
84x
 
 
 
84x
84x
84x
 
 
 
84x
 
 
84x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21x
 
 
21x
21x
21x
273x
273x
 
 
 
21x
21x
21x
840x
840x
 
168x
840x
840x
840x
840x
 
840x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21x
 
 
21x
21x
168x
84x
21x
21x
21x
 
 
21x
21x
21x
756x
756x
63x
63x
 
 
21x
 
 
21x
 
21x
 
 
 
 
 
 
 
 
21x
105x
 
 
 
 
 
 
 
 
21x
21x
21x
21x
1050000x
 
21x
 
 
21x
 
 
 
 
 
21x
 
 
 
 
 
 
 
 
 
 
 
 
21x
21x
 
21x
32x
 
32x
74x
 
 
74x
 
74x
74x
74x
 
 
 
 
 
 
 
 
 
 
 
440x
74x
292x
292x
 
74x
 
74x
74x
 
 
 
 
74x
 
 
 
74x
74x
50x
 
74x
74x
74x
 
 
 
 
 
 
32x
11x
 
 
 
32x
 
 
 
 
 
 
 
 
 
21x
 
1x
1x
2x
2x
 
 
 
 
 
 
 
 
1x
3x
3x
3x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21x
21x
 
 
21x
 
5x
16x
 
6x
 
 
10x
 
 
21x
15x
 
 
 
 
 
 
 
 
  /**
 * @module news-types/weekly-review/data-loader
 * @description Data loading and processing utilities for weekly-review articles.
 * Handles CIA context loading, CSV parsing, document full-text enrichment,
 * and speech attachment.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { readFileSync, existsSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import Papa from 'papaparse';
import { MCPClient } from '../../mcp-client.js';
import {
  isPersonProfileText,
  type RawDocument,
  type CIAContext,
} from '../../data-transformers.js';
import type { MCPCallRecord } from '../../types/article.js';
 
/**
 * Party codes currently seated in the Riksdag (2022 election onwards).
 * Used as an allow-list when filtering party rows.
 */
const RIKSDAG_PARTIES: Set<string> = new Set<string>([
  'M',
  'SD',
  'KD',
  'L',
  'C',
  'S',
  'V',
  'MP',
]);
 
/**
 * Format a date as the calendar-date part of its ISO-8601 timestamp
 * (everything before the `T` separator, e.g. `2024-03-05`).
 *
 * @param date - Date to format; defaults to the current time.
 * @returns The date portion of `date.toISOString()`.
 * @throws RangeError when `date` is an invalid Date (raised by `toISOString`).
 */
export function formatDateForSlug(date: Date = new Date()): string {
  const isoTimestamp = date.toISOString();
  const separatorAt = isoTimestamp.indexOf('T');
  // No separator means the whole string is the date part.
  return separatorAt < 0 ? isoTimestamp : isoTimestamp.slice(0, separatorAt);
}
 
/**
 * Locate the repository's `data/` directory.
 *
 * Primary strategy: start from this module's own directory and climb three
 * levels (scripts/news-types/weekly-review/ → repo root). If the module URL
 * cannot be converted to a path, fall back to `<cwd>/data`.
 *
 * @returns Path to the data directory (not checked for existence).
 */
export function repoDataDir(): string {
  let segments: string[];
  try {
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    segments = [moduleDir, '..', '..', '..', 'data'];
  } catch {
    // import.meta.url unusable in this runtime: resolve relative to the working directory.
    segments = [process.cwd(), 'data'];
  }
  return join(...segments);
}
 
/**
 * Locate the repository's `cia-data/` directory.
 *
 * Climbs three levels from this module's directory
 * (scripts/news-types/weekly-review/ → repo root); falls back to
 * `<cwd>/cia-data` when the module URL cannot be converted to a path.
 *
 * @returns Path to the cia-data directory (not checked for existence).
 */
function resolveCIADataDir(): string {
  let segments: string[];
  try {
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    segments = [moduleDir, '..', '..', '..', 'cia-data'];
  } catch {
    // import.meta.url unusable in this runtime: resolve relative to the working directory.
    segments = [process.cwd(), 'cia-data'];
  }
  return join(...segments);
}
 
/**
 * Parse a CSV file into an array of row objects keyed by header names.
 * Uses PapaParse for correct RFC 4180 handling (escaped quotes, embedded
 * commas, multi-line fields).
 *
 * Never throws:
 * - a missing file logs a warning and yields `[]`;
 * - an exception while reading or parsing logs an error and yields `[]`;
 * - row-level problems reported by PapaParse are logged as warnings and the
 *   rows it did produce are still returned.
 *
 * @param filePath - Path of the CSV file to read (UTF-8).
 * @returns Parsed rows; empty when the file is missing or unreadable.
 */
function parseCsvFile(filePath: string): Array<Record<string, string>> {
  if (!existsSync(filePath)) {
    console.warn(`CIA data file not found: ${filePath}`);
    return [];
  }
  try {
    const text = readFileSync(filePath, 'utf-8');
    const result = Papa.parse<Record<string, string>>(text, {
      header: true,
      skipEmptyLines: true,
    });
    // PapaParse reports malformed rows in `errors` instead of throwing.
    if (result.errors.length > 0) {
      console.warn(`CSV parsing warnings for ${filePath}:`, result.errors);
    }
    return result.data;
  } catch (err) {
    console.error(`Failed to parse CSV ${filePath}:`, err);
    return [];
  }
}
 
// RIKSDAG_PARTIES is defined at the top of this file
 
/**
 * Load CIA intelligence context from real CSV files in cia-data/.
 * Sources:
 *   • cia-data/party/view_party_performance_metrics_sample.csv   – win rates, documents, rebel rate
 *   • cia-data/view_riksdagen_party_summary_sample.csv           – current seat counts
 *   • cia-data/party/distribution_coalition_alignment.csv        – inter-party alignment
 *   • cia-data/view_riksdagen_committee_decisions.csv            – committee decision outcomes
 *
 * Missing or unreadable files contribute no rows, so the result degrades to a
 * minimal fallback: no party entries, stability score 75, and the baseline
 * motion denial rate of 96.
 *
 * Logs a one-line summary of the loaded context to the console.
 *
 * @returns A CIAContext with party performance, coalition stability,
 *          voting patterns and the overall motion denial rate (percent).
 *          `coalitionStability.defectionProbability` is a whole percent in [5, 100].
 */
export function loadCIAContext(): CIAContext {
  const ciaDir = resolveCIADataDir();

  // ── 1. Seat counts from view_riksdagen_party_summary_sample.csv ──────────
  const seatMap = new Map<string, number>();
  const partySummaryRows = parseCsvFile(join(ciaDir, 'view_riksdagen_party_summary_sample.csv'));
  for (const row of partySummaryRows) {
    const party = row['party']?.trim();
    if (party) seatMap.set(party, parseInt(row['total_active_parliament'] ?? '0', 10) || 0);
  }

  // ── 2. Party performance from view_party_performance_metrics_sample.csv ──
  const partyPerformance: CIAContext['partyPerformance'] = [];
  const partyMetricsRows = parseCsvFile(join(ciaDir, 'party', 'view_party_performance_metrics_sample.csv'));
  for (const row of partyMetricsRows) {
    const id = row['party']?.trim() ?? '';
    // Only parties currently in the Riksdag are reported.
    if (!RIKSDAG_PARTIES.has(id)) continue;

    // Unparseable numeric cells fall back to 0.
    const avgWinRate   = parseFloat(row['avg_win_rate']   ?? '0') || 0;
    const avgRebelRate = parseFloat(row['avg_rebel_rate'] ?? '0') || 0;
    const docsLastYear = parseInt(row['documents_last_year'] ?? '0', 10) || 0;
    const ministers    = parseInt(row['current_ministers']   ?? '0', 10) || 0;
    const perfLevel    = row['performance_level']?.trim() ?? '';

    partyPerformance.push({
      id,
      partyName: row['party_name']?.trim() ?? id,
      metrics: {
        seats:             seatMap.get(id) ?? 0,
        // avg_win_rate is 0-100 percentage (e.g. M=86.49, S=43.40)
        successRate:       avgWinRate,
        motionsSubmitted:  docsLastYear,
        motionsPassed:     Math.round(avgWinRate * docsLastYear / 100),
        // avg_rebel_rate is a 0-1 decimal ratio (e.g. S=0.06 → 6% rebel rate)
        cohesionScore:     Math.round((1 - avgRebelRate) * 100),
      },
      trends: {
        // Parties holding ministerial posts are always reported as stable.
        supportTrend:  ministers > 0 ? 'stable' : (avgWinRate < 50 ? 'declining' : 'stable'),
        activityTrend: perfLevel === 'EXCELLENT' ? 'increasing' : perfLevel === 'BELOW_AVERAGE' ? 'declining' : 'stable',
      },
    });
  }

  // ── 3. Coalition stability from distribution_coalition_alignment.csv ─────
  const coalignRows = parseCsvFile(join(ciaDir, 'party', 'distribution_coalition_alignment.csv'));

  // Government bloc: M + KD + L + SD (SD provides confidence-and-supply support)
  const GOV_PARTIES = new Set(['M', 'KD', 'L', 'SD']);
  const govSeats = partyPerformance
    .filter(p => GOV_PARTIES.has(p.id))
    .reduce((s, p) => s + p.metrics.seats, 0);
  const totalSeats = 349;
  const majorityNeeded = Math.floor(totalSeats / 2) + 1; // 175
  const majorityMargin = govSeats - majorityNeeded;

  // Average alignment among the three formal government parties (M, KD, L);
  // both orderings of each pair are accepted.
  const coreGovPairs = new Set(['M-KD', 'M-L', 'KD-L', 'KD-M', 'L-M', 'L-KD']);
  let alignmentSum = 0; let alignmentCount = 0;
  for (const row of coalignRows) {
    const pair = `${row['party1']?.trim() ?? ''}-${row['party2']?.trim() ?? ''}`;
    if (coreGovPairs.has(pair)) {
      alignmentSum += parseFloat(row['alignment_rate'] ?? '0') || 0;
      alignmentCount++;
    }
  }
  // Default to 75 when no core-government pair is present in the data.
  const stabilityScore = alignmentCount > 0
    ? Math.round((alignmentSum / alignmentCount) * 100)
    : 75;
  const riskLevel = majorityMargin <= 0 ? 'high' : majorityMargin <= 2 ? 'moderate' : 'low';

  const coalitionStability: CIAContext['coalitionStability'] = {
    stabilityScore,
    riskLevel,
    // Base 20% defection probability, reduced 3% per seat of margin, minimum 5%.
    // Capped at 100%: a large negative margin (e.g. seat data missing) would
    // otherwise yield a "percentage" far above 100.
    defectionProbability: Math.min(100, Math.max(5, Math.round(20 - majorityMargin * 3))),
    majorityMargin: Math.max(0, majorityMargin),
  };

  // ── 4. Voting patterns from coalition alignment (top 5 party pairs) ───────
  const votingPatterns: CIAContext['votingPatterns'] = {
    keyIssues: coalignRows.slice(0, 5).map(row => {
      const alignmentRate = parseFloat(row['alignment_rate'] ?? '0') || 0;
      return {
        topic:                `${row['party1']?.trim() ?? ''}-${row['party2']?.trim() ?? ''} alignment`,
        coalitionAlignment:   Math.round(alignmentRate * 100),
        oppositionAlignment:  Math.round((1 - alignmentRate) * 100),
        crossPartyVotes:      parseInt(row['aligned_votes'] ?? '0', 10) || 0,
      };
    }),
  };

  // ── 5. Motion denial rate from committee decision outcomes ────────────────
  let overallMotionDenialRate = 96; // historical baseline from CIA data
  const decisionsRows = parseCsvFile(join(ciaDir, 'view_riksdagen_committee_decisions.csv'));
  if (decisionsRows.length > 0) {
    // Share of decisions whose winner is the committee ("utskottet").
    const committeeWins = decisionsRows.filter(r =>
      r['winner']?.trim().toLowerCase() === 'utskottet'
    ).length;
    overallMotionDenialRate = Math.round((committeeWins / decisionsRows.length) * 100);
  }

  console.log(
    `  📊 CIA CSV context: ${partyPerformance.length} parties, ` +
    `gov seats ${govSeats}/${totalSeats} (margin ${majorityMargin}), ` +
    `stability ${stabilityScore}/100, denial rate ${overallMotionDenialRate}%`
  );

  return { partyPerformance, coalitionStability, votingPatterns, overallMotionDenialRate };
}
 
/**
 * Enrich a flat list of documents with full text via get_dokument_innehall.
 * Mutates each document in place; never throws — failures are logged and skipped.
 *
 * Documents are fetched in batches of `concurrency`, with a 300 ms pause
 * between batches. For every successful fetch the raw response is appended to
 * `mcpCalls` and the document receives `fullText`, `fullContent`,
 * `contentFetched` and, when not already set, `summary` / `notis`.
 * Documents without `dok_id`, `dokumentnamn` or `id` are skipped.
 *
 * @param client - MCP client used to fetch document details.
 * @param documents - Documents to enrich (mutated in place).
 * @param mcpCalls - Call log; one record is pushed per successful fetch.
 * @param concurrency - Batch size. Values below 1 (or NaN) fall back to 1;
 *                      fractional values are rounded down.
 */
export async function enrichWithFullText(
  client: MCPClient,
  documents: RawDocument[],
  mcpCalls: MCPCallRecord[],
  concurrency = 3,
): Promise<void> {
  // A step of 0 or less would never advance the batch loop, so fall back to
  // one document at a time for any value below 1 (NaN included).
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;
  console.log(`  📖 Enriching ${documents.length} documents with full text (concurrency ${batchSize})...`);
  let enriched = 0;

  // Response fields are untyped: anything that is not a string counts as empty.
  const str = (v: unknown): string => typeof v === 'string' ? v : '';
  // Some documents return politician profile text (MP status like
  // "Tjänstgörande riksdagsledamot..." or "Avliden YYYY-MM-DD...") in their
  // notis/summary/fullText fields — discard these to prevent them from
  // appearing as article content.
  const sanitize = (s: unknown): string => {
    const val = str(s).trim();
    return isPersonProfileText(val) ? '' : val;
  };

  for (let i = 0; i < documents.length; i += batchSize) {
    const batch = documents.slice(i, i + batchSize);

    await Promise.allSettled(batch.map(async (doc) => {
      const dokId = (doc as Record<string, string>).dok_id
        ?? (doc as Record<string, string>).dokumentnamn
        ?? (doc as Record<string, string>).id;
      if (!dokId) return;

      try {
        const details = await client.fetchDocumentDetails(dokId, true);
        mcpCalls.push({ tool: 'get_dokument_innehall', result: details });

        // Merge full text fields into document.
        // get_dokument_innehall returns: { text, snippet, fulltext_available, ... }
        //   details['text']    → raw Riksdag dump (metadata + embedded HTML) — use as fullContent
        //   details['snippet'] → 400-char excerpt — use as summary fallback
        // Legacy fields (fullText, html, summary, notis) are NOT returned by the
        // current MCP server but kept as fallbacks for compatibility.
        const d = doc as Record<string, unknown>;
        // Primary: MCP returns 'text' (raw dump with embedded HTML from Riksdag)
        const rawText = str(details['text']).trim();
        d['fullText'] = sanitize(details['fullText'])
          || sanitize(details['summary'])
          || sanitize(details['notis'])
          || '';
        // Use raw 'text' as fullContent if it's substantial; fallback to legacy 'html'
        d['fullContent'] = rawText.length > 100
          ? rawText
          : str(details['html']);
        // Propagate summary: prefer MCP 'snippet', fall back to legacy fields
        const snippet = sanitize(details['snippet']);
        if (!d['summary']) {
          d['summary'] = snippet || sanitize(details['summary']) || '';
        }
        if (!d['notis'] && details['notis']) d['notis'] = sanitize(details['notis']);
        d['contentFetched'] = true;
        enriched++;
      } catch (err: unknown) {
        // Rejections are not guaranteed to be Error instances.
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`  ⚠ Failed to fetch full text for ${dokId}:`, reason);
      }
    }));

    // Small delay between batches to avoid rate limiting
    if (i + batchSize < documents.length) {
      await new Promise<void>(r => setTimeout(r, 300));
    }
  }

  console.log(`  ✅ Enriched ${enriched}/${documents.length} documents with full text`);
}
 
/**
 * Attach related speeches to documents that share the same dokId.
 *
 * Each speech is indexed under every identifier it carries (`rel_dok_id`,
 * `dok_id`, `intressent_id`), and a document receives a `speeches` array when
 * its `dok_id` equals any of them. Speeches without any identifier are ignored.
 * Documents are mutated in place; documents with no match are left untouched.
 *
 * Each attached entry holds `talare`, `parti`, `anforande_nummer` and `text`
 * (the first 300 characters of `anforande_text`).
 *
 * @param documents - Documents to annotate (mutated in place).
 * @param speeches - Raw speech records.
 */
export function attachSpeechesToDocuments(
  documents: RawDocument[],
  speeches: Array<Record<string, unknown>>,
): void {
  if (speeches.length === 0) return;

  type SpeechSummary = { talare?: string; parti?: string; text?: string; anforande_nummer?: string };

  // Build a loose index: identifier → speeches.
  // NOTE(review): intressent_id looks like a speaker id rather than a document
  // id; it is kept as a key only so that previously matching data still
  // matches — confirm whether it can be dropped.
  const idFields = ['rel_dok_id', 'dok_id', 'intressent_id'] as const;
  const speechIndex = new Map<string, SpeechSummary[]>();

  for (const s of speeches) {
    // Collect distinct, non-empty identifiers so a speech is never added twice
    // to the same bucket (e.g. when dok_id and rel_dok_id are equal).
    const keys = new Set<string>();
    for (const field of idFields) {
      const raw = s[field];
      if (raw === null || raw === undefined) continue;
      const key = String(raw);
      if (key) keys.add(key);
    }
    if (keys.size === 0) continue;

    const speechText = s['anforande_text'];
    const summary: SpeechSummary = {
      talare: s['talare'] as string | undefined,
      parti: s['parti'] as string | undefined,
      // Only strings can be truncated; other payloads are treated as absent.
      text: typeof speechText === 'string' ? speechText.slice(0, 300) : undefined,
      anforande_nummer: s['anforande_nummer'] as string | undefined,
    };

    for (const key of keys) {
      const bucket = speechIndex.get(key);
      if (bucket) {
        bucket.push(summary);
      } else {
        speechIndex.set(key, [summary]);
      }
    }
  }

  for (const doc of documents) {
    const dokId = (doc as Record<string, string>).dok_id ?? '';
    const related = speechIndex.get(dokId);
    if (related && related.length > 0) {
      (doc as Record<string, unknown>).speeches = related;
    }
  }
}
 
/**
 * Normalize CIAContext so defectionProbability is in [0, 1].
 *
 * risk-analysis.ts multiplies it by 100, so out-of-range values can
 * explode scores. Expected input formats:
 * - (0, 1] — already a proper probability fraction; kept as-is.
 *            Note: exactly 1.0 is treated as 100% (not as 1% whole-percent).
 * - (1, ∞) — treated as a whole-percent (loadCIAContext returns min 5,
 *             e.g. 50 means 50% → normalized to 0.5); clamped to 1.
 * - Non-finite or ≤ 0 — coerced to 0 (no defection risk).
 *
 * The input is never mutated.
 *
 * @param ctx - Context to normalize.
 * @returns `ctx` itself when no change is needed (including when
 *          `coalitionStability` or a numeric `defectionProbability` is absent);
 *          otherwise a shallow copy with the normalized probability.
 */
export function normalizedCIAContext(ctx: CIAContext): CIAContext {
  const stability = ctx.coalitionStability;
  if (!stability) return ctx;

  const defProb = stability.defectionProbability;
  if (typeof defProb !== 'number') return ctx;

  let normalized: number;
  if (!Number.isFinite(defProb) || defProb <= 0) {
    // Non-finite or non-positive: no defection risk.
    normalized = 0;
  } else if (defProb <= 1) {
    // Already a fraction in (0, 1]: keep as-is (1.0 = 100% probability).
    normalized = defProb;
  } else {
    // Whole-percent value (e.g. loadCIAContext min 5): convert to fraction and clamp.
    normalized = Math.min(1, defProb / 100);
  }

  // Unchanged value: hand back the original object so callers can compare by identity.
  if (normalized === defProb) return ctx;
  return {
    ...ctx,
    coalitionStability: {
      ...stability,
      defectionProbability: normalized,
    },
  };
}