data-transformers helpers.ts

2.32% Statements 4/172
7.51% Branches 16/213
4.54% Functions 1/22
1.89% Lines 3/158
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80x
10x
 
10x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  /**
 * @module data-transformers/helpers
 * @description Low-level utility functions for the data transformation
 * pipeline: URL sanitisation, Swedish-language span generation, date
 * formatting, text cleaning, and document metadata helpers.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { escapeHtml } from '../html-utils.js';
import type { Language } from '../types/language.js';
import type { ContentLabelSet, CommitteeName } from '../types/content.js';
import { LOCALE_MAP, COMMITTEE_NAMES, CONTENT_LABELS } from './constants.js';
import type { RawCalendarEvent, RawDocument, CIAContext } from './types.js';
import { cleanSummaryForDisplay } from './text-cleaner.js';
 
/**
 * Sanitize a URL for safe use in href attributes.
 *
 * Only `http:`, `https:` and scheme-less (relative) URLs are accepted; every
 * other scheme — `javascript:`, `data:`, `vbscript:`, custom protocol
 * handlers, … — is rejected.
 *
 * ASCII control characters are removed before the scheme is inspected.
 * Browsers discard tab/CR/LF (and leading control characters) while parsing a
 * URL, so `java\tscript:alert(1)` would otherwise slip past the scheme check
 * and still execute when clicked.
 *
 * @param url - Candidate URL; may be missing or not a string at runtime.
 * @returns `'#'` for missing, non-string or disallowed-scheme input; otherwise
 *          the cleaned URL with `&`, `"`, `'`, `<` and `>` HTML-escaped.
 */
export function sanitizeUrl(url: string | undefined | null): string {
  if (!url || typeof url !== 'string') return '#';
  // Drop C0 controls + DEL first so obfuscated schemes are seen as the browser sees them.
  const trimmed = url.replace(/[\u0000-\u001F\u007F]/g, '').trim();
  // Anything that parses as "<scheme>:" (RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ))
  // must be http or https. This also covers javascript:, data: and vbscript:.
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(trimmed);
  if (hasScheme && !/^https?:/i.test(trimmed)) return '#';
  // Escape HTML attribute characters
  return trimmed
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#x27;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}
 
/**
 * Wrap already-escaped text in a span marked as Swedish.
 *
 * Every span carries `lang="sv"` (accessibility) together with
 * `data-translate="true"`. According to the pipeline design, the marker lets
 * `translateSwedishContent()` (in `translation-dictionary.ts`) find each
 * Swedish phrase, translate it for non-Swedish articles where the dictionary
 * has a match, and strip the marker before the article is written to disk.
 *
 * ## Opt-in strict mode
 *
 * With the environment variable `SVSPAN_STRICT=1`, the function throws for any
 * target language other than `sv` or `en`. This enforces the rule from
 * `analysis/agentic-workflow-quality-plan §P0-1` that untranslated Swedish must
 * not be emitted into target-language articles; translation has to happen
 * upstream. Strict mode is opt-in so existing callers and tests are unaffected.
 *
 * @param escapedText - Already HTML-escaped text content
 * @param _lang       - Target article language; only consulted in strict mode
 *                      and otherwise kept for backward compatibility
 * @returns The `<span data-translate="true" lang="sv">…</span>` markup
 * @throws Error in strict mode when `_lang` is neither `sv` nor `en`
 */
export function svSpan(escapedText: string, _lang: Language | string): string {
  const strictMode = process.env.SVSPAN_STRICT === '1';
  if (strictMode) {
    const target = String(_lang);
    const allowedInStrictMode = target === 'sv' || target === 'en';
    if (!allowedInStrictMode) {
      const message = [
        `svSpan() called for lang=${target} — AI translation required upstream.`,
        'Target-language articles must have titles and summaries translated by',
        'the AI (see the aggregation-workflow prompts and news-translate.md)',
        'before reaching article generators; see',
        'analysis/agentic-workflow-quality-plan §P0-1/P0-2.',
      ].join(' ');
      throw new Error(message);
    }
  }
  // Outside strict mode the language argument plays no role: the span is
  // always tagged as Swedish.
  return '<span data-translate="true" lang="sv">' + escapedText + '</span>';
}
 
/**
 * Look up a content label for a language, falling back to the English label
 * when the language has no entry for the key (or is not present in
 * `CONTENT_LABELS` at all).
 *
 * @param lang - Language code used to index `CONTENT_LABELS`
 * @param key  - Label key within the language's label set
 * @returns The localized label value, or the English value for the same key
 */
export function L(lang: Language | string, key: string): ContentLabelSet[keyof ContentLabelSet] {
  const labelKey = key as keyof ContentLabelSet;
  const localized = CONTENT_LABELS[lang as Language]?.[labelKey];
  return localized !== undefined ? localized : CONTENT_LABELS.en[labelKey];
}
 
/**
 * Tell whether a date falls on the current calendar day.
 *
 * The comparison uses the local-time year, month and day-of-month of both the
 * given date and "now".
 *
 * @param date - Date to test
 * @returns `true` when `date` is on today's local calendar day
 */
export function isTodayDate(date: Date): boolean {
  const now = new Date();
  const sameDayOfMonth = date.getDate() === now.getDate();
  const sameMonth = date.getMonth() === now.getMonth();
  const sameYear = date.getFullYear() === now.getFullYear();
  return sameDayOfMonth && sameMonth && sameYear;
}
 
/**
 * Render the full weekday name (Monday, Tuesday, …) of a date via `Intl`.
 *
 * The locale comes from `LOCALE_MAP[lang]`, or `lang` itself when the map has
 * no entry. If formatting with that locale throws, `en-GB` is used instead.
 *
 * @param date - Date whose weekday is wanted
 * @param lang - Language code (defaults to `'en'`)
 * @returns The long weekday name
 */
export function formatDayName(date: Date, lang: Language | string = 'en'): string {
  const weekdayOnly = { weekday: 'long' } as const;
  const preferredLocale = LOCALE_MAP[lang] || lang;
  let dayName: string;
  try {
    dayName = new Intl.DateTimeFormat(preferredLocale, weekdayOnly).format(date);
  } catch {
    dayName = new Intl.DateTimeFormat('en-GB', weekdayOnly).format(date);
  }
  return dayName;
}
 
/**
 * Build a day label of the form "<month day> - <weekday>"
 * (e.g. "February 10 - Monday") via `Intl`.
 *
 * The locale comes from `LOCALE_MAP[lang]`, or `lang` itself when the map has
 * no entry. If anything in the localized path throws, the whole label is
 * rebuilt in English (`en-GB` month/day, English weekday).
 *
 * @param date - Date to describe
 * @param lang - Language code (defaults to `'en'`)
 * @returns The combined month/day and weekday label
 */
export function formatDayLabel(date: Date, lang: Language | string = 'en'): string {
  const monthAndDay = { month: 'long', day: 'numeric' } as const;
  const preferredLocale = LOCALE_MAP[lang] || lang;
  let weekday: string;
  let monthDay: string;
  try {
    weekday = formatDayName(date, lang);
    monthDay = new Intl.DateTimeFormat(preferredLocale, monthAndDay).format(date);
  } catch {
    weekday = formatDayName(date, 'en');
    monthDay = new Intl.DateTimeFormat('en-GB', monthAndDay).format(date);
  }
  return [monthDay, weekday].join(' - ');
}
 
/**
 * Determine if a calendar event is high priority, based on its title
 * (`title`, falling back to `rubrik`).
 *
 * An event is high priority when its lower-cased title
 *   - contains one of the phrases "prime minister", "statsminister", "vote",
 *     "votering" or "summit" (substring match, so inflected forms such as
 *     "statsministerns" or "voteringar" still count), or
 *   - contains the abbreviation "pm" or "eu" as a standalone token, i.e. not
 *     directly surrounded by letters or digits ("EU-nämnden", "EU:s" and
 *     "PM questions" match).
 *
 * The abbreviations are matched as tokens rather than substrings because a
 * plain substring test flags unrelated words such as "equipment", "shipment",
 * "museum" or "neutral".
 *
 * @param event - Raw calendar event
 * @returns `true` when the title signals a high-priority event
 */
export function isHighPriority(event: RawCalendarEvent): boolean {
  const title = (event.title || event.rubrik || '').toLowerCase();
  const phrases = ['prime minister', 'statsminister', 'vote', 'votering', 'summit'];
  if (phrases.some((phrase) => title.includes(phrase))) return true;
  // Unicode-aware boundaries so Swedish letters (å, ä, ö) count as word characters.
  return /(?<![\p{L}\p{N}])(?:pm|eu)(?![\p{L}\p{N}])/u.test(title);
}
 
/**
 * Extract the author and party from raw Swedish motion text.
 *
 * Looks for the pattern "av <author> (<PARTY>)", e.g.
 * "av Fredrik Olovsson m.fl. (S)", where the party is one to five upper-case
 * letters (including Å, Ä, Ö). Whitespace runs inside the author are collapsed
 * to single spaces.
 *
 * @param text - Raw motion text
 * @returns The author and party, or `null` when the pattern is not found
 */
export function parseMotionAuthorParty(text: string): { author: string; party: string } | null {
  const match = /\bav\s+([^(]+?)\s+\(([A-ZÅÄÖ]{1,5})\)/u.exec(text);
  if (match === null) return null;
  const [, rawAuthor, party] = match;
  const author = rawAuthor.trim().replace(/\s+/g, ' ');
  return { author, party };
}
 
/**
 * Reduce raw Swedish motion "notis" text to a readable subject line.
 *
 * Three clean-up steps run in order, each followed by a trim:
 *   1. cut everything from "Förslag till riksdagsbeslut" onwards,
 *   2. drop a leading "Motion till riksdagen <id> av <author> (<PARTY>)" prefix,
 *   3. drop a leading "med anledning av prop. <id>" prefix.
 *
 * When more than 20 characters survive, the first 300 of them are returned;
 * otherwise the first 200 characters of the untouched input are returned.
 *
 * @param raw - Raw motion text
 * @returns The cleaned excerpt, or a raw excerpt when cleaning left too little
 */
export function cleanMotionText(raw: string): string {
  const minCleanedLength = 20;
  const maxCleanedLength = 300;
  const maxRawFallbackLength = 200;

  const boilerplate: readonly RegExp[] = [
    // Formal ballot section and everything after it
    /Förslag till riksdagsbeslut[\s\S]*/i,
    // Leading "Motion till riksdagen YYYY/YY:NNN av AUTHOR (PARTY) "
    /^Motion till riksdagen\s+\S+\s+av\s+[^(]+\([A-ZÅÄÖ]{1,5}\)\s*/i,
    // Leading "med anledning av prop. YYYY/YY:NNN "
    /^med anledning av prop\.\s+\S+\s*/i,
  ];

  const cleaned = boilerplate.reduce(
    (remaining, pattern) => remaining.replace(pattern, '').trim(),
    raw
  );

  if (cleaned.length > minCleanedLength) {
    return cleaned.slice(0, maxCleanedLength);
  }
  return raw.slice(0, maxRawFallbackLength);
}
 
/**
 * Detect text that is an excerpt from an MP/politician profile page rather than
 * document content.
 *
 * After leading whitespace is removed, the text counts as a profile excerpt
 * when it starts with one of the Swedish status phrases below, or contains one
 * of the profile markers anywhere:
 *   - "Tjänstgörande riksdagsledamot …"   (active MP)
 *   - "Tidigare riksdagsledamot …"        (former MP)
 *   - "Avgången riksdagsledamot …"        (resigned MP)
 *   - "Tillgänglig ersättare …"           (substitute MP)
 *   - "Tjänstgörande ersättare …"         (active substitute)
 *   - "Tidigare ersättare …"              (former substitute)
 *   - "Tjänstgörande statsrådsersättare"  (acting minister substitute)
 *   - "Tidigare statsråd …"               (former minister)
 *   - "Tidigare statsminister …"          (former PM)
 *   - "Inga uppdrag"                      (no assignments)
 *   - "Avgången …"                        (resigned)
 *   - "Avliden YYYY-MM-DD …"              (deceased MP)
 *   - an `@riksdagen.se` e-mail address   (anywhere in the text)
 *   - "Aktuella uppdrag Riksdagsledamot"  (profile header, anywhere)
 *
 * Such text originates from the riksdag API's person/ledamot profile pages and
 * must never appear in article document-entry content.
 *
 * @param text - Candidate summary/notis text
 * @returns `true` when the text looks like a person profile excerpt
 */
export function isPersonProfileText(text: string): boolean {
  if (!text) return false;
  const candidate = text.trimStart();

  // Any single match is sufficient; order only affects how early we stop.
  const profileMarkers: readonly RegExp[] = [
    /^Tjänstgörande riksdagsledamot/u,
    /^Tidigare riksdagsledamot/u,
    /^Avgången riksdagsledamot/u,
    /^Tillgänglig ersättare/u,
    /^Tjänstgörande ersättare/u,
    /^Tidigare ersättare/u,
    /^Tjänstgörande statsrådsersättare/u,
    /^Tidigare statsråd/u,
    /^Tidigare statsminister/u,
    /^Inga uppdrag/u,
    /^Avgången/u,
    // Deceased: "Avliden YYYY-MM-DD ..."
    /^Avliden\s+\d{4}-\d{2}-\d{2}/u,
    // A riksdag e-mail address only occurs on profile pages
    /[a-zA-Z0-9._%+-]+@riksdagen\.se/u,
    // Profile header: "Aktuella uppdrag Riksdagsledamot"
    /Aktuella uppdrag\s+Riksdagsledamot/u,
  ];

  return profileMarkers.some((marker) => marker.test(candidate));
}
 
/**
 * Ministry-specific framing sentences for propositions, keyed by the exact
 * ministry name as it appears in the document's organ field.
 *
 * Held in a `Map` (built once at module load) so that lookups never resolve
 * inherited object properties such as `constructor` or `toString`.
 */
const MINISTRY_PROPOSITION_SUMMARIES: ReadonlyMap<string, { readonly sv: string; readonly en: string }> =
  new Map<string, { readonly sv: string; readonly en: string }>([
    ['Justitiedepartementet', { sv: 'Justitiedepartementets förslag rör rättsliga förändringar.', en: 'This Justice Ministry proposal amends existing legal framework.' }],
    ['Finansdepartementet', { sv: 'Finansdepartementets förslag påverkar statsbudget eller finansreglering.', en: 'This Finance Ministry proposal has fiscal or budgetary implications.' }],
    ['Försvarsdepartementet', { sv: 'Försvarsdepartementets förslag rör försvars- eller säkerhetspolitik.', en: 'This Defence Ministry proposal concerns national security or defence posture.' }],
    ['Utbildningsdepartementet', { sv: 'Utbildningsdepartementets förslag berör skolsystem eller forskning.', en: 'This Education Ministry proposal affects schools, universities or research funding.' }],
    ['Socialdepartementet', { sv: 'Socialdepartementets förslag rör välfärd eller socialpolitik.', en: 'This Social Affairs Ministry proposal affects welfare entitlements or social services.' }],
    ['Miljödepartementet', { sv: 'Klimat- och miljödepartementets förslag rör klimat- eller miljöpolitik.', en: 'This Climate and Environment Ministry proposal targets emissions or ecological regulation.' }],
    ['Klimat- och miljödepartementet', { sv: 'Klimat- och miljödepartementets förslag rör klimat- eller miljöpolitik.', en: 'This Climate and Environment Ministry proposal targets emissions or ecological regulation.' }],
    ['Klimat- och näringslivsdepartementet', { sv: 'Klimat- och näringslivsdepartementets förslag rör klimat- och näringspolitik.', en: 'This Climate and Enterprise Ministry proposal addresses both environmental and industrial policy.' }],
    ['Utrikesdepartementet', { sv: 'Utrikesdepartementets förslag rör utrikespolitik eller internationella relationer.', en: 'This Foreign Affairs Ministry proposal concerns international relations or Sweden’s global obligations.' }],
    ['Infrastrukturdepartementet', { sv: 'Infrastrukturdepartementets förslag rör transport eller samhällsinfrastruktur.', en: 'This Infrastructure Ministry proposal affects transport networks or public utilities.' }],
  ]);

/**
 * Build a descriptive proposition summary from the ministry organ.
 *
 * @param organ - Ministry name (exact, case-sensitive match against the table)
 * @param lang  - Target language; `'sv'` selects the Swedish sentence, every
 *                other value selects the English one
 * @returns A ministry-specific framing sentence, or `''` when the organ is not
 *          a known ministry
 */
export function propSummaryFromOrgan(organ: string, lang: Language | string): string {
  const entry = MINISTRY_PROPOSITION_SUMMARIES.get(organ);
  if (!entry) return '';
  return lang === 'sv' ? entry.sv : entry.en;
}
 
/**
 * Produce a display summary for a document.
 *
 * 1. When the document has a `summary` or `notis` that is not person-profile
 *    data, that text is returned after the prose-hygiene filter
 *    (`cleanSummaryForDisplay`, §P0-4). For motions and interpellations whose
 *    text contains motion boilerplate ("Motion till riksdagen" /
 *    "Förslag till riksdagsbeslut"), `cleanMotionText` is applied first.
 * 2. Otherwise a placeholder is assembled from metadata (organ/committee,
 *    subtype, author/party, recipient, document type) using localized labels.
 * 3. When no metadata is usable, a per-type localized default is returned.
 *
 * The returned string is NOT HTML-escaped; callers escape it.
 *
 * @param doc  - Raw document
 * @param type - Document kind: 'report', 'proposition', 'motion' or 'interpellation'
 * @param lang - Target language for labels
 * @returns The summary text
 */
export function generateEnhancedSummary(doc: RawDocument, type: string, lang: Language | string): string {
  // Resolve a label to plain text; non-string label values collapse to ''.
  const label = (key: string): string => {
    const value = L(lang, key);
    return typeof value === 'string' ? value : '';
  };

  // --- 1. Real summary/notis text (person-profile excerpts are ignored) ---
  const supplied = doc.summary || doc.notis || '';
  if (supplied && !isPersonProfileText(supplied)) {
    const isMotionLike = type === 'motion' || type === 'interpellation';
    // cleanMotionText() only understands motion boilerplate, so it is applied
    // solely when that boilerplate is present; other text passes straight to
    // the hygiene filter.
    const needsMotionCleanup =
      isMotionLike &&
      (supplied.includes('Motion till riksdagen') || supplied.includes('Förslag till riksdagsbeslut'));
    return cleanSummaryForDisplay(needsMotionCleanup ? cleanMotionText(supplied) : supplied);
  }

  // --- 2. Placeholder built from metadata ---
  const organ = doc.organ || doc.committee;
  const subtype = doc.subtyp || doc.subtype;
  const docType = doc.doktyp || doc.documentType;

  const fragments: string[] = [];
  const appendSubtype = (): void => {
    if (subtype) fragments.push(`${label('on')} ${subtype}`);
  };

  switch (type) {
    case 'report':
      if (organ) {
        fragments.push(`${organ} ${label('committeeReport')}`);
        appendSubtype();
      }
      break;

    case 'proposition': {
      // A ministry-specific framing sentence wins over the generic wording.
      const ministrySummary = organ ? propSummaryFromOrgan(organ, lang) : '';
      if (ministrySummary) return ministrySummary;
      fragments.push(label('governmentProposition'));
      if (organ) fragments.push(`${label('referredTo')} ${organ}`);
      break;
    }

    case 'motion': {
      // 'Unknown' is a sentinel for missing author/party data.
      const author = (doc.intressent_namn !== 'Unknown' ? doc.intressent_namn : null) || doc.author;
      const party = doc.parti !== 'Unknown' ? doc.parti : undefined;
      if (author) {
        const attribution = party ? `${author} (${party})` : `${author}`;
        fragments.push(`${label('motionBy')} ${attribution}`);
      } else {
        fragments.push(label('parliamentaryMotion'));
      }
      appendSubtype();
      break;
    }

    case 'interpellation':
      // No author/party prefix here: renderInterpellationEntry() already
      // renders a dedicated "Filed by:" line, so repeating it would duplicate
      // the attribution. Only the target minister (mottagare) is included.
      if (doc.mottagare) fragments.push(`→ ${doc.mottagare}`);
      appendSubtype();
      break;

    default:
      break;
  }

  // Mention the document type when it adds information beyond `type`.
  if (docType && docType !== type) {
    fragments.push(`(${docType})`);
  }

  // --- 3. Nothing usable: per-type default label ---
  if (fragments.length === 0) {
    let defaultKey = 'motionDefault';
    if (type === 'report') defaultKey = 'reportDefault';
    else if (type === 'proposition') defaultKey = 'propDefault';
    else if (type === 'interpellation') defaultKey = 'interpellationDefault';
    return label(defaultKey);
  }

  return `${fragments.join(' ')}.`;
}
 
/**
 * Resolve a committee code to a human-readable name.
 *
 * - Missing/empty code → localized "unknown" label (hard fallback `'Unknown'`).
 * - The literal code `'unknown'` → localized "otherCommittee" label
 *   (hard fallback `'Other committees'`).
 * - A code absent from `COMMITTEE_NAMES` → the code itself.
 * - Otherwise the Swedish name for `sv` and the English name for every other
 *   language (other languages get translated via data-translate).
 *
 * @param code - Committee code, if any
 * @param lang - Target language
 * @returns The committee display name
 */
export function getCommitteeName(code: string | undefined, lang: Language | string): string {
  if (!code || code === 'unknown') {
    const missing = !code;
    const label = L(lang, missing ? 'unknown' : 'otherCommittee');
    if (typeof label === 'string') return label;
    return missing ? 'Unknown' : 'Other committees';
  }

  const names: CommitteeName | undefined = COMMITTEE_NAMES[code];
  if (!names) return code;

  if (lang === 'sv') return names.sv;
  return names.en;
}
 
/**
 * Strip the Riksdag raw-dump prefix and embedded CSS that the MCP
 * `get_dokument_innehall` `text` field (stored as `fullContent` in the pipeline)
 * prepends to many documents.
 *
 * Typical dump shape (whitespace-separated, no tags):
 *   `5287561 HD03242 2025/26 242 prop prop prop Proposition 2025/26:242
 *    Proposition Proposition Landsbygds- och infrastrukturdepartementet MJU
 *    242 0 2026-04-16 00:00:00 2026-04-16 15:24:08 2026-04-16 00:00:00
 *    <title> html-ec prop-RIM <uuid>
 *    body {margin-top: 0px;margin-left: 0px;}
 *    #page_1 {position:relative; overflow: hidden; ...} ...`
 *
 * Without cleanup these metadata and CSS fragments render as visible text in
 * article document entries (see `news/2026-04-18-weekly-review-*.html`, fixed
 * by the companion content patch). All data sources and article generators
 * that pass Riksdag document text through `extractKeyPassage` are now
 * protected by this helper.
 *
 * Processing order (the steps depend on each other, do not reorder):
 *   1. CSS rule blocks are replaced by a single space.
 *   2. If the remaining text starts with the metadata prefix, the header is
 *      removed using the first applicable boundary: the `html-ec …-RIM <uuid>`
 *      marker, else a bare UUID within the first 1500 characters, else the
 *      run of timestamps near the start.
 *   3. All whitespace runs are collapsed to single spaces and the result is
 *      trimmed.
 *
 * @param text - Document text, possibly carrying a raw-dump prefix
 * @returns The cleaned, whitespace-normalised text; a falsy input is returned
 *          unchanged
 */
export function stripRiksdagRawDump(text: string): string {
  if (!text) return text;
  let s = text;
 
  // 1. Remove embedded CSS rule blocks ("selector { properties }"). Only strip
  //    blocks whose body looks CSS-like so we never touch legitimate Swedish
  //    prose that happens to contain braces. No nesting in Riksdag dumps.
  //
  //    The outer pattern is bounded ({0,300} selector, {0,1000} body) to prevent
  //    catastrophic backtracking on pathological inputs. `[^{}]` further guarantees
  //    linear-time matching because the inner class cannot overlap the delimiters.
  //    A capture group isolates the `{...body...}` so the CSS signature is tested
  //    only against the block body — not the selector / surrounding prose — which
  //    avoids false positives on Swedish text like `"prisökning: 10 procent"`
  //    that precedes an unrelated brace pair.
  //
  //    `CSS_PROPERTY_SIGNATURE` recognises common CSS property syntax patterns:
  //      - `: <digit>` (numeric value assignments, e.g. `top: 0`, `z-index: -1`)
  //      - CSS length units (`px`, `em`, `rem`) as whole words
  //      - `%;` (percent value terminator)
  //      - CSS hex colours (`#abc` or `#aabbcc`)
  //      - Known CSS property names followed by `:`
  //
  //    NOTE(review): when a block is recognised as CSS the WHOLE match is
  //    replaced, i.e. the brace block plus up to 300 non-brace characters in
  //    front of it (the presumed selector). Any prose sitting in that window
  //    is removed too — confirm this is acceptable for documents where text
  //    appears between CSS rules.
  const CSS_PROPERTY_SIGNATURE = /(?::\s*-?\d|\b(?:px|em|rem)\b|%\s*;|#[0-9a-f]{3,6}\b|position\s*:|margin\s*:|padding\s*:|overflow\s*:|width\s*:|height\s*:|top\s*:|left\s*:|z-index\s*:|display\s*:|font-|border\s*:)/i;
  s = s.replace(/[^{}]{0,300}(\{[^{}]{0,1000}\})/g, (m, body: string) =>
    CSS_PROPERTY_SIGNATURE.test(body) ? ' ' : m
  );
 
  // 2. Detect Riksdag metadata prefix. Always begins with: numeric doc-id,
  //    HD-<dok_id>, and a riksmöte (YYYY/YY).
  const metaPrefix = /^\s*\d{6,}\s+HD\S+\s+\d{4}\/\d{2}\s+/;
  if (metaPrefix.test(s)) {
    // Preferred boundary: "html-ec <doktype>-RIM <UUID>" marker — strip up to and including it.
    const rimRegex = /^[\s\S]*?html-ec\s+\S+-RIM\s+[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\s*/i;
    if (rimRegex.test(s)) {
      s = s.replace(rimRegex, '');
    } else {
      // Fallback 1: strip up to and including the first bare UUID if it is
      // still within the metadata header window (first ~1.5k chars).
      const uuidIdx = s.search(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/i);
      if (uuidIdx > -1 && uuidIdx < 1500) {
        // Drop everything before the UUID, then the UUID itself plus trailing whitespace.
        s = s
          .slice(uuidIdx)
          .replace(/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\s*/i, '');
      } else {
        // Fallback 2: strip the metadata header up to and including the FIRST
        // `YYYY-MM-DD HH:MM:SS` timestamp found after the prefix (the
        // `{0,800}?` quantifier is lazy, so the search window is at most 800
        // chars), then peel off any timestamps that immediately follow.
        const shortHeader = /^\s*\d{6,}\s+HD\S+\s+\d{4}\/\d{2}[\s\S]{0,800}?\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+/;
        const m = s.match(shortHeader);
        if (m) {
          s = s.slice(m[0].length);
          // Each pass removes one leading timestamp; stops at the first non-timestamp token.
          while (/^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+/.test(s)) {
            s = s.replace(/^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+/, '');
          }
        }
      }
    }
  }
 
  // 3. Normalise whitespace: collapse runs (including newlines) and trim the ends.
  return s.replace(/\s+/g, ' ').trim();
}
 
/**
 * Pull a short plain-text excerpt from the start of a document's full text.
 *
 * The text is first reduced to prose: HTML tags become spaces, markdown links
 * keep only their link text, bare http/https URLs are dropped, the Riksdag
 * raw-dump prefix and embedded CSS are stripped, and whitespace is collapsed.
 *
 * If the result exceeds `maxChars`, it is cut after the last full stop at or
 * before `maxChars`, provided that full stop lies beyond index 100; otherwise
 * it is cut hard at `maxChars` and an ellipsis is appended.
 *
 * @param fullText - Full document text (may be missing)
 * @param maxChars - Maximum excerpt length before truncation (default 600)
 * @returns The excerpt, or `''` when no text was supplied
 */
export function extractKeyPassage(fullText: string | undefined, maxChars = 600): string {
  if (!fullText) return '';

  const withoutMarkup = fullText
    // HTML tags → space
    .replace(/<[^>]+>/g, ' ')
    // Markdown links: [text](url) → text
    .replace(/\[([^\]]*)\]\([^)]+\)/g, '$1')
    // Bare http/https URLs
    .replace(/https?:\/\/[^\s)]+/g, '');

  // Metadata header + embedded CSS rule blocks, then whitespace normalisation
  const passage = stripRiksdagRawDump(withoutMarkup).replace(/\s+/g, ' ').trim();
  if (passage.length <= maxChars) return passage;

  // Prefer ending on a sentence boundary close to the limit.
  const sentenceEnd = passage.lastIndexOf('.', maxChars);
  if (sentenceEnd > 100) {
    return passage.slice(0, sentenceEnd + 1);
  }
  return `${passage.slice(0, maxChars)}…`;
}
 
/**
 * Normalise a raw `parti` value to a canonical party key.
 *
 * Non-string values, empty/whitespace-only strings and any capitalisation of
 * 'unknown' all map to `'other'`; every other value is returned trimmed.
 * Used by both generateMotionsContent (party grouping) and
 * generateOppositionStrategySection so the two sections treat the sentinel
 * identically regardless of capitalisation.
 *
 * @param parti - Raw party field of unknown shape
 * @returns The trimmed party key, or `'other'`
 */
export function normalizePartyKey(parti: unknown): string {
  if (typeof parti !== 'string') return 'other';
  const key = parti.trim();
  if (key === '' || key.toLowerCase() === 'unknown') return 'other';
  return key;
}
 
/**
 * Look up party motion success rate from CIA context.
 *
 * Matching is done in two passes so that an exact party id always wins:
 *   1. an entry whose `id` equals `party` exactly;
 *   2. failing that, the first entry whose `partyName` starts with `party`
 *      (case-insensitive).
 *
 * A single combined pass would let an earlier name-prefix hit shadow a later
 * exact id match — e.g. party `M` resolving to "Miljöpartiet" (id `MP`) merely
 * because it is listed before "Moderaterna" (id `M`).
 *
 * @param party - Party id or (prefix of) party name
 * @param cia   - CIA context holding `partyPerformance` entries
 * @returns The matched party's `metrics.successRate`, or `null` when data is
 *          unavailable so callers can skip the annotation
 */
export function partyMotionSuccessRate(party: string | undefined, cia: CIAContext | undefined): number | null {
  if (!cia || !party) return null;
  const entries = cia.partyPerformance;
  const needle = party.toLowerCase();
  const match =
    entries.find(x => x.id === party) ??
    entries.find(x => x.partyName.toLowerCase().startsWith(needle));
  return match ? match.metrics.successRate : null;
}
 
/**
 * Render a document's publication date as HTML.
 *
 * Output shape:
 * `<span class="doc-date"><strong>Published:</strong> <time datetime="2026-03-04">2026-03-04</time></span>`
 * where "Published" is the localized label for `lang`. Both the label and the
 * date are HTML-escaped.
 *
 * @param doc  - Raw document; only `datum` is read
 * @param lang - Target language for the label
 * @returns The HTML snippet, or `''` when `datum` is missing
 */
export function formatDocumentDate(doc: RawDocument, lang: Language | string): string {
  const { datum } = doc;
  if (!datum) return '';
  const label = escapeHtml(String(L(lang, 'published')));
  const safeDate = escapeHtml(datum);
  return (
    `<span class="doc-date"><strong>${label}:</strong> ` +
    `<time datetime="${safeDate}">${safeDate}</time></span>`
  );
}
 
/**
 * Filter documents to only include those published within a given number of days.
 *
 * The cutoff is midnight UTC of the current day minus `maxAgeDays`; each
 * document's date is interpreted as midnight UTC, so the comparison is purely
 * day-based and deterministic.
 *
 * `datum` may be a bare `YYYY-MM-DD` date or carry a time component
 * (`YYYY-MM-DD HH:MM:SS`, `YYYY-MM-DDTHH:MM:SS…`); only the date part is used.
 * Previously a value with a time component was unparseable and the document
 * was therefore always kept, no matter how old it was.
 *
 * Documents without a `datum`, or with a `datum` that cannot be parsed, are
 * kept (benefit of the doubt).
 *
 * @param docs - Array of raw documents
 * @param maxAgeDays - Maximum age in days (default 30)
 * @returns Filtered array containing only fresh documents
 */
export function filterFreshDocuments(docs: RawDocument[], maxAgeDays = 30): RawDocument[] {
  const DAY_MS = 24 * 60 * 60 * 1000;
  // Normalize cutoff to midnight UTC so day-based threshold is consistent
  const now = new Date();
  const todayUtcMs = Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate());
  const cutoffMs = todayUtcMs - maxAgeDays * DAY_MS;

  return docs.filter(doc => {
    if (!doc.datum) {
      // keep documents without dates (benefit of the doubt)
      return true;
    }
    // Use only the leading calendar date when a time component follows it.
    const withTime = /^(\d{4}-\d{2}-\d{2})[T ]/.exec(String(doc.datum));
    const datePart = withTime ? withTime[1] : doc.datum;
    // Interpret the date as midnight UTC for deterministic comparison
    const docTime = Date.parse(`${datePart}T00:00:00Z`);
    if (Number.isNaN(docTime)) {
      // If the date cannot be parsed, keep the document rather than dropping it
      return true;
    }
    return docTime >= cutoffMs;
  });
}