All files / scripts party-variants.ts

100% Statements 12/12
100% Branches 4/4
100% Functions 1/1
100% Lines 12/12

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63                              2x                                     64x   64x 3x     61x     488x   751x       751x       751x 244x 244x         61x    
/**
 * @module Intelligence/PartyAnalysis
 * @description Swedish political party name normalization and mention extraction.
 * Bounded context: Political Entities
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import type { PartyCode, PartyVariantMap } from './types/party.js';
 
/**
 * Lookup table of the textual forms under which each party may be referenced.
 *
 * Keys are the canonical party codes; every value lists the party's full name
 * followed by its abbreviation. Any of the listed forms is normalized to the
 * key it is stored under.
 */
export const PARTY_VARIANTS: PartyVariantMap = {
  // Key order is significant: consumers that iterate this object visit the
  // parties in exactly this sequence.
  S: ['Socialdemokraterna', 'S'],
  M: ['Moderaterna', 'M'],
  SD: ['Sverigedemokraterna', 'SD'],
  V: ['Vänsterpartiet', 'V'],
  MP: ['Miljöpartiet', 'MP'],
  C: ['Centerpartiet', 'C'],
  L: ['Liberalerna', 'L'],
  KD: ['Kristdemokraterna', 'KD'],
} as const;
 
/**
 * Compiled matchers, one per party code, in `PARTY_VARIANTS` key order.
 * `undefined` until the first lookup; afterwards reused by every call so the
 * regular expressions are compiled once instead of once per variant per call.
 */
let partyMentionPatterns: ReadonlyArray<readonly [PartyCode, RegExp]> | undefined;

/**
 * Return the per-party matchers, building them on first use.
 *
 * Each matcher is an alternation of the party's regex-escaped variants,
 * wrapped in Unicode-aware boundaries. An alternation matches exactly when at
 * least one of its variants would match on its own.
 *
 * NOTE: the table is read once; this assumes `PARTY_VARIANTS` is not mutated
 * at runtime after the first call.
 */
function getPartyMentionPatterns(): ReadonlyArray<readonly [PartyCode, RegExp]> {
  if (partyMentionPatterns !== undefined) {
    return partyMentionPatterns;
  }

  const compiled: Array<readonly [PartyCode, RegExp]> = [];

  for (const [canonicalCode, variants] of Object.entries(PARTY_VARIANTS) as Array<
    [PartyCode, readonly string[]]
  >) {
    // A party without variants can never be mentioned; skipping it also avoids
    // an empty alternation, which would match the empty string.
    if (variants.length === 0) {
      continue;
    }

    // Escape special regex characters in every variant before joining them.
    const alternatives = variants
      .map((variant) => variant.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('|');

    // Unicode-aware non-letter/non-number boundary for ALL variants.
    // \b doesn't work well with non-ASCII (ä, ö, å) so we use [^\p{L}\p{N}].
    compiled.push([
      canonicalCode,
      new RegExp(`(?:^|[^\\p{L}\\p{N}])(?:${alternatives})(?=$|[^\\p{L}\\p{N}])`, 'ui'),
    ]);
  }

  partyMentionPatterns = compiled;
  return compiled;
}

/**
 * Extract unique party mentions from HTML content.
 *
 * A party counts as mentioned when any of its `PARTY_VARIANTS` entries occurs
 * in the input delimited on both sides by the start/end of the string or by a
 * character that is neither a Unicode letter nor a Unicode number. Matching is
 * case-insensitive and is performed on the raw string as given: markup is not
 * stripped, so text inside tags and attributes is searched as well.
 *
 * @param html - HTML content to search for party references; `null`,
 *   `undefined` and the empty string yield an empty set
 * @returns Set of canonical party codes found in the content, inserted in
 *   `PARTY_VARIANTS` key order
 */
export function extractPartyMentions(html: string | null | undefined): Set<PartyCode> {
  const parties = new Set<PartyCode>();

  if (!html) {
    return parties;
  }

  for (const [canonicalCode, pattern] of getPartyMentionPatterns()) {
    // The patterns carry no `g`/`y` flag, so `test` keeps no `lastIndex`
    // state and the shared instances are safe to reuse across calls.
    if (pattern.test(html)) {
      parties.add(canonicalCode);
    }
  }

  return parties;
}