Source: scripts/party-variants.js

/**
 * @module Intelligence/PartyAnalysis
 * @category Intelligence Operations / Supporting Infrastructure
 * @name Party Variants - Political Party Name Normalization Framework
 * 
 * @description
 * Essential party name normalization utility providing canonical mappings of Swedish
 * political party names to their abbreviated codes. Prevents double-counting and
 * analytical bias when multiple party name variants appear in the same content.
 * Critical infrastructure for accurate party perspective analysis and coalition dynamics tracking.
 * 
 * Political Party Context (Swedish Riksdag - 8 parties):
 * - Socialdemokraterna (S): Left-wing social democratic party
 * - Moderaterna (M): Right-wing conservative/liberal party
 * - Sverigedemokraterna (SD): Right-wing populist/nationalist party
 * - Vänsterpartiet (V): Far-left communist party
 * - Miljöpartiet (MP): Green party (left-wing)
 * - Centerpartiet (C): Centrist/rural representation party
 * - Liberalerna (L): Classical liberal party
 * - Kristdemokraterna (KD): Christian democratic party
 * 
 * Party Naming Challenges:
 * Articles often reference parties by multiple name variants (full Swedish name,
 * English translation, formal abbreviation). Example: "Moderaterna" = "Moderate Party" = "M".
 * Without normalization, party mention counting becomes inaccurate, leading to:
 * - Overstated party perspective counts (same party counted multiple times)
 * - Analytical bias (some parties may use translated names more frequently)
 * - Quality metrics misalignment (perspective diversity falsely inflated)
 * 
 * Normalization Strategy:
 * - Maps all party name variants to single canonical party code (S, M, SD, V, MP, C, L, KD)
 * - Prevents double-counting when both full names and abbreviations appear
 * - Ensures consistent party perspective analysis across content
 * - Enables reliable coalition analysis based on party groupings
 * 
 * Core Data Structure:
 * PARTY_VARIANTS object maps canonical codes to arrays of name variants:
 * {
 *   'S': ['Socialdemokraterna', 'S'],
 *   'M': ['Moderaterna', 'M'],
 *   'SD': ['Sverigedemokraterna', 'SD'],
 *   // ... etc for all 8 parties
 * }
 * 
 * Text Matching Implementation:
 * extractPartyMentions() function implements robust text search:
 * - Uses Unicode-aware regex boundaries (\p{L}\p{N}) for proper word detection
 * - Handles non-ASCII characters: ä, ö, å in Swedish names
 * - Prevents false matches: searching "M" doesn't match "Moderaterna"'s M
 * - Searches HTML content directly (works with generated articles)
 * - Returns Set<string> of canonical party codes found
 * 
 * Integration Usage Across Codebase:
 * - article-quality-enhancer.js: Counts unique parties for perspective diversity metric
 * - validate-evening-analysis.js: Extracts party mentions for analytical scoring
 * - news-evening-analysis.test.js: Tests party mention extraction accuracy
 * - Intelligence dashboards: Party affiliation tracking and coalition analysis
 * 
 * Intelligence Applications:
 * - Coalition dynamics analysis: Track party alliances and opposition blocs
 * - Perspective diversity measurement: Ensure balanced party coverage
 * - Political polarization tracking: Identify party positioning shifts
 * - Media bias detection: Identify systematic under/over-coverage of parties
 * - Electoral analysis: Monitor party popularity and messaging
 * 
 * Unicode Handling for Swedish Characters:
 * Pattern: (?:^|[^\p{L}\p{N}])VARIANT(?=$|[^\p{L}\p{N}])
 * - \p{L}: Unicode letter (handles ä, ö, å, and all other scripts)
 * - \p{N}: Unicode number
 * - Prevents party codes from matching inside longer words or numbers
 * - Boundary detection is script-agnostic, so it is not limited to Swedish letters
 * 
 * Extending Variants for Other Languages:
 * If future expansion adds English translations or names in other languages,
 * append them to the relevant variant array, for example:
 * - S: ['Socialdemokraterna', 'S', 'Social Democrats'] (if needed)
 * - M: ['Moderaterna', 'M', 'Moderate Party']
 * - Extension maintains backward compatibility
 * 
 * Data Protection:
 * - No personal identifiers stored or processed
 * - Operates on published party entity references only
 * - Complies with GDPR by not identifying individual politicians
 * - Audit trail of political party mentions in articles
 * 
 * ISMS Compliance:
 * - ISO 27001:2022 A.12.2.1 (change log maintenance - version control)
 * - NIST CSF 2.0 PR.DS-1 (data classification - public content)
 * 
 * Functions:
 * - extractPartyMentions(html): Searches HTML content for party references
 *   Input: HTML string from article
 *   Output: Set<string> of canonical party codes (e.g., new Set(['S', 'M', 'SD']))
 * 
 * Usage Example:
 *   import { extractPartyMentions } from './party-variants.js';
 *   const parties = extractPartyMentions(articleHtml);
 *   const uniquePartyCount = parties.size;  // 0-8
 *   const hasBalancedCoverage = uniquePartyCount >= 4;  // Min 4 parties threshold
 * 
 * @intelligence Core utility for accurate party perspective analysis
 * @osint Analyzes public political party references from news coverage
 * @risk Inaccurate normalization leads to biased party perspective counts
 * @gdpr No personal data processing (political entity references only)
 * @security Case-sensitive matching for proper abbreviation detection
 * 
 * @author Hack23 AB (Political Intelligence Team)
 * @license Apache-2.0
 * @version 1.8.0
 * @see article-quality-enhancer.js (primary consumer)
 * @see validate-evening-analysis.js (party mention validation)
 * @see Swedish political party system structure
 */

/**
 * Canonical party code -> list of textual variants that refer to that party.
 *
 * Each entry lists the full Swedish party name first, followed by the
 * abbreviation (which is identical to the canonical code). Key order is the
 * order in which parties are checked and reported.
 */
export const PARTY_VARIANTS = {
  'S': ['Socialdemokraterna', 'S'],
  'M': ['Moderaterna', 'M'],
  'SD': ['Sverigedemokraterna', 'SD'],
  'V': ['Vänsterpartiet', 'V'],
  'MP': ['Miljöpartiet', 'MP'],
  'C': ['Centerpartiet', 'C'],
  'L': ['Liberalerna', 'L'],
  'KD': ['Kristdemokraterna', 'KD'],
};

/**
 * Extract unique party mentions from HTML content.
 *
 * Every variant in PARTY_VARIANTS is searched for as a standalone token, i.e.
 * delimited on both sides by the start/end of the input or by a character that
 * is neither a Unicode letter nor a Unicode number. A party is reported once,
 * no matter how many of its variants occur.
 *
 * Case handling depends on the variant:
 * - All-uppercase variants (the abbreviations S, M, SD, ...) are matched
 *   case-sensitively, so stray lowercase letters such as the "s" in "it's",
 *   the unit in "5 m" or an <s> tag are not counted as party mentions.
 * - All other variants (full party names) are matched case-insensitively.
 *
 * The raw markup is searched as-is; tags and attributes are not stripped.
 *
 * @param {string} html - HTML content to search
 * @returns {Set<string>} - Set of canonical party codes found (empty for
 *   empty or missing input)
 */
export function extractPartyMentions(html) {
  const parties = new Set();

  if (!html) {
    return parties;
  }

  for (const [canonicalCode, variants] of Object.entries(PARTY_VARIANTS)) {
    // One matching variant is enough; `some` stops at the first hit so a
    // party is never counted twice.
    const isMentioned = variants.some((variant) => {
      // Escape special regex characters in variant
      const escapedVariant = variant.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

      // Abbreviations are written in capitals; matching them with the `i`
      // flag would turn any isolated lowercase "s", "m", "c", "l" or "v"
      // into a false party mention. Full names keep case-insensitive matching.
      const isAbbreviation = variant === variant.toUpperCase();
      const flags = isAbbreviation ? 'u' : 'ui';

      // Use Unicode-aware non-letter/non-number boundary for ALL variants.
      // This handles HTML tags (>), parentheses, punctuation, whitespace etc.
      // \b doesn't work well with non-ASCII (ä, ö, å) so we use [^\p{L}\p{N}].
      // For short codes (S, M, V, C, L, MP, SD, KD), this prevents matching
      // inside words like "Sörling", "USA", or "MP" when looking for "M".
      const pattern = new RegExp(
        `(?:^|[^\\p{L}\\p{N}])${escapedVariant}(?=$|[^\\p{L}\\p{N}])`,
        flags
      );
      return pattern.test(html);
    });

    if (isMentioned) {
      parties.add(canonicalCode);
    }
  }

  return parties;
}