#!/usr/bin/env node
// Source: scripts/extract-vocabulary.js

/**
 * @module Intelligence/Terminology
 * @category Intelligence Operations / Supporting Infrastructure
 * @name Vocabulary Extraction - Political Terminology Pattern Analysis
 * 
 * @description
 * Advanced terminology extraction system analyzing translated news articles across
 * all 14 supported languages to identify and catalog political terminology patterns.
 * Supports intelligence operatives in understanding how political concepts translate
 * across linguistic and cultural contexts.
 * 
 * Core Mission:
 * Automatic extraction of political terminology from published news articles,
 * enabling vocabulary enrichment across 14 language variants. Identifies key political
 * terms, party names, committee references, legislative concepts, and institutional
 * terminology used in Riksdag coverage. Supports linguistic analysis for translation
 * quality assurance and political concept comparison across Nordic/European contexts.
 * 
 * Supported Languages (14):
 * - Latin Scripts: English (en), Swedish (sv), Danish (da), Norwegian (no), Finnish (fi),
 *   German (de), French (fr), Spanish (es), Dutch (nl)
 * - Non-Latin Scripts: Arabic (ar - RTL), Hebrew (he - RTL), 
 *   Japanese (ja - Kanji/Hiragana/Katakana), Korean (ko - Hangul), Chinese (zh - Hanzi)
 * 
 * Terminology Categories Extracted:
 * - Political titles and roles (e.g., "Riksdagsledamot", "Statsminister", "Minister")
 * - Committee references (e.g., "Försvarsutskottet", "Finansutskottet")
 * - Party mentions and abbreviations (S, M, SD, V, MP, C, L, KD)
 * - Legislative terms (propositioner, motioner, betänkanden, skrivelser)
 * - Institutional acronyms (EU, NATO, UN, etc.)
 * - Policy domains (försvar, miljö, ekonomi, välfärd)
 * 
 * Extraction Methodology:
 * - Structure-based extraction using HTML element analysis (language-agnostic)
 * - Parses article titles (h3 elements) as primary terminology source
 * - Extracts "What to Watch" and key context headings
 * - Handles "Why This Matters" explanatory sections
 * - Cross-references historical context and related articles
 * - Supports all character encodings (UTF-8 with proper multi-byte handling)
 * 
 * Technical Architecture:
 * - CLI-configurable date filtering for temporal analysis
 * - Comprehensive error reporting with file-level granularity
 * - Support for all Unicode scripts (Latin, CJK, Arabic/Hebrew bidirectional)
 * - Handles HTML entity encoding and special character sequences
 * - Generates JSON output for downstream linguistic analysis
 * 
 * Intelligence Applications:
 * - Terminology pattern analysis for translation consistency
 * - Identification of emerging political concepts and terminology
 * - Cross-language comparison of political discourse
 * - Support for cultural intelligence analysis
 * - Input to automated translation quality assurance
 * - Political terminology reference for analyst briefings
 * 
 * Data Processing Pipeline:
 * - Scans news/ directory for published articles
 * - Filters by date prefix (--date-prefix parameter)
 * - Extracts terminology from HTML structure
 * - Deduplicates and normalizes extracted terms
 * - Generates comprehensive terminology report
 * - Logs skipped files and processing errors
 * 
 * Output Format:
 * JSON file containing:
 * - titles: Array of extracted h3 titles (primary terminology)
 * - headings: Array of h2 section headings
 * - whyMatters: Content from "Why This Matters" sections
 * - context: Historical context and reference sections
 * - metadata: Extraction statistics and processing info
 * 
 * Usage:
 *   node scripts/extract-vocabulary.js
 *   node scripts/extract-vocabulary.js --date-prefix 2026-02-
 *   node scripts/extract-vocabulary.js --directory news --date-prefix 2026-03-
 * 
 * 
 * Integration Points:
 * - Consumed by automated translation quality assurance systems
 * - Used in terminology glossary maintenance workflows
 * - Feeds political concept mapping for intelligence dashboards
 * - Referenced in editorial consistency validation
 * 
 * GDPR & Data Protection:
 * - Processes only published, public articles
 * - Extracts terminology only (no personal identifiers)
 * - Complies with GDPR Article 5 (transparency)
 * - No storage of personal data in terminology output
 * - Audit trail of extraction dates and processing
 * 
 * Linguistic Considerations:
 * - Preserves non-Latin character scripts exactly as published
 * - Handles right-to-left languages (Arabic, Hebrew) properly
 * - Supports CJK character analysis without segmentation
 * - Respects linguistic conventions for each language
 * 
 * @intelligence Core tool for political terminology analysis across languages
 * @osint Analyzes published public government documents and articles
 * @risk Incomplete terminology extraction may omit emerging political concepts
 * @gdpr No personal data extraction (terminology/concepts only)
 * @security File access restricted to article directory; no system-wide scanning
 * 
 * @author Hack23 AB (Linguistic Intelligence Team)
 * @license Apache-2.0
 * @version 2.5.0
 * @see extract-vocabulary.js (this file)
 * @see political terminology references for each language
 * @see Unicode Standard for multi-script support
 * @see GDPR Article 5 - Principles relating to processing
 */

import { readFileSync, readdirSync } from 'fs';
import { basename, join } from 'path';

// ISO 639-1 codes mapped to display names for the 14 supported translation
// variants (see module header). Article filenames are expected to end with
// one of these codes, e.g. "2026-02-01-committee-sv.html".
const LANGUAGES = {
  en: 'English', sv: 'Swedish', da: 'Danish', no: 'Norwegian', fi: 'Finnish',
  de: 'German', fr: 'French', es: 'Spanish', nl: 'Dutch',
  ar: 'Arabic', he: 'Hebrew', ja: 'Japanese', ko: 'Korean', zh: 'Chinese'
};

// Track skipped files for warning summary.
// Each entry is { file: string, reason: string }; populated during
// analyzeArticles() and reported at the end of generateReport().
const skippedFiles = [];

/**
 * Extract political terminology from one article's HTML using a
 * structure-based, language-agnostic approach.
 *
 * Bug fix: <h3> titles are now matched across newlines ([\s\S] instead of
 * ".", which does not match \n) and tolerate attributes on the tag,
 * consistent with how <h2> elements are matched.
 *
 * @param {string} content - Raw article HTML (UTF-8).
 * @param {string} lang - ISO 639-1 language code. Currently unused; retained
 *   for interface stability and future language-specific handling.
 * @returns {{titles: string[], watchLabel?: string, committeeLabel?: string,
 *   documentLabel?: string, mainTitle?: string}} Extracted terminology.
 */
function extractTerms(content, lang) {
  const terms = {};
  
  // Extract titles (main political terminology).
  // Handle both plain text and <span> wrapped titles; [\s\S]*? allows a
  // title to span multiple source lines, and [^>]* tolerates attributes.
  const h3Pattern = /<h3[^>]*>([\s\S]*?)<\/h3>/g;
  const h3Matches = [];
  let h3Match;
  while ((h3Match = h3Pattern.exec(content)) !== null) {
    // Strip any inner tags (like <span>) to get clean text
    const cleanText = h3Match[1].replace(/<[^>]+>/g, '').trim();
    if (cleanText) h3Matches.push(cleanText);
  }
  // Cap at 10 titles per article to keep downstream reports readable.
  terms.titles = h3Matches.slice(0, 10);
  
  // Extract "What to Watch" heading (any language) - structure-based
  const h2Pattern = /<h2[^>]*>([^<]+)<\/h2>/g;
  const h2Matches = [];
  let h2Match;
  while ((h2Match = h2Pattern.exec(content)) !== null) {
    const text = h2Match[1].trim();
    // Heuristic: keep only plausibly label-sized headings (drops empty or
    // very long text); no keyword matching so it stays language-agnostic.
    if (text.length > 5 && text.length < 100) {
      h2Matches.push(text);
    }
  }
  if (h2Matches.length > 0) {
    // Assume first h2 is "What to Watch" if present
    terms.watchLabel = h2Matches[0];
  }
  
  // Extract structured labels from <strong>…:</strong> (language-agnostic)
  const strongLabelPattern = /<strong>\s*([^:<]+?)\s*:\s*<\/strong>/g;
  const strongLabels = [];
  let strongMatch;
  while ((strongMatch = strongLabelPattern.exec(content)) !== null) {
    const label = strongMatch[1].trim();
    if (label.length > 0 && label.length < 50) {
      strongLabels.push(label);
    }
  }
  
  // Convention: first label is often committee, second is document
  if (strongLabels[0]) terms.committeeLabel = strongLabels[0];
  if (strongLabels[1]) terms.documentLabel = strongLabels[1];
  
  // Extract article type from title (first <h1> only)
  const titleMatch = content.match(/<h1>([^<]+)<\/h1>/);
  if (titleMatch) terms.mainTitle = titleMatch[1].trim();
  
  return terms;
}

/**
 * Scan a directory of published article HTML files and collect extracted
 * terminology grouped by language code.
 *
 * @param {string} [directory='news'] - Directory containing *.html articles.
 * @param {?string} [datePrefix=null] - When set, only filenames containing
 *   this substring are processed.
 * @returns {Object<string, {language: string, code: string, samples: Array}>}
 *   One bucket per supported language (empty buckets included).
 */
function analyzeArticles(directory = 'news', datePrefix = null) {
  // Pre-create one result bucket per supported language.
  const results = {};
  for (const [code, name] of Object.entries(LANGUAGES)) {
    results[code] = { language: name, code, samples: [] };
  }
  
  // Filename substrings checked in priority order to classify articles.
  const typeRules = [
    ['committee', 'committee-reports'],
    ['proposition', 'propositions'],
    ['motion', 'motions'],
    ['evening', 'evening-analysis'],
    ['week-ahead', 'week-ahead']
  ];
  
  try {
    const candidates = readdirSync(directory).filter(
      (name) => name.endsWith('.html') && (!datePrefix || name.includes(datePrefix))
    );
    
    console.log(`\nScanning ${candidates.length} HTML files in ${directory}/`);
    if (datePrefix) {
      console.log(`Filtering by date prefix: "${datePrefix}"\n`);
    }
    
    for (const file of candidates) {
      // Language code is encoded as a "-xx.html" filename suffix.
      const langMatch = file.match(/-([a-z]{2})\.html$/);
      if (!langMatch) {
        skippedFiles.push({ file, reason: 'No language code in filename' });
        continue;
      }
      
      const lang = langMatch[1];
      if (!results[lang]) {
        skippedFiles.push({ file, reason: `Unknown language code: ${lang}` });
        continue;
      }
      
      try {
        const html = readFileSync(join(directory, file), 'utf-8');
        const terms = extractTerms(html, lang);
        const rule = typeRules.find(([needle]) => file.includes(needle));
        
        results[lang].samples.push({
          file: basename(file),
          type: rule ? rule[1] : 'general',
          terms
        });
      } catch (error) {
        skippedFiles.push({ file, reason: `Read error: ${error.message}` });
      }
    }
  } catch (error) {
    console.error(`Error reading directory: ${error.message}`);
    process.exit(1);
  }
  
  return results;
}

/**
 * Print the vocabulary analysis report to stdout.
 *
 * Sections:
 * - Per-language summary (unique labels and sample titles), skipping
 *   languages with no analyzed samples.
 * - Skipped-files warning summary grouped by reason (reads module-level
 *   `skippedFiles` populated during analyzeArticles()).
 *
 * Fix: removed the `mainTitles` Set, which was collected but never used.
 *
 * @param {Object<string, {language: string, code: string, samples: Array}>} results
 *   Output of analyzeArticles().
 */
function generateReport(results) {
  console.log('\n========================================');
  console.log('Political Vocabulary Analysis Report');
  console.log('========================================\n');
  
  for (const [code, data] of Object.entries(results)) {
    if (data.samples.length === 0) continue;
    
    console.log(`\n## ${data.language} (${code.toUpperCase()})`);
    console.log(`Samples analyzed: ${data.samples.length}`);
    
    // Collect unique labels across this language's samples.
    const watchLabels = new Set();
    const committeeLabels = new Set();
    const documentLabels = new Set();
    
    for (const sample of data.samples) {
      if (sample.terms.watchLabel) watchLabels.add(sample.terms.watchLabel);
      if (sample.terms.committeeLabel) committeeLabels.add(sample.terms.committeeLabel);
      if (sample.terms.documentLabel) documentLabels.add(sample.terms.documentLabel);
    }
    
    if (watchLabels.size > 0) console.log(`  "What to Watch": ${Array.from(watchLabels).join(', ')}`);
    if (committeeLabels.size > 0) console.log(`  "Committee": ${Array.from(committeeLabels).join(', ')}`);
    if (documentLabels.size > 0) console.log(`  "Document": ${Array.from(documentLabels).join(', ')}`);
    
    // Show sample titles, preferring committee reports as the richest source.
    const committeeReports = data.samples.filter(s => s.type === 'committee-reports');
    const sampleWithTitles = committeeReports.find(s => s.terms.titles && s.terms.titles.length > 0) || 
                              data.samples.find(s => s.terms.titles && s.terms.titles.length > 0);
    
    if (sampleWithTitles && sampleWithTitles.terms.titles.length > 0) {
      console.log(`  Sample titles: ${sampleWithTitles.terms.titles.slice(0, 3).join(', ')}`);
    }
  }
  
  // Warning summary for files skipped during analysis.
  if (skippedFiles.length > 0) {
    console.log('\n\n⚠️  WARNING: Skipped Files Summary');
    console.log('=====================================');
    console.log(`Total skipped: ${skippedFiles.length}\n`);
    
    // Group skipped files by skip reason for compact reporting.
    const byReason = {};
    for (const { file, reason } of skippedFiles) {
      if (!byReason[reason]) byReason[reason] = [];
      byReason[reason].push(file);
    }
    
    for (const [reason, files] of Object.entries(byReason)) {
      console.log(`${reason}: ${files.length} file(s)`);
      if (files.length <= 5) {
        files.forEach(f => console.log(`  - ${f}`));
      } else {
        // Truncate long lists to the first three entries.
        files.slice(0, 3).forEach(f => console.log(`  - ${f}`));
        console.log(`  ... and ${files.length - 3} more`);
      }
      console.log();
    }
  }
  
  console.log('\n========================================');
  console.log('Analysis complete!');
  console.log('========================================\n');
}

// ---------------------------------------------------------------------------
// CLI entry point.
// Parse arguments; supports both "--flag value" and "--flag=value" forms
// (the latter was advertised in earlier documentation but never handled).
// Unknown arguments are ignored, matching previous behavior.
// ---------------------------------------------------------------------------
const args = process.argv.slice(2);
let datePrefix = null;   // substring filter for filenames, e.g. "2026-02-"
let directory = 'news';  // directory scanned for *.html articles

for (let i = 0; i < args.length; i++) {
  const arg = args[i];
  // Split "--flag=value" into flag and inline value, if present.
  const eq = arg.indexOf('=');
  const flag = eq === -1 ? arg : arg.slice(0, eq);
  const inline = eq === -1 ? null : arg.slice(eq + 1);

  if (flag === '--date-prefix' && (inline ?? args[i + 1])) {
    datePrefix = inline ?? args[i + 1];
    if (inline === null) i++; // consumed the following argument
  } else if (flag === '--directory' && (inline ?? args[i + 1])) {
    directory = inline ?? args[i + 1];
    if (inline === null) i++; // consumed the following argument
  } else if (flag === '--help' || flag === '-h') {
    console.log(`
Usage: node scripts/extract-vocabulary.js [options]

Options:
  --date-prefix <prefix>   Filter files by date prefix (e.g., "2026-02-")
  --directory <path>       Directory to scan (default: "news")
  --help, -h              Show this help message

Both "--flag value" and "--flag=value" forms are accepted.

Examples:
  node scripts/extract-vocabulary.js
  node scripts/extract-vocabulary.js --date-prefix 2026-02-
  node scripts/extract-vocabulary.js --directory news --date-prefix 2026-03-
`);
    process.exit(0);
  }
}

// Run analysis
const results = analyzeArticles(directory, datePrefix);
generateReport(results);