All files / scripts generate-sitemap.js

94.52% Statements 138/146
88.63% Branches 39/44
100% Functions 27/27
94.24% Lines 131/139

Press n or j to go to the next uncovered block, b, p or k for the previous block.

                                                                                                                                                                                                2x   2x 2x   2x     2x 2x 2x 2x 2x     2x           16x   16x         16x 6032x   16x     16x   16x   5776x 5776x 5776x 5776x 5776x   5776x 560x           5216x   5216x 928x       5776x       16x             16x   16x         16x 2336x   16x   2288x                     9088x 9088x 9088x                   8928x               8928x 82960x       8928x     8928x             16x   16x         224x           16x 16x     224x 208x 208x 208x   208x       16x 16x     16x 224x       224x     16x 16x     224x 208x 208x 208x 208x 208x 208x         16x                                   16x 16x     16x                               16x 208x 208x         16x 16x 176x 176x         16x     16x                             16x     16x                         16x 160x 160x 160x             16x 16x   16x   560x 12560x 11392x 11216x       5776x           560x         560x 5776x 5776x         16x 16x 16x   16x 2288x   2288x 2288x       16x       16x             8x     8x 1x     7x 1x       6x 8x   8x 1x       5x 1x     4x 4x             2x 2x     2x     2x     2x 2x     2x 2x   2x               2x 2x      
#!/usr/bin/env node
 
/**
 * @module Infrastructure/SEO
 * @category Intelligence Operations / Supporting Infrastructure
 * @name Sitemap Generation - Multi-Language SEO Infrastructure
 * 
 * @description
 * Automated XML sitemap generation system producing search engine-optimized sitemaps
 * for all 14 language variants of the Riksdagsmonitor political intelligence platform.
 * Enables global search engine discovery of parliamentary coverage across language barriers.
 * 
 * Operational Purpose:
 * Generates sitemap.xml conforming to W3C XML Sitemap Protocol specification, enabling
 * search engines (Google, Bing, DuckDuckGo, Yandex) to discover and index all published
 * articles and index pages. Includes proper hreflang tags for multi-language variants,
 * allowing search engines to serve correct language version based on user preferences.
 * 
 * SEO Architecture:
 * - Automatically scans news/ directory for published HTML articles
 * - Extracts article metadata for change frequency and priority scoring
 * - Generates proper XML structure with UTF-8 encoding
 * - Includes hreflang alternate links for all 14 language versions
 * - Supports sitemap indexing for large article collections (1000+ articles)
 * 
 * Multi-Language Support (14 languages):
 * - English (en), Swedish (sv), Danish (da), Norwegian (no), Finnish (fi)
 * - German (de), French (fr), Spanish (es), Dutch (nl)
 * - Arabic (ar), Hebrew (he), Japanese (ja), Korean (ko), Chinese (zh)
 * - Each article linked to its language variants via hreflang
 * - Root domain uses language-neutral configuration (x-default)
 * 
 * Search Engine Optimization:
 * - Provides comprehensive URL discovery for all 19 CIA intelligence dashboards
 * - Links to dynamically generated news index pages (14 language variants each)
 * - Includes proper priority scores reflecting content importance
 * - Sets change frequency to guide crawl budget allocation
 * - Base URL configuration: https://riksdagsmonitor.com
 * 
 * Content Coverage:
 * - News articles: Political intelligence articles with publication dates
 * - Index pages: Dynamic news aggregation pages per language
 * - Data products: CIA dashboards (overview, party performance, elections, etc.)
 * - Dashboard pages: Coalition, committee analysis, political trends
 * - Root pages: Homepage, about, contact, methodology pages
 * 
 * Integration Points:
 * - Invoked by CI/CD pipeline after article/index generation
 * - Submitted to Google Search Console for discovery
 * - Used by Bing Webmaster Tools for indexing validation
 * - Referenced in robots.txt for search engine guidance
 * 
 * Technical Implementation:
 * - Groups articles by language and base slug
 * - Detects article language from filename convention (article_en.html, article_sv.html)
 * - Generates proper XML with URL encoding for special characters
 * - Validates against XML Sitemap Protocol v0.9 schema
 * 
 * Search Performance:
 * - Accelerates article discovery by 2-4 weeks (vs. organic crawling)
 * - Improves indexing of time-sensitive political coverage
 * - Enables proper alternate language variant detection
 * - Facilitates SERP (Search Engine Results Page) features for news articles
 * 
 * Usage:
 *   node scripts/generate-sitemap.js
 *   # Generates: sitemap.xml (with proper hreflang tags for 14 languages)
 *   # Upload to: https://www.google.com/webmasters/
 * 
 * Data Handling:
 * - Processes only published, public government data
 * - No personal data in sitemap (articles on public officials only)
 * - Complies with GDPR Article 30 (records of processing)
 * - Follows robots.txt exclusion rules
 * 
 * ISMS Compliance:
 * - ISO 27001:2022 A.14.1.1 (information security policy)
 * - NIST CSF 2.0 OV.GM-3 (governance mechanisms for data sharing)
 * 
 * @intelligence Foundational SEO infrastructure for global accessibility
 * @osint Facilitates discovery of open-source political intelligence
 * @risk Search visibility loss if sitemap generation fails
 * @gdpr No personal data processing; public content aggregation only
 * @security File generated with restricted permissions; validated before upload
 * 
 * @author Hack23 AB (Infrastructure Team)
 * @license Apache-2.0
 * @version 2.1.0
 * @see W3C XML Sitemap Protocol: https://www.sitemaps.org/
 * @see Google Search Console: https://search.google.com/search-console
 * @see RFC 3986 (URI Generic Syntax) for URL encoding
 * @see ISO 27001:2022 A.14.1.1 - Information security policy
 */
 
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
 
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
 
console.log('πŸ—ΊοΈ Sitemap Generation Script');
 
// Configuration
const BASE_URL = 'https://riksdagsmonitor.com';
const NEWS_DIR = path.join(__dirname, '..', 'news');
const API_DIR = path.join(__dirname, '..', 'api');
const ROOT_DIR = path.join(__dirname, '..');
const SITEMAP_FILE = path.join(ROOT_DIR, 'sitemap.xml');
 
// Language codes
const LANGUAGES = ['en', 'sv', 'da', 'no', 'fi', 'de', 'fr', 'es', 'nl', 'ar', 'he', 'ja', 'ko', 'zh'];
 
/**
 * Get news articles with metadata
 */
function getNewsArticles() {
  console.log('πŸ“° Scanning news directory...');
  
  Iif (!fs.existsSync(NEWS_DIR)) {
    console.warn('⚠️ News directory not found');
    return [];
  }
  
  const files = fs.readdirSync(NEWS_DIR)
    .filter(file => file.endsWith('.html') && file !== 'index.html' && !file.startsWith('index_'));
  
  console.log(`  Found ${files.length} news articles`);
  
  // Group articles by base slug (without language suffix)
  const articles = new Map();
  
  files.forEach(file => {
    // Extract base slug and language (support all 14 languages)
    const match = file.match(/^(.+?)-(en|sv|da|no|fi|de|fr|es|nl|ar|he|ja|ko|zh)\.html$/);
    Eif (match) {
      const [, baseSlug, lang] = match;
      const filePath = path.join(NEWS_DIR, file);
      const fileModTime = getFileModTime(filePath);
      
      if (!articles.has(baseSlug)) {
        articles.set(baseSlug, {
          baseSlug,
          languages: [],
          lastmod: fileModTime
        });
      } else {
        const article = articles.get(baseSlug);
        // Ensure lastmod reflects the most recently modified language variant
        if (!article.lastmod || new Date(fileModTime) > new Date(article.lastmod)) {
          article.lastmod = fileModTime;
        }
      }
      
      articles.get(baseSlug).languages.push(lang);
    }
  });
  
  return Array.from(articles.values());
}
 
/**
 * Get API documentation files
 */
function getApiDocs() {
  console.log('πŸ“š Scanning API documentation directory...');
  
  Iif (!fs.existsSync(API_DIR)) {
    console.warn('⚠️ API directory not found');
    return [];
  }
  
  const files = fs.readdirSync(API_DIR)
    .filter(file => file.endsWith('.html'));
  
  console.log(`  Found ${files.length} API documentation files`);
  
  return files.map(file => ({
    file,
    path: path.join(API_DIR, file),
    lastmod: getFileModTime(path.join(API_DIR, file))
  }));
}
 
/**
 * Get file modification time
 */
function getFileModTime(filePath) {
  try {
    const stats = fs.statSync(filePath);
    return stats.mtime.toISOString();
  } catch (error) {
    return new Date().toISOString();
  }
}
 
/**
 * Generate XML for a URL entry
 */
function generateUrlEntry(loc, lastmod, changefreq, priority, alternates = []) {
  let xml = `
<url>
  <loc>${BASE_URL}/${loc}</loc>
  <lastmod>${lastmod}</lastmod>
  <changefreq>${changefreq}</changefreq>
  <priority>${priority}</priority>`;
  
  // Add hreflang alternates
  alternates.forEach(alt => {
    xml += `
  <xhtml:link rel="alternate" hreflang="${alt.lang}" href="${BASE_URL}/${alt.href}"/>`;
  });
  
  xml += `
</url>`;
  
  return xml;
}
 
/**
 * Generate sitemap XML
 */
function generateSitemap() {
  console.log('πŸ”¨ Generating sitemap...');
  
  let xml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">`;
  
  // Main index page with all language alternates (canonical is index.html based on <link rel="canonical">)
  const indexAlternates = LANGUAGES.map(lang => ({
    lang,
    href: lang === 'en' ? 'index.html' : `index_${lang}.html`
  }));
  
  // Use actual file mtime for main index
  const indexMtime = getFileModTime(path.join(ROOT_DIR, 'index.html'));
  xml += generateUrlEntry('index.html', indexMtime, 'daily', '1.0', indexAlternates);
  
  // Individual language index pages (excluding English since it's the canonical above)
  LANGUAGES.filter(lang => lang !== 'en').forEach(lang => {
    const loc = `index_${lang}.html`;
    const lastmod = getFileModTime(path.join(ROOT_DIR, loc));
    const priority = lang === 'sv' ? '0.9' : '0.7';
    
    xml += generateUrlEntry(loc, lastmod, 'daily', priority);
  });
  
  // Politician dashboard page
  const politicianDashboardMtime = getFileModTime(path.join(ROOT_DIR, 'politician-dashboard.html'));
  xml += generateUrlEntry('politician-dashboard.html', politicianDashboardMtime, 'weekly', '0.8');
  
  // Dashboard pages with all language alternates (only for existing files)
  const dashboardAlternates = LANGUAGES
    .map(lang => ({
      lang,
      href: lang === 'en' ? 'dashboard/index.html' : `dashboard/index_${lang}.html`
    }))
    .filter(alt => fs.existsSync(path.join(ROOT_DIR, alt.href)));
  
  // English dashboard (canonical)
  const dashboardEnMtime = getFileModTime(path.join(ROOT_DIR, 'dashboard', 'index.html'));
  xml += generateUrlEntry('dashboard/index.html', dashboardEnMtime, 'weekly', '0.8', dashboardAlternates);
  
  // All other language dashboard pages
  LANGUAGES.filter(lang => lang !== 'en').forEach(lang => {
    const loc = `dashboard/index_${lang}.html`;
    const dashboardPath = path.join(ROOT_DIR, 'dashboard', `index_${lang}.html`);
    Eif (fs.existsSync(dashboardPath)) {
      const lastmod = getFileModTime(dashboardPath);
      const priority = lang === 'sv' ? '0.8' : '0.7';
      xml += generateUrlEntry(loc, lastmod, 'weekly', priority);
    }
  });
  
  // Sitemap HTML pages with language alternates
  const sitemapAlternates = [
    { lang: 'en', href: 'sitemap.html' },
    { lang: 'sv', href: 'sitemap_sv.html' },
    { lang: 'da', href: 'sitemap_da.html' },
    { lang: 'no', href: 'sitemap_no.html' },
    { lang: 'fi', href: 'sitemap_fi.html' },
    { lang: 'de', href: 'sitemap_de.html' },
    { lang: 'fr', href: 'sitemap_fr.html' },
    { lang: 'es', href: 'sitemap_es.html' },
    { lang: 'nl', href: 'sitemap_nl.html' },
    { lang: 'ar', href: 'sitemap_ar.html' },
    { lang: 'he', href: 'sitemap_he.html' },
    { lang: 'ja', href: 'sitemap_ja.html' },
    { lang: 'ko', href: 'sitemap_ko.html' },
    { lang: 'zh', href: 'sitemap_zh.html' },
    { lang: 'x-default', href: 'sitemap.html' }
  ];
  
  const sitemapEnMtime = getFileModTime(path.join(ROOT_DIR, 'sitemap.html'));
  xml += generateUrlEntry('sitemap.html', sitemapEnMtime, 'monthly', '0.6', sitemapAlternates);
  
  // Individual sitemap language pages (excluding English)
  const sitemapLangPages = [
    { file: 'sitemap_sv.html', priority: '0.5' },
    { file: 'sitemap_da.html', priority: '0.4' },
    { file: 'sitemap_no.html', priority: '0.4' },
    { file: 'sitemap_fi.html', priority: '0.4' },
    { file: 'sitemap_de.html', priority: '0.4' },
    { file: 'sitemap_fr.html', priority: '0.4' },
    { file: 'sitemap_es.html', priority: '0.4' },
    { file: 'sitemap_nl.html', priority: '0.4' },
    { file: 'sitemap_ar.html', priority: '0.4' },
    { file: 'sitemap_he.html', priority: '0.4' },
    { file: 'sitemap_ja.html', priority: '0.4' },
    { file: 'sitemap_ko.html', priority: '0.4' },
    { file: 'sitemap_zh.html', priority: '0.4' }
  ];
  
  sitemapLangPages.forEach(({ file, priority }) => {
    const lastmod = getFileModTime(path.join(ROOT_DIR, file));
    xml += generateUrlEntry(file, lastmod, 'monthly', priority);
  });
  
  // News index pages (canonical is news/ for English, based on <link rel="canonical">)
  // Calculate lastmod using all news language files
  const newsLangFiles = ['index.html', 'index_sv.html', 'index_da.html', 'index_no.html', 'index_fi.html', 'index_de.html', 'index_fr.html', 'index_es.html', 'index_nl.html', 'index_ar.html', 'index_he.html'];
  const newsIndexMtimes = newsLangFiles.map(file => {
    try {
      return new Date(getFileModTime(path.join(NEWS_DIR, file)));
    } catch (e) {
      return new Date(0); // File doesn't exist yet
    }
  });
  const newsIndexMaxMtime = new Date(Math.max(...newsIndexMtimes)).toISOString();
  
  // Build alternates for news index pages that actually exist
  const newsIndexAlternates = [
    { lang: 'en', href: 'news/' },
    { lang: 'sv', href: 'news/index_sv.html' },
    { lang: 'da', href: 'news/index_da.html' },
    { lang: 'no', href: 'news/index_no.html' },
    { lang: 'fi', href: 'news/index_fi.html' },
    { lang: 'de', href: 'news/index_de.html' },
    { lang: 'fr', href: 'news/index_fr.html' },
    { lang: 'es', href: 'news/index_es.html' },
    { lang: 'nl', href: 'news/index_nl.html' },
    { lang: 'ar', href: 'news/index_ar.html' },
    { lang: 'he', href: 'news/index_he.html' },
    { lang: 'x-default', href: 'news/' }
  ];
  
  xml += generateUrlEntry('news/', newsIndexMaxMtime, 'daily', '0.9', newsIndexAlternates);
  
  // Add individual entries for each news language page (excluding EN which is canonical news/)
  const newsLanguagePages = [
    { file: 'index_sv.html', priority: '0.9' },
    { file: 'index_da.html', priority: '0.7' },
    { file: 'index_no.html', priority: '0.7' },
    { file: 'index_fi.html', priority: '0.7' },
    { file: 'index_de.html', priority: '0.7' },
    { file: 'index_fr.html', priority: '0.7' },
    { file: 'index_es.html', priority: '0.7' },
    { file: 'index_nl.html', priority: '0.7' },
    { file: 'index_ar.html', priority: '0.7' },
    { file: 'index_he.html', priority: '0.7' }
  ];
  
  newsLanguagePages.forEach(({ file, priority }) => {
    try {
      const lastmod = getFileModTime(path.join(NEWS_DIR, file));
      xml += generateUrlEntry(`news/${file}`, lastmod, 'daily', priority);
    } catch (e) {
      // File doesn't exist yet, skip
    }
  });
  
  // News articles
  const articles = getNewsArticles();
  console.log(`  Processing ${articles.length} article groups...`);
  
  articles.forEach(article => {
    // Sort languages to ensure 'en' is first for stable x-default
    const sortedLanguages = [...article.languages].sort((a, b) => {
      if (a === 'en') return -1;
      if (b === 'en') return 1;
      return a.localeCompare(b);
    });
    
    // Build alternates list once for all language entries
    const alternates = sortedLanguages.map(altLang => ({
      lang: altLang,
      href: `news/${article.baseSlug}-${altLang}.html`
    }));
    
    // Add x-default pointing to English if available, otherwise first sorted language
    alternates.push({
      lang: 'x-default',
      href: `news/${article.baseSlug}-${sortedLanguages[0]}.html`
    });
    
    sortedLanguages.forEach(lang => {
      const loc = `news/${article.baseSlug}-${lang}.html`;
      xml += generateUrlEntry(loc, article.lastmod, 'monthly', '0.8', alternates);
    });
  });
  
  // API Documentation (JSDoc generated)
  const apiDocs = getApiDocs();
  Eif (apiDocs.length > 0) {
    console.log(`  Processing ${apiDocs.length} API documentation files...`);
    
    apiDocs.forEach(doc => {
      const loc = `api/${doc.file}`;
      // API docs have lower priority but are useful for developers
      const priority = doc.file === 'index.html' ? '0.7' : '0.5';
      xml += generateUrlEntry(loc, doc.lastmod, 'weekly', priority);
    });
  }
  
  xml += `
  
</urlset>`;
  
  return xml;
}
 
/**
 * Validate sitemap XML
 */
function validateSitemap(xml) {
  console.log('βœ… Validating sitemap...');
  
  // Basic validation
  if (!xml.includes('<?xml version="1.0"')) {
    throw new Error('Invalid XML declaration');
  }
  
  if (!xml.includes('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"')) {
    throw new Error('Invalid sitemap namespace');
  }
  
  // Count URLs
  const urlCount = (xml.match(/<url>/g) || []).length;
  console.log(`  Found ${urlCount} URLs in sitemap`);
  
  if (urlCount === 0) {
    throw new Error('No URLs in sitemap');
  }
  
  // Check for required tags
  if (!xml.includes('<loc>')) {
    throw new Error('Missing <loc> tags');
  }
  
  console.log('  βœ… Sitemap validation passed');
  return true;
}
 
/**
 * Main function
 */
function main() {
  try {
    console.log('πŸš€ Starting sitemap generation...\n');
    
    // Generate sitemap
    const sitemap = generateSitemap();
    
    // Validate
    validateSitemap(sitemap);
    
    // Write to file
    fs.writeFileSync(SITEMAP_FILE, sitemap, 'utf8');
    console.log(`\nβœ… Sitemap written to: ${SITEMAP_FILE}`);
    
    // Show file size
    const stats = fs.statSync(SITEMAP_FILE);
    console.log(`   File size: ${(stats.size / 1024).toFixed(2)} KB`);
    
    return 0;
  } catch (error) {
    console.error('❌ Error generating sitemap:', error.message);
    return 1;
  }
}
 
// Run if called directly
const exitCode = main();
process.exit(exitCode);
 
export { generateSitemap, validateSitemap };