All files / scripts/rss article-meta.ts

100% Statements 10/10
80% Branches 8/10
100% Functions 1/1
100% Lines 10/10

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61                                                                    3545x 3545x 3545x 3545x 3545x 3545x 3545x 3545x   3545x               3x                  
/**
 * @module Infrastructure/Rss/ArticleMeta
 * @category Intelligence Operations / Supporting Infrastructure
 * @name Article metadata extractor
 *
 * @description
 * Reads a single news article HTML file and extracts the RSS-relevant
 * fields — title, description, pub date, author, category — by parsing
 * `<title>`, `<meta name="description">`, `<meta property="article:…">`,
 * and `<meta name="author">` tags. Falls back to the file's base name
 * (without `.html`) for the title, an empty string for the description,
 * `stablePubDate` for the pub date, "Riksdagsmonitor" for the author, and
 * "Political Analysis" for the category.
 *
 * Round-6 split: extracted from `scripts/generate-rss.ts`.
 *
 * @author Hack23 AB (Infrastructure Team)
 * @license Apache-2.0
 */
 
import fs from 'fs';
import path from 'path';
 
import { stablePubDate } from './pub-date.js';
 
/**
 * Article-level metadata extracted from the page HTML.
 *
 * Every field is always populated: `extractArticleMeta` substitutes a
 * default whenever the corresponding tag is not found or the file cannot
 * be read.
 */
export interface ArticleMeta {
  /** Text of the `<title>` element; defaults to the file's base name without `.html`. */
  title: string;
  /** `content` of `<meta name="description">`; defaults to an empty string. */
  description: string;
  /**
   * `content` of `<meta property="article:published_time">`, passed through
   * as written in the page; defaults to the result of `stablePubDate`.
   * NOTE(review): the date format is not normalised here — confirm what the
   * feed writer expects.
   */
  pubDate: string;
  /** `content` of `<meta name="author">`; defaults to "Riksdagsmonitor". */
  author: string;
  /** `content` of `<meta property="article:section">`; defaults to "Political Analysis". */
  category: string;
}
 
// Patterns for the tags this extractor reads. Hoisted to module scope so the
// literals are not re-created for every article. Each has exactly one capture
// group holding the value of interest.
// NOTE(review): the <meta> patterns require the attribute order shown and
// double-quoted values; pages written differently fall through to defaults.
const TITLE_PATTERN = /<title>([^<]+)<\/title>/i;
const DESCRIPTION_PATTERN = /<meta\s+name="description"\s+content="([^"]+)"/i;
const PUBLISHED_TIME_PATTERN = /<meta\s+property="article:published_time"\s+content="([^"]+)"/i;
const AUTHOR_PATTERN = /<meta\s+name="author"\s+content="([^"]+)"/i;
const SECTION_PATTERN = /<meta\s+property="article:section"\s+content="([^"]+)"/i;

/**
 * Returns the trimmed first capture group of `pattern` in `html`, or
 * `undefined` when the pattern does not match or the capture is blank.
 */
function firstNonBlankCapture(html: string, pattern: RegExp): string | undefined {
  const captured = pattern.exec(html)?.[1]?.trim();
  // An empty string after trimming is treated the same as "tag missing".
  return captured ? captured : undefined;
}

/**
 * Extracts RSS-relevant metadata from a single article HTML file.
 *
 * Reads the file as UTF-8 and pulls the title, description, publication
 * date, author and section out of the `<title>` and `<meta>` tags. Any
 * field whose tag is absent — or whose value is only whitespace — is
 * replaced by its default: the file's base name (without `.html`), an
 * empty string, `stablePubDate(filePath)`, "Riksdagsmonitor" and
 * "Political Analysis" respectively.
 *
 * This is deliberately best-effort: if the file cannot be read, the
 * error is swallowed and an all-defaults record is returned so that one
 * unreadable article does not abort the caller.
 *
 * @param filePath - Path of the article HTML file to inspect.
 * @returns A fully populated {@link ArticleMeta}; never throws for I/O failures.
 */
export function extractArticleMeta(filePath: string): ArticleMeta {
  // Built once and used both for the unreadable-file path and for
  // per-field fallbacks, so the two can never drift apart.
  const defaults: ArticleMeta = {
    title: path.basename(filePath, '.html'),
    description: '',
    pubDate: stablePubDate(filePath),
    author: 'Riksdagsmonitor',
    category: 'Political Analysis',
  };

  let html: string;
  try {
    html = fs.readFileSync(filePath, 'utf8');
  } catch {
    // Missing/unreadable file: fall back to defaults rather than failing.
    return defaults;
  }

  return {
    title: firstNonBlankCapture(html, TITLE_PATTERN) ?? defaults.title,
    description: firstNonBlankCapture(html, DESCRIPTION_PATTERN) ?? defaults.description,
    pubDate: firstNonBlankCapture(html, PUBLISHED_TIME_PATTERN) ?? defaults.pubDate,
    author: firstNonBlankCapture(html, AUTHOR_PATTERN) ?? defaults.author,
    category: firstNonBlankCapture(html, SECTION_PATTERN) ?? defaults.category,
  };
}