Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | 3545x 3545x 3545x 3545x 3545x 3545x 3545x 3545x 3545x 3x | /**
* @module Infrastructure/Rss/ArticleMeta
* @category Intelligence Operations / Supporting Infrastructure
* @name Article metadata extractor
*
* @description
* Reads a single news article HTML file and extracts the RSS-relevant
* fields — title, description, pub date, author, category — by parsing
* `<title>`, `<meta name="description">`, `<meta property="article:…">`,
* and `<meta name="author">` tags. Falls back to `stablePubDate` for the
* pub date, "Riksdagsmonitor" for the author, and "Political Analysis"
* for the category.
*
* Round-6 split: extracted from `scripts/generate-rss.ts`.
*
* @author Hack23 AB (Infrastructure Team)
* @license Apache-2.0
*/
import fs from 'fs';
import path from 'path';
import { stablePubDate } from './pub-date.js';
/**
 * Article-level metadata extracted from the page HTML.
 *
 * Every field is always populated: `extractArticleMeta` substitutes a
 * fallback whenever the corresponding tag cannot be read from the file.
 */
export interface ArticleMeta {
/** Text of the `<title>` element; falls back to the file's base name without the `.html` suffix. */
title: string;
/** `content` of `<meta name="description">`; empty string when absent. */
description: string;
/**
 * `content` of `<meta property="article:published_time">`, copied as-is
 * (not parsed or validated); falls back to `stablePubDate(filePath)`.
 */
pubDate: string;
/** `content` of `<meta name="author">`; falls back to "Riksdagsmonitor". */
author: string;
/** `content` of `<meta property="article:section">`; falls back to "Political Analysis". */
category: string;
}
/**
 * Matches the first `<title>` element, tolerating attributes on the opening
 * tag (e.g. `<title lang="sv">`). Not global, so `exec` is stateless.
 */
const TITLE_PATTERN = /<title(?:\s[^>]*)?>([^<]+)<\/title\s*>/i;

/**
 * Matches one complete `<meta …>` tag. Quoted attribute values are consumed
 * as a unit so a `>` inside `content="a > b"` does not end the tag early.
 */
const META_TAG_PATTERN = /<meta(?=[\s/>])(?:[^>"']|"[^"]*"|'[^']*')*>/gi;

/** Matches one `name=value` attribute with a double-quoted, single-quoted, or unquoted value. */
const META_ATTRIBUTE_PATTERN = /([^\s"'<>/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/;

/** Parses the attributes of a single tag into a map keyed by lower-cased attribute name. */
function parseMetaAttributes(tag: string): Map<string, string> {
  const attributes = new Map<string, string>();
  // Fresh global instance per call so no `lastIndex` state is shared between calls.
  const pattern = new RegExp(META_ATTRIBUTE_PATTERN.source, 'g');
  let hit: RegExpExecArray | null;
  while ((hit = pattern.exec(tag)) !== null) {
    const name = (hit[1] ?? '').toLowerCase();
    // When an attribute is repeated, the first occurrence wins.
    if (!attributes.has(name)) {
      attributes.set(name, hit[2] ?? hit[3] ?? hit[4] ?? '');
    }
  }
  return attributes;
}

/**
 * Indexes the non-blank `content` of every `<meta>` tag under the keys
 * `name:<value>` and `property:<value>` (values lower-cased). The first
 * tag that supplies a given key wins.
 */
function collectMetaContent(html: string): Map<string, string> {
  const found = new Map<string, string>();
  for (const tag of html.match(META_TAG_PATTERN) ?? []) {
    const attributes = parseMetaAttributes(tag);
    const content = attributes.get('content')?.trim();
    if (!content) continue; // blank content is treated the same as a missing tag
    for (const kind of ['name', 'property'] as const) {
      const id = attributes.get(kind)?.toLowerCase();
      if (!id) continue;
      const slot = `${kind}:${id}`;
      if (!found.has(slot)) found.set(slot, content);
    }
  }
  return found;
}

/**
 * Reads one article HTML file and extracts the fields needed for its RSS item.
 *
 * Values come from `<title>`, `<meta name="description">`,
 * `<meta property="article:published_time">`, `<meta name="author">` and
 * `<meta property="article:section">`. Meta tags are recognised regardless
 * of attribute order, quote style, or additional attributes. Tag and
 * attribute matching is case-insensitive. Extracted text is trimmed but
 * otherwise returned verbatim (HTML entities are not decoded).
 *
 * Any field whose tag is missing or blank keeps its fallback: the file's
 * base name (without `.html`) for the title, `''` for the description,
 * `stablePubDate(filePath)` for the pub date, "Riksdagsmonitor" for the
 * author and "Political Analysis" for the category.
 *
 * @param filePath - Path of the article HTML file to read.
 * @returns Fully populated metadata. Read failures are deliberately
 *   swallowed (best effort) and yield the fallback values only.
 */
export function extractArticleMeta(filePath: string): ArticleMeta {
  // Resolved before the read so the fallback exists even when the file is unreadable.
  const fallbackDate = stablePubDate(filePath);
  const meta: ArticleMeta = {
    title: path.basename(filePath, '.html'),
    description: '',
    pubDate: fallbackDate,
    author: 'Riksdagsmonitor',
    category: 'Political Analysis',
  };

  let html: string;
  try {
    html = fs.readFileSync(filePath, 'utf8');
  } catch {
    // Best effort: a missing or unreadable article still gets a usable entry.
    return meta;
  }

  const title = TITLE_PATTERN.exec(html)?.[1]?.trim();
  if (title) meta.title = title;

  const found = collectMetaContent(html);
  meta.description = found.get('name:description') ?? meta.description;
  meta.pubDate = found.get('property:article:published_time') ?? meta.pubDate;
  meta.author = found.get('name:author') ?? meta.author;
  meta.category = found.get('property:article:section') ?? meta.category;
  return meta;
}
|