/**
* @module scripts/fetch-calendar/scraper/parse
* @description Entry point for the Riksdag kalendarium HTML scraper.
*
* Dispatches to the article-block parser first (`<article class="calendar-item">`)
* and falls back to the list-item parser (`<li class="calendar-list__item">`)
* when no article blocks are found.
*
* Defensive regex-based — matches the style of `statskontoret-client.ts`'s
* link extractor. Per `Threat_Modeling.md` no external HTML parser is used,
* so the failure mode is "no events" rather than "process aborts on bad
* HTML".
*
* @author Hack23 AB
* @license Apache-2.0
*/
import type { CalendarEvent, CalendarFetchConfig } from '../types.js';
import { parseCalendarArticle } from './article-block.js';
import { parseCalendarListItem } from './list-item.js';
import { hasCalendarItemClass } from './extractors.js';
/**
 * Parse the HTML returned by `https://www.riksdagen.se/sv/kalendarium/` and
 * extract calendar events into the normalized `CalendarEvent` shape.
 *
 * Two passes are made, the second only as a fallback:
 *
 * 1. Every `<article>` element whose attribute string passes
 *    `hasCalendarItemClass` is handed to `parseCalendarArticle`.
 * 2. Only when pass 1 produced no events, every `<li>` element whose `class`
 *    attribute contains `calendar` is handed to `parseCalendarListItem`.
 *
 * Elements for which the delegate parser returns a falsy value are skipped.
 *
 * @param html - Raw HTML document text.
 * @returns The extracted events in document order; an empty array when
 *   nothing matches.
 */
export function parseRiksdagKalendariumHtml(html: string): CalendarEvent[] {
  const events: CalendarEvent[] = [];

  // Group 1 = raw attribute string, group 2 = inner HTML. The body match is
  // non-greedy so each <article> is captured on its own.
  const articleRe = /<article\b([^>]*)>([\s\S]*?)<\/article>/gi;
  for (const articleMatch of html.matchAll(articleRe)) {
    const attrs = articleMatch[1] ?? '';
    if (!hasCalendarItemClass(attrs)) continue;
    const body = articleMatch[2] ?? '';
    const event = parseCalendarArticle(attrs, body);
    if (event) events.push(event);
  }

  // Fallback markup: only consulted when the article pass found nothing.
  if (events.length === 0) {
    // Group 1 = raw attribute string, group 2 = the quote character opening
    // the class value (back-referenced by \2 so the closing quote matches),
    // group 3 = inner HTML.
    const liRe = /<li\b([^>]*class=(["'])[^"']*calendar[^"']*\2[^>]*)>([\s\S]*?)<\/li>/gi;
    for (const liMatch of html.matchAll(liRe)) {
      const attrs = liMatch[1] ?? '';
      const body = liMatch[3] ?? '';
      const event = parseCalendarListItem(attrs, body);
      if (event) events.push(event);
    }
  }

  return events;
}
/**
 * Origin (scheme + host, no trailing slash) of the public Riksdag website.
 *
 * NOTE(review): presumably used as the default for
 * `CalendarFetchConfig.webBaseUrl` — confirm at the call site.
 */
export const DEFAULT_WEB_BASE_URL = 'https://www.riksdagen.se';
/**
 * Fetch the Riksdag web calendar for a date range and parse events.
 *
 * URL: `https://www.riksdagen.se/sv/kalendarium/?from={from}&tom={to}`
 *
 * Trailing slashes on `config.webBaseUrl` are stripped before the path is
 * appended, so `https://host` and `https://host/` produce the same request
 * URL.
 *
 * @param from - Range start; URL-encoded into the `from` query parameter.
 * @param to - Range end; URL-encoded into the `tom` query parameter.
 * @param config - Base URL, timeout in milliseconds (passed to `setTimeout`),
 *   and the fetch implementation to use.
 * @returns The events extracted by `parseRiksdagKalendariumHtml` from the
 *   response body.
 * @throws Error with the message prefix `Riksdag web calendar fetch failed: `
 *   and the underlying failure attached as `cause`. This covers rejections of
 *   `fetchFn` or of reading the body (including the abort triggered by the
 *   timeout) as well as non-OK HTTP statuses, which are raised inside the same
 *   `try` and therefore wrapped too.
 */
export async function fetchWebCalendar(
  from: string,
  to: string,
  config: Required<Pick<CalendarFetchConfig, 'webBaseUrl' | 'timeout' | 'fetchFn'>>,
): Promise<CalendarEvent[]> {
  // Avoid `//sv/kalendarium/` when the configured base ends with a slash.
  const baseUrl = config.webBaseUrl.replace(/\/+$/, '');
  const url = `${baseUrl}/sv/kalendarium/?from=${encodeURIComponent(
    from,
  )}&tom=${encodeURIComponent(to)}`;

  const controller = new AbortController();
  const tid = setTimeout(() => controller.abort(), config.timeout);

  let html: string;
  try {
    const response = await config.fetchFn(url, {
      signal: controller.signal,
      headers: {
        Accept: 'text/html,application/xhtml+xml',
        'Accept-Language': 'sv-SE,sv;q=0.9,en;q=0.8',
        'User-Agent': 'riksdagsmonitor-news-bot/1.0 (+https://riksdagsmonitor.com)',
      },
    });
    if (!response.ok) {
      throw new Error(
        `Riksdag web calendar HTTP error: ${response.status} ${response.statusText}`,
      );
    }
    // Read the body inside the try so the abort timer is still armed while
    // the response is being consumed.
    html = await response.text();
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    throw new Error(`Riksdag web calendar fetch failed: ${msg}`, { cause: err });
  } finally {
    // Always disarm the timer, on success and on failure alike.
    clearTimeout(tid);
  }

  return parseRiksdagKalendariumHtml(html);
}
|