scraper parse.ts

96.96% Statements 32/33
70% Branches 14/20
66.66% Functions 2/3
100% Lines 29/29
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11x
 
11x
11x
9x
9x
8x
9x
9x
 
 
11x
4x
4x
1x
1x
1x
1x
 
 
 
11x
 
 
9x
 
 
 
 
 
 
 
 
 
 
 
8x
 
 
 
8x
8x
 
 
8x
8x
 
 
 
 
 
 
 
 
6x
1x
 
 
 
 
5x
 
3x
3x
 
8x
 
 
5x
 
  /**
 * @module scripts/fetch-calendar/scraper/parse
 * @description Entry point for the Riksdag kalendarium HTML scraper.
 *
 * Dispatches to the article-block parser first (`<article class="calendar-item">`)
 * and falls back to the list-item parser (`<li class="calendar-list__item">`)
 * when no article blocks are found.
 *
 * Parsing is defensive and regex-based — matching the style of
 * `statskontoret-client.ts`'s link extractor. Per `Threat_Modeling.md`, no
 * external HTML parser is used, so the failure mode is "no events" rather
 * than "process aborts on bad HTML".
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import type { CalendarEvent, CalendarFetchConfig } from '../types.js';
import { parseCalendarArticle } from './article-block.js';
import { parseCalendarListItem } from './list-item.js';
import { hasCalendarItemClass } from './extractors.js';
 
/**
 * Parse the HTML returned by `https://www.riksdagen.se/sv/kalendarium/` and
 * extract calendar events into the normalized `CalendarEvent` shape.
 *
 * Two markup layouts are tried, in order:
 *
 * 1. `<article ...>...</article>` blocks whose attribute string is accepted by
 *    `hasCalendarItemClass`; each one is handed to `parseCalendarArticle`.
 * 2. Only when step 1 produced no events: `<li ...>...</li>` elements carrying a
 *    quoted `class` attribute whose value contains `calendar`; each one is
 *    handed to `parseCalendarListItem`.
 *
 * Blocks for which the delegate parser returns a falsy value are skipped.
 *
 * @param html - Raw HTML of the kalendarium page.
 * @returns The events extracted from `html`, in document order; an empty array
 *   when neither layout yields an event.
 */
export function parseRiksdagKalendariumHtml(html: string): CalendarEvent[] {
  const events: CalendarEvent[] = [];

  // Group 1 = attribute string of the opening tag, group 2 = inner HTML.
  // The lazy `[\s\S]*?` stops at the first closing `</article>`.
  const articleRe = /<article\b([^>]*)>([\s\S]*?)<\/article>/gi;
  for (const articleMatch of html.matchAll(articleRe)) {
    const attrs = articleMatch[1] ?? '';
    // Skip <article> elements that are not calendar items.
    if (!hasCalendarItemClass(attrs)) continue;
    const body = articleMatch[2] ?? '';
    const event = parseCalendarArticle(attrs, body);
    if (event) events.push(event);
  }

  // Fallback layout: consulted only when the article layout produced nothing.
  if (events.length === 0) {
    // Group 1 = attribute string, group 2 = the quote character around the
    // class value (back-referenced as \2), group 3 = inner HTML.
    const liRe = /<li\b([^>]*class=(["'])[^"']*calendar[^"']*\2[^>]*)>([\s\S]*?)<\/li>/gi;
    for (const liMatch of html.matchAll(liRe)) {
      const attrs = liMatch[1] ?? '';
      const body = liMatch[3] ?? '';
      const event = parseCalendarListItem(attrs, body);
      if (event) events.push(event);
    }
  }

  return events;
}
 
/**
 * Origin of the public Riksdag website (scheme + host, no trailing slash).
 *
 * NOTE(review): presumably the default for `CalendarFetchConfig.webBaseUrl`,
 * to which `fetchWebCalendar` appends `/sv/kalendarium/` — confirm at the
 * call sites that build the config.
 */
export const DEFAULT_WEB_BASE_URL = 'https://www.riksdagen.se';
 
/**
 * Download the Riksdag web calendar page for a date range and return the
 * events parsed out of it.
 *
 * Request URL: `{webBaseUrl}/sv/kalendarium/?from={from}&tom={to}`, with both
 * dates passed through `encodeURIComponent`.
 *
 * @param from - Start of the range, sent as the `from` query parameter.
 * @param to - End of the range, sent as the `tom` query parameter.
 * @param config - `webBaseUrl` to prefix the path with, `timeout` in
 *   milliseconds after which the request is aborted, and the `fetchFn` used to
 *   perform the request.
 * @returns The events produced by `parseRiksdagKalendariumHtml` for the
 *   response body.
 * @throws Error with message `Riksdag web calendar fetch failed: ...` (original
 *   error attached as `cause`) when the request rejects, is aborted, answers
 *   with a non-OK status, or the body cannot be read.
 */
export async function fetchWebCalendar(
  from: string,
  to: string,
  config: Required<Pick<CalendarFetchConfig, 'webBaseUrl' | 'timeout' | 'fetchFn'>>,
): Promise<CalendarEvent[]> {
  const query = [`from=${encodeURIComponent(from)}`, `tom=${encodeURIComponent(to)}`].join('&');
  const requestUrl = `${config.webBaseUrl}/sv/kalendarium/?${query}`;

  // Abort the in-flight request once the configured timeout elapses.
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), config.timeout);

  const requestHeaders = {
    Accept: 'text/html,application/xhtml+xml',
    'Accept-Language': 'sv-SE,sv;q=0.9,en;q=0.8',
    'User-Agent': 'riksdagsmonitor-news-bot/1.0 (+https://riksdagsmonitor.com)',
  };

  let pageHtml: string;
  try {
    const res = await config.fetchFn(requestUrl, {
      signal: abortController.signal,
      headers: requestHeaders,
    });

    if (res.ok) {
      pageHtml = await res.text();
    } else {
      // Thrown inside the try so it gets the same wrapping as transport errors.
      throw new Error(`Riksdag web calendar HTTP error: ${res.status} ${res.statusText}`);
    }
  } catch (failure) {
    let reason: string;
    if (failure instanceof Error) {
      reason = failure.message;
    } else {
      reason = String(failure);
    }
    throw new Error(`Riksdag web calendar fetch failed: ${reason}`, { cause: failure });
  } finally {
    // The timer stays armed until the body has been read (or the attempt failed).
    clearTimeout(abortTimer);
  }

  // Parsing happens outside the try block, so parser errors are not re-wrapped.
  return parseRiksdagKalendariumHtml(pageHtml);
}