scraper extractors.ts

100% Statements 28/28
71.42% Branches 25/35
100% Functions 11/11
100% Lines 28/28
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35x
 
 
 
 
17x
17x
 
 
 
 
28x
28x
28x
 
 
 
 
12x
12x
 
 
 
 
 
 
 
7x
7x
 
 
 
7x
7x
 
 
 
 
 
 
 
 
 
 
12x
12x
12x
 
12x
12x
12x
5x
5x
5x
 
 
 
12x
 
 
 
 
2x
2x
 
 
 
 
7x
 
 
 
 
 
 
 
 
 
 
18x
 
 
 
 
14x
 
 
 
 
12x
 
 
 
 
 
 
 
  /**
 * @module scripts/fetch-calendar/scraper/extractors
 * @description Low-level HTML extraction primitives used by the
 * Riksdag kalendarium scraper.
 *
 * Each helper is intentionally small and regex-based (no external HTML
 * parser) so they can be fuzz-tested individually against malformed HTML.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { decodeHtmlEntities } from '../../html-utils.js';
 
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded literally inside a `new RegExp(...)` pattern.
 *
 * @param s - Raw text that should be matched literally.
 * @returns `s` with each of `.`, `*`, `+`, `?`, `^`, `$`, `{`, `}`, `(`, `)`,
 *   `|`, `[`, `]` and `\` prefixed by a backslash.
 */
export function escapeRegex(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}
 
/**
 * Extract the `datetime` attribute value from the first `<time>` tag in an
 * HTML fragment that carries a quoted `datetime` attribute.
 *
 * Optional whitespace around the `=` sign is tolerated (as HTML permits),
 * matching how the `class` and `data-*` extractors in this module behave.
 *
 * @param html - HTML fragment to search.
 * @returns The raw, untrimmed attribute value (possibly an empty string), or
 *   `null` when no `<time>` tag with a quoted `datetime` attribute is found.
 */
export function extractDatetime(html: string): string | null {
  // Group 1 captures the opening quote so the same quote must close the value.
  const m = html.match(/<time\b[^>]*\bdatetime\s*=\s*(["'])(.*?)\1/i);
  return m ? (m[2] ?? null) : null;
}
 
/**
 * Look up a quoted `data-{name}` attribute inside a tag's attribute string.
 *
 * The attribute name is matched case-insensitively and whitespace around the
 * `=` sign is allowed. The value is trimmed before being returned.
 *
 * @param attrs - Attribute portion of an opening tag.
 * @param name - Suffix after `data-`; it is regex-escaped before use.
 * @returns The trimmed value, or `null` when the attribute is missing or its
 *   value is empty / whitespace-only.
 */
export function extractDataAttr(attrs: string, name: string): string | null {
  const pattern = new RegExp(`\\bdata-${escapeRegex(name)}\\s*=\\s*(["'])(.*?)\\1`, 'i');
  const value = pattern.exec(attrs)?.[2]?.trim();
  return value ? value : null;
}
 
/**
 * Report whether the first quoted `class` attribute in an attribute string
 * lists `calendar-item` as one of its whitespace-separated tokens.
 *
 * @param attrs - Attribute portion of an opening tag.
 * @returns `true` only on an exact token match; `false` when there is no
 *   quoted `class` attribute or the token is absent.
 */
export function hasCalendarItemClass(attrs: string): boolean {
  const classMatch = /\bclass\s*=\s*(["'])(.*?)\1/i.exec(attrs);
  if (classMatch === null) {
    return false;
  }
  const tokens = (classMatch[2] ?? '').split(/\s+/);
  return tokens.includes('calendar-item');
}
 
/**
 * Return the plain text of the first `<span>` whose quoted `class` attribute
 * contains `name` as a substring.
 *
 * The match is a simple non-greedy regex that stops at the first `</span>`,
 * which covers the common, non-nested markup pattern. Inner tags are removed
 * with `stripTags` and the result is trimmed.
 *
 * @param html - HTML fragment to search.
 * @param name - Class-name fragment to look for; it is regex-escaped before use.
 * @returns The trimmed text, or `null` when no such span exists or its text
 *   is empty after stripping.
 */
export function extractSpanText(html: string, name: string): string | null {
  const classFragment = escapeRegex(name);
  const spanPattern = new RegExp(
    `<span\\b[^>]*\\bclass\\s*=\\s*(["'])[^"']*${classFragment}[^"']*\\1[^>]*>([\\s\\S]*?)<\\/span>`,
    'i',
  );
  const found = spanPattern.exec(html);
  if (!found) {
    return null;
  }
  const text = stripTags(found[2] ?? '').trim();
  return text || null;
}
 
/**
 * Extract the heading content and any document reference links from an
 * event block.
 *
 * @param html - HTML fragment for a single event.
 * @returns An object with:
 *   - `summary`: the raw inner HTML (tags are not stripped here) of the first
 *     `<h1>`–`<h6>` element, or, when there is no heading, whatever
 *     `extractFirstAnchorText` yields for the fragment.
 *   - `docRefs`: the trimmed `href` of every `<a>` tag accepted by
 *     `isRiksdagDocumentHref`, in document order (duplicates are kept).
 */
export function extractHeadingAndLinks(html: string): {
  summary: string;
  docRefs: string[];
} {
  const headingRe = /<h[1-6]\b[^>]*>([\s\S]*?)<\/h[1-6]>/i;
  const headingMatch = html.match(headingRe);
  const summary = headingMatch ? (headingMatch[1] ?? '') : extractFirstAnchorText(html);

  const docRefs: string[] = [];
  // Whitespace around `=` is tolerated, consistent with the other attribute
  // extractors in this module.
  const hrefRe = /<a\b[^>]*\bhref\s*=\s*(["'])([^"']+)\1[^>]*>/gi;
  for (const m of html.matchAll(hrefRe)) {
    const href = (m[2] ?? '').trim();
    if (isRiksdagDocumentHref(href)) {
      docRefs.push(href);
    }
  }

  return { summary, docRefs };
}
 
/**
 * Return the inner content of the first `<a>…</a>` element in an HTML
 * fragment.
 *
 * Nested markup is returned as-is; no tag stripping or trimming is applied.
 *
 * @param html - HTML fragment to search.
 * @returns The anchor's inner HTML, or an empty string when no anchor exists.
 */
export function extractFirstAnchorText(html: string): string {
  const anchor = /<a\b[^>]*>([\s\S]*?)<\/a>/i.exec(html);
  return anchor?.[1] ?? '';
}
 
/**
 * Decide whether an href looks like a Riksdag document or proceedings link.
 *
 * The check is a case-sensitive substring test against a fixed list of path
 * fragments.
 *
 * @param href - Link target to classify.
 * @returns `true` when `href` contains at least one of the known fragments.
 */
export function isRiksdagDocumentHref(href: string): boolean {
  const documentPathMarkers = [
    '/dokument',
    '/betankanden',
    '/propositioner',
    '/motioner',
    '/interpellationer',
  ];
  return documentPathMarkers.some((marker) => href.includes(marker));
}
 
/**
 * Remove HTML tags from a string.
 *
 * Each tag is replaced by a single space and every run of whitespace is then
 * collapsed to one space. The result is not trimmed, so it may start or end
 * with a space.
 *
 * @param html - Markup to flatten.
 * @returns The text with tags replaced and whitespace collapsed.
 */
export function stripTags(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, ' ');
  return withoutTags.replace(/\s+/g, ' ');
}
 
/**
 * Normalize a committee/organ code.
 *
 * Only whitespace is touched: every run of whitespace becomes a single space
 * and leading/trailing whitespace is removed. Letter case and all other
 * characters are left unchanged.
 *
 * @param raw - Code as scraped from the page.
 * @returns The whitespace-normalized code.
 */
export function normalizeOrgCode(raw: string): string {
  const singleSpaced = raw.split(/\s+/).join(' ');
  return singleSpaced.trim();
}
 
/**
 * Normalize an activity type string to lower-case-with-hyphens.
 *
 * Steps, in order: trim surrounding whitespace, lower-case, replace each run
 * of inner whitespace with a single `-`, then drop every character that is
 * not `a`–`z`, `0`–`9`, `-`, `å`, `ä` or `ö`.
 *
 * Trimming happens first on purpose: once whitespace has been turned into
 * hyphens a later trim has nothing left to remove, and padded input would
 * come out with stray leading/trailing hyphens.
 *
 * @param raw - Activity type as scraped from the page.
 * @returns The normalized slug; empty when nothing survives the filtering.
 */
export function normalizeAkt(raw: string): string {
  return raw
    .trim()
    .toLowerCase()
    .replace(/\s+/g, '-')
    .replace(/[^a-z0-9-åäö]/g, '');
}
 
export { decodeHtmlEntities };