/**
* @module scripts/fetch-calendar/scraper/extractors
* @description Low-level HTML extraction primitives used by the
* Riksdag kalendarium scraper.
*
* Each helper is intentionally small and regex-based (no external HTML
* parser) so they can be fuzz-tested individually against malformed HTML.
*
* @author Hack23 AB
* @license Apache-2.0
*/
import { decodeHtmlEntities } from '../../html-utils.js';
/**
 * Backslash-escape every regular-expression metacharacter in `s` so that the
 * result can be embedded in a `new RegExp(...)` pattern and match literally.
 *
 * @param s - Raw text that should be matched verbatim.
 * @returns `s` with each metacharacter prefixed by a backslash.
 */
export function escapeRegex(s: string): string {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metacharacters, (ch) => `\\${ch}`);
}
/**
 * Extract the `datetime` attribute from the first `<time>` element in `html`.
 *
 * The value must be quoted (single or double quotes). Whitespace around the
 * `=` sign is tolerated, matching the other attribute extractors in this
 * module and the attribute syntax HTML permits.
 *
 * @param html - HTML fragment to search.
 * @returns The attribute value exactly as written (possibly an empty string),
 *   or `null` when no `<time>` element with a quoted `datetime` is found.
 */
export function extractDatetime(html: string): string | null {
  const m = html.match(/<time\b[^>]*\bdatetime\s*=\s*(["'])(.*?)\1/i);
  return m?.[2] ?? null;
}
/**
 * Read the value of a quoted `data-{name}` attribute out of a tag's
 * attribute string.
 *
 * @param attrs - Attribute portion of a tag, e.g. `class="x" data-id="42"`.
 * @param name - Attribute suffix after `data-`; matched literally and
 *   case-insensitively.
 * @returns The trimmed value, or `null` when the attribute is missing or its
 *   value is empty or whitespace-only.
 */
export function extractDataAttr(attrs: string, name: string): string | null {
  const pattern = new RegExp(`\\bdata-${escapeRegex(name)}\\s*=\\s*(["'])(.*?)\\1`, 'i');
  const found = pattern.exec(attrs);
  if (found === null) {
    return null;
  }
  const value = (found[2] ?? '').trim();
  return value === '' ? null : value;
}
/**
 * True when an element attribute string contains a `calendar-item` class token.
 *
 * Only the first quoted `class` attribute is inspected. The attribute name
 * must not be preceded by a word character or a hyphen, so hyphenated
 * attributes such as `data-class` or `ng-class` are not mistaken for `class`.
 *
 * @param attrs - Attribute portion of a tag (or the whole opening tag).
 * @returns `true` when `calendar-item` is one of the whitespace-separated
 *   class tokens, otherwise `false`.
 */
export function hasCalendarItemClass(attrs: string): boolean {
  // A plain `\bclass` would also match the tail of `data-class`, because `-`
  // is a non-word character; the lookbehind rules that out.
  const m = attrs.match(/(?<![\w-])class\s*=\s*(["'])(.*?)\1/i);
  if (!m) {
    return false;
  }
  return (m[2] ?? '').split(/\s+/).includes('calendar-item');
}
/**
 * Return the text content of the first `<span>` whose quoted `class`
 * attribute contains `name` as a substring.
 *
 * The span body is matched non-greedily up to the first `</span>`, so a span
 * containing nested spans is cut at the inner closing tag. Tags inside the
 * body are removed and whitespace is collapsed before trimming.
 *
 * @param html - HTML fragment to search.
 * @param name - Class-name fragment to look for; matched literally and
 *   case-insensitively.
 * @returns The cleaned text, or `null` when no such span exists or its text
 *   is empty.
 */
export function extractSpanText(html: string, name: string): string | null {
  const classFragment = escapeRegex(name);
  const spanPattern = new RegExp(
    `<span\\b[^>]*\\bclass\\s*=\\s*(["'])[^"']*${classFragment}[^"']*\\1[^>]*>([\\s\\S]*?)<\\/span>`,
    'i',
  );
  const hit = spanPattern.exec(html);
  if (hit === null) {
    return null;
  }
  const text = stripTags(hit[2] ?? '').trim();
  return text === '' ? null : text;
}
/**
 * Extract a summary and any document reference links from an event block.
 *
 * The summary is the raw inner HTML of the first `<h1>`–`<h6>` element (the
 * opening and closing heading levels are not required to match). When the
 * block has no heading, the inner HTML of the first `<a>` is used instead.
 * No tag stripping or entity decoding is applied here.
 *
 * Document references are the trimmed, quoted `href` values of every `<a>`
 * tag whose target passes `isRiksdagDocumentHref`, in document order.
 * Duplicates are kept.
 *
 * @param html - HTML fragment for a single calendar event.
 * @returns The summary markup (possibly empty) and the matching hrefs.
 */
export function extractHeadingAndLinks(html: string): {
  summary: string;
  docRefs: string[];
} {
  const headingMatch = html.match(/<h[1-6]\b[^>]*>([\s\S]*?)<\/h[1-6]>/i);
  const summary = headingMatch ? (headingMatch[1] ?? '') : extractFirstAnchorText(html);

  const docRefs: string[] = [];
  // Whitespace around `=` is tolerated, as in the other attribute matchers.
  const hrefRe = /<a\b[^>]*\bhref\s*=\s*(["'])([^"']+)\1[^>]*>/gi;
  for (const m of html.matchAll(hrefRe)) {
    const href = (m[2] ?? '').trim();
    if (isRiksdagDocumentHref(href)) {
      docRefs.push(href);
    }
  }
  return { summary, docRefs };
}
/**
 * Return the inner HTML of the first `<a>…</a>` pair in a fragment.
 *
 * The content is returned as written; nested tags are not stripped.
 *
 * @param html - HTML fragment to search.
 * @returns The anchor's inner markup, or an empty string when the fragment
 *   contains no complete anchor element.
 */
export function extractFirstAnchorText(html: string): string {
  const anchor = /<a\b[^>]*>([\s\S]*?)<\/a>/i.exec(html);
  if (anchor === null) {
    return '';
  }
  return anchor[1] ?? '';
}
/**
 * Decide whether an href looks like a Riksdag document or proceedings link.
 *
 * The check is a case-sensitive substring test against a fixed list of path
 * fragments.
 *
 * @param href - Link target to classify.
 * @returns `true` when `href` contains any of the known fragments.
 */
export function isRiksdagDocumentHref(href: string): boolean {
  const documentPathMarkers = [
    '/dokument',
    '/betankanden',
    '/propositioner',
    '/motioner',
    '/interpellationer',
  ];
  return documentPathMarkers.some((marker) => href.includes(marker));
}
/**
 * Replace every `<...>` tag with a space, then collapse each run of
 * whitespace into a single space.
 *
 * The result is not trimmed, so it may start or end with one space.
 *
 * @param html - Markup to flatten.
 * @returns The text with tags removed and whitespace collapsed.
 */
export function stripTags(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, ' ');
  return withoutTags.split(/\s+/).join(' ');
}
/**
 * Tidy a committee/organ code: collapse each run of whitespace into one
 * space and strip leading and trailing whitespace. Letter case and all other
 * characters are left untouched.
 *
 * @param raw - Code as scraped from the page.
 * @returns The whitespace-normalized code.
 */
export function normalizeOrgCode(raw: string): string {
  const collapsed = raw.split(/\s+/).join(' ');
  return collapsed.trim();
}
/**
 * Normalize an activity type string to lower-case-with-hyphens.
 *
 * Steps, in order: trim surrounding whitespace, lower-case, replace each run
 * of whitespace with a single hyphen, then drop every character that is not
 * `a`–`z`, `0`–`9`, `å`, `ä`, `ö` or a hyphen.
 *
 * Trimming happens first on purpose: once whitespace has been turned into
 * hyphens there is nothing left for `trim()` to remove, and padded input
 * would otherwise come out with leading or trailing hyphens.
 *
 * @param raw - Activity type as scraped from the page.
 * @returns The normalized slug (possibly empty).
 */
export function normalizeAkt(raw: string): string {
  return raw
    .trim()
    .toLowerCase()
    .replace(/\s+/g, '-')
    .replace(/[^a-z0-9åäö-]/g, '');
}
/** Re-export of the entity decoder imported from `html-utils`, so it is also available from this module. */
export { decodeHtmlEntities };