All files / scripts/fetch-calendar/scraper extractors.ts

100% Statements 28/28
71.42% Branches 25/35
100% Functions 11/11
100% Lines 28/28

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114                                33x         16x 16x         26x 26x 26x         11x 11x               7x 7x       7x 7x                     11x 11x 11x   11x 11x 11x 5x 5x 5x       11x         2x 2x         7x                     17x         13x         11x                
/**
 * @module scripts/fetch-calendar/scraper/extractors
 * @description Low-level HTML extraction primitives used by the
 * Riksdag kalendarium scraper.
 *
 * Each helper is intentionally small and regex-based (no external HTML
 * parser) so they can be fuzz-tested individually against malformed HTML.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { decodeHtmlEntities } from '../../html-utils.js';
 
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded literally in a `new RegExp(...)` pattern.
 *
 * @param s - Arbitrary text to embed in a pattern.
 * @returns `s` with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
export function escapeRegex(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}
 
/**
 * Extract the `datetime` attribute from a `<time>` element.
 *
 * The attribute value must be quoted (single or double quotes). Optional
 * whitespace around `=` is tolerated, matching the other attribute
 * extractors in this module.
 *
 * @param html - HTML fragment to search.
 * @returns The raw attribute value (possibly an empty string), or `null`
 *   when no `<time>` tag with a quoted `datetime` attribute is found.
 */
export function extractDatetime(html: string): string | null {
  const m = html.match(/<time\b[^>]*\bdatetime\s*=\s*(["'])(.*?)\1/i);
  // Group 1 is the opening quote; group 2 is the attribute value.
  return m?.[2] ?? null;
}
 
/**
 * Read the value of a quoted `data-{name}` attribute from a tag's
 * attribute string.
 *
 * @param attrs - The attribute portion of a tag.
 * @param name - Attribute name without the `data-` prefix; matched
 *   literally and case-insensitively.
 * @returns The trimmed value, or `null` when the attribute is missing or
 *   its value is empty after trimming.
 */
export function extractDataAttr(attrs: string, name: string): string | null {
  const pattern = new RegExp(`\\bdata-${escapeRegex(name)}\\s*=\\s*(["'])(.*?)\\1`, 'i');
  const value = pattern.exec(attrs)?.[2]?.trim();
  return value ? value : null;
}
 
/**
 * Report whether a tag's attribute string has a quoted `class` attribute
 * whose whitespace-separated tokens include exactly `calendar-item`.
 *
 * @param attrs - The attribute portion of a tag.
 * @returns `true` when the token is present, `false` otherwise (including
 *   when there is no quoted `class` attribute).
 */
export function hasCalendarItemClass(attrs: string): boolean {
  const classMatch = /\bclass\s*=\s*(["'])(.*?)\1/i.exec(attrs);
  if (classMatch === null) {
    return false;
  }
  const tokens = (classMatch[2] ?? '').split(/\s+/);
  return tokens.some((token) => token === 'calendar-item');
}
 
/**
 * Return the text content of the first `<span>` whose quoted `class`
 * attribute value contains `name` as a substring.
 *
 * The span body is taken non-greedily up to the first `</span>`, passed
 * through `stripTags`, and trimmed.
 *
 * @param html - HTML fragment to search.
 * @param name - Class-name fragment to look for; matched literally and
 *   case-insensitively.
 * @returns The trimmed text, or `null` when no such span exists or its
 *   text is empty.
 */
export function extractSpanText(html: string, name: string): string | null {
  const pattern = new RegExp(
    `<span\\b[^>]*\\bclass\\s*=\\s*(["'])[^"']*${escapeRegex(name)}[^"']*\\1[^>]*>([\\s\\S]*?)<\\/span>`,
    'i',
  );
  const found = pattern.exec(html);
  if (found === null) {
    return null;
  }
  // Group 1 is the quote character; group 2 is the span's inner markup.
  const text = stripTags(found[2] ?? '').trim();
  return text === '' ? null : text;
}
 
/**
 * Extract the heading text (h1–h6 or first anchor) and any document
 * reference links from an event block.
 *
 * @param html - HTML fragment for a single event.
 * @returns `summary`: the raw inner markup of the first `<h1>`–`<h6>`
 *   element, or, when there is no heading, whatever
 *   `extractFirstAnchorText` returns for the fragment (tags are not
 *   stripped here). `docRefs`: the trimmed, quoted `href` values of all
 *   `<a>` tags that `isRiksdagDocumentHref` accepts, in document order.
 */
export function extractHeadingAndLinks(html: string): {
  summary: string;
  docRefs: string[];
} {
  const headingRe = /<h[1-6]\b[^>]*>([\s\S]*?)<\/h[1-6]>/i;
  const headingMatch = html.match(headingRe);
  const summary = headingMatch ? (headingMatch[1] ?? '') : extractFirstAnchorText(html);

  const docRefs: string[] = [];
  // Optional whitespace around `=` is accepted, as in the other attribute
  // matchers in this module.
  const hrefRe = /<a\b[^>]*\bhref\s*=\s*(["'])([^"']+)\1[^>]*>/gi;
  for (const m of html.matchAll(hrefRe)) {
    const href = (m[2] ?? '').trim();
    if (isRiksdagDocumentHref(href)) {
      docRefs.push(href);
    }
  }

  return { summary, docRefs };
}
 
/**
 * Return the inner markup of the first `<a>…</a>` pair in an HTML fragment.
 *
 * @param html - HTML fragment to search.
 * @returns The content between the opening and closing anchor tags
 *   (nested tags are left in place), or `''` when no anchor is found.
 */
export function extractFirstAnchorText(html: string): string {
  const anchor = /<a\b[^>]*>([\s\S]*?)<\/a>/i.exec(html);
  return anchor?.[1] ?? '';
}
 
/**
 * Decide whether an href looks like a Riksdag document or proceedings link.
 *
 * @param href - Link target to classify.
 * @returns `true` when `href` contains any of the path fragments
 *   `/dokument`, `/betankanden`, `/propositioner`, `/motioner` or
 *   `/interpellationer` (case-sensitive substring match).
 */
export function isRiksdagDocumentHref(href: string): boolean {
  const pathFragments = [
    '/dokument',
    '/betankanden',
    '/propositioner',
    '/motioner',
    '/interpellationer',
  ];
  return pathFragments.some((fragment) => href.includes(fragment));
}
 
/**
 * Strip HTML tags from a string.
 *
 * Each `<...>` tag is replaced by a single space, then every run of
 * whitespace is collapsed to one space. The result is not trimmed.
 *
 * @param html - Markup to flatten.
 * @returns The text with tags removed and whitespace collapsed.
 */
export function stripTags(html: string): string {
  const withoutTags = html.split(/<[^>]+>/).join(' ');
  return withoutTags.split(/\s+/).join(' ');
}
 
/**
 * Normalize a committee/organ code: collapse every whitespace run to a
 * single space and drop leading/trailing whitespace. Letter case and all
 * other characters are left untouched.
 *
 * @param raw - Code as scraped.
 * @returns The whitespace-normalized code.
 */
export function normalizeOrgCode(raw: string): string {
  const words = raw.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
 
/**
 * Normalize activity type strings to lower-case-with-hyphens.
 *
 * Steps, in order: trim surrounding whitespace, lower-case, replace each
 * remaining whitespace run with a single `-`, then drop every character
 * that is not `a`–`z`, `0`–`9`, `-`, `å`, `ä` or `ö`.
 *
 * @param raw - Activity type as scraped.
 * @returns The normalized slug; may be empty.
 */
export function normalizeAkt(raw: string): string {
  return (
    raw
      // Trim first: once whitespace has become hyphens a trailing trim is a
      // no-op and padded input would yield leading/trailing hyphens.
      .trim()
      .toLowerCase()
      .replace(/\s+/g, '-')
      .replace(/[^a-z0-9-åäö]/g, '')
  );
}
 
// Re-export decodeHtmlEntities (imported above from '../../html-utils.js') from this module.
export { decodeHtmlEntities };