// Coverage report: scripts/statskontoret/extractors/download-links.ts
// Statements 90.69% (39/43) · Branches 75% (45/60) · Functions 100% (4/4) · Lines 100% (35/35)
/**
 * @module scripts/statskontoret/extractors/download-links
 * @description Extract downloadable Excel/CSV-ZIP/document links from a
 * Statskontoret open-data HTML page, with provenance attributes.
 *
 * Defensive regex-based scraper — no external HTML parser — matches the
 * style of the rest of the Statskontoret client. The classifier mirrors
 * the firewall allowlist's file-extension allow-list.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { STATSKONTORET_BASE_URL } from '../source-registry.js';
import type {
  StatskontoretDownloadLink,
  StatskontoretResourceType,
  StatskontoretSourceKey,
} from '../types.js';
import {
  decodeHtml,
  normalizeWhitespace,
  parseStatskontoretOptionalInt,
} from '../internal/text.js';
import { resolveStatskontoretUrl } from '../internal/url-guard.js';
 
/**
 * Matches a `.xlsx`, `.xls`, `.csv`, `.zip`, `.docx` or `.pdf` extension that
 * is immediately followed by the end of the string, a `?` or a `#`
 * (case-insensitive). Capture group 1 is the extension without the dot.
 */
export const FILE_EXTENSION_RE = /\.(xlsx|xls|csv|zip|docx|pdf)(?:$|[?#])/i;
/**
 * Matches an `<a …href="…"…>…</a>` element (case-insensitive, global).
 * Capture group 1 is the quoted `href` value (single or double quotes, must
 * not contain either quote character); capture group 2 is the raw inner
 * markup, matched lazily across newlines up to the first `</a>`.
 *
 * NOTE: the `g` flag makes this regex stateful (`lastIndex`) when used with
 * `exec`/`test`; `String.prototype.matchAll` works on an internal clone.
 */
export const HREF_RE = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
/** Matches every `<…>` tag-like span (global); used to strip markup from link text. */
export const TAG_RE = /<[^>]+>/g;
 
/**
 * Scan an HTML page for `<a href>` anchors that point at downloadable
 * resources and return one provenance-tagged record per distinct URL.
 *
 * For every anchor matched by {@link HREF_RE} the function:
 * 1. HTML-decodes and trims the `href`, and derives the link text by
 *    replacing nested tags with spaces, decoding and normalising whitespace;
 * 2. skips anchors whose `href` is empty or whose resource type is
 *    classified as `'unknown'`;
 * 3. resolves the `href` against `baseURL` and copies selected query
 *    parameters (`documentType`, `fileType`, `fileName`, `Year`, `month`,
 *    `status`) onto the record when they are present and non-empty.
 *
 * The page-level `<meta name="last-modified">` value, when found, is attached
 * to every record as `updatedAt`. Records are de-duplicated by resolved URL;
 * the first occurrence wins.
 *
 * @param html - Raw HTML of the page to scan.
 * @param source - Source key copied verbatim onto each record.
 * @param sourcePage - Page identifier/URL copied verbatim onto each record.
 * @param baseURL - Base used to resolve relative `href` values.
 * @returns The distinct download links in document order.
 * @throws Whatever `resolveStatskontoretUrl` throws, and `TypeError` from the
 *   `URL` constructor if the resolved value is not a parseable URL.
 */
export function extractStatskontoretDownloadLinks(
  html: string,
  source: StatskontoretSourceKey,
  sourcePage: string,
  baseURL: string = STATSKONTORET_BASE_URL,
): StatskontoretDownloadLink[] {
  const links: StatskontoretDownloadLink[] = [];
  const pageUpdatedAt = extractPageLastModified(html);
  for (const match of html.matchAll(HREF_RE)) {
    const href = decodeHtml(match[1] ?? '').trim();
    const text = normalizeWhitespace(decodeHtml((match[2] ?? '').replace(TAG_RE, ' ')));
    if (!href) continue;
    const resourceType = classifyStatskontoretResource(href, text);
    if (resourceType === 'unknown') continue;
    const url = resolveStatskontoretUrl(href, baseURL);
    const params = new URL(url).searchParams;
    // Read each query parameter once; `get` returns null when the key is absent.
    const documentType = params.get('documentType');
    const fileType = params.get('fileType');
    const fileName = params.get('fileName');
    const status = params.get('status');
    // Query keys are case-sensitive: the year is read from `Year` (capitalised)
    // while the month is read from `month` — NOTE(review): presumably this
    // mirrors the site's link format; confirm against real pages.
    const year = parseStatskontoretOptionalInt(params.get('Year'));
    const month = parseStatskontoretOptionalInt(params.get('month'));
    links.push({
      source,
      sourcePage,
      href,
      url,
      text,
      resourceType,
      // Optional fields are omitted entirely (not set to undefined) when the
      // parameter is missing or empty.
      ...(documentType ? { documentType } : {}),
      ...(fileType ? { fileType } : {}),
      ...(fileName ? { fileName } : {}),
      ...(year !== undefined ? { year } : {}),
      ...(month !== undefined ? { month } : {}),
      ...(status ? { status } : {}),
      ...(pageUpdatedAt ? { updatedAt: pageUpdatedAt } : {}),
    });
  }
  return deduplicateLinks(links);
}
 
/**
 * Classify a link by inspecting its `href` and visible text.
 *
 * Rules are evaluated in order; the first match wins:
 * 1. `'excel'` — `filetype=excel` appears (case-insensitively) in the href or
 *    text, the href ends in `.xlsx` (optionally followed by `?`/`#`), or the
 *    text contains the word "excel".
 * 2. `'csv-zip'` — `filetype=zip` appears in the href or text and the text
 *    contains the word "csv".
 * 3. `'csv-zip'` / `'zip'` — the href ends in `.zip`; `'csv-zip'` when the
 *    text contains the word "csv", otherwise `'zip'`.
 * 4. `'csv-zip'` — the text contains the word "csv" or "zip" and the href
 *    contains `GetFile` (case-sensitive).
 * 5. `'document'` — the href ends in `.docx` or `.pdf`.
 * 6. `'unknown'` — everything else, including `.xls`/`.csv` hrefs and
 *    `GetFile` hrefs that matched none of the rules above.
 *
 * @param href - The (already decoded) link target.
 * @param text - The link's visible text.
 * @returns The resource type for the link.
 */
export function classifyStatskontoretResource(
  href: string,
  text: string,
): StatskontoretResourceType {
  const haystack = `${href} ${text}`.toLowerCase();
  if (
    haystack.includes('filetype=excel') ||
    /\.xlsx(?:$|[?#])/i.test(href) ||
    /\bexcel\b/i.test(text)
  ) {
    return 'excel';
  }
  // Several rules below depend on whether the link text mentions CSV.
  const mentionsCsv = /\bcsv\b/i.test(text);
  if (haystack.includes('filetype=zip') && mentionsCsv) return 'csv-zip';
  if (/\.zip(?:$|[?#])/i.test(href)) return mentionsCsv ? 'csv-zip' : 'zip';
  if (/\b(csv|zip)\b/i.test(text) && href.includes('GetFile')) return 'csv-zip';
  if (/\.(docx|pdf)(?:$|[?#])/i.test(href)) return 'document';
  return 'unknown';
}
 
/**
 * Drop links whose resolved `url` has already been seen, keeping the first
 * occurrence of each URL and preserving the original order.
 *
 * @param links - Candidate links; the input array is not modified.
 * @returns A new array containing one link per distinct `url`.
 */
function deduplicateLinks(
  links: readonly StatskontoretDownloadLink[],
): StatskontoretDownloadLink[] {
  const knownUrls = new Set<string>();
  return links.filter((candidate) => {
    const isFirstOccurrence = !knownUrls.has(candidate.url);
    if (isFirstOccurrence) knownUrls.add(candidate.url);
    return isFirstOccurrence;
  });
}
 
/**
 * Read the page's `<meta name="last-modified" content="…">` value.
 *
 * Only the attribute order `name` then `content` is recognised, with the two
 * attributes separated by whitespace; matching is case-insensitive.
 *
 * @param html - Raw HTML of the page.
 * @returns The HTML-decoded `content` value, or `undefined` when no such
 *   meta tag is found.
 */
function extractPageLastModified(html: string): string | undefined {
  const found = html.match(
    /<meta\s+name=["']last-modified["']\s+content=["']([^"']+)["']/i,
  );
  if (found === null) return undefined;
  const [, rawContent = ''] = found;
  return decodeHtml(rawContent);
}