extractors download-links.ts

90.69% Statements 39/43
75% Branches 45/60
100% Functions 4/4
100% Lines 35/35
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4x
4x
4x
 
 
 
 
 
 
 
4x
4x
4x
7x
7x
7x
7x
7x
7x
7x
7x
7x
7x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4x
 
 
 
 
 
 
11x
11x
 
 
 
 
7x
 
4x
2x
2x
2x
1x
1x
 
 
 
 
 
4x
4x
4x
7x
6x
6x
 
4x
 
 
 
4x
4x
 
  /**
 * @module scripts/statskontoret/extractors/download-links
 * @description Extract downloadable Excel/CSV-ZIP/document links from a
 * Statskontoret open-data HTML page, with provenance attributes.
 *
 * Defensive regex-based scraper — no external HTML parser — matches the
 * style of the rest of the Statskontoret client. The classifier mirrors
 * the firewall allowlist's file-extension allow-list.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { STATSKONTORET_BASE_URL } from '../source-registry.js';
import type {
  StatskontoretDownloadLink,
  StatskontoretResourceType,
  StatskontoretSourceKey,
} from '../types.js';
import {
  decodeHtml,
  normalizeWhitespace,
  parseStatskontoretOptionalInt,
} from '../internal/text.js';
import { resolveStatskontoretUrl } from '../internal/url-guard.js';
 
/**
 * Matches a recognised download file extension (`xlsx`, `xls`, `csv`, `zip`,
 * `docx`, `pdf`) that sits at the very end of the string or directly before a
 * `?` query or `#` fragment. Case-insensitive; capture group 1 is the
 * extension without the dot.
 */
export const FILE_EXTENSION_RE = /\.(xlsx|xls|csv|zip|docx|pdf)(?:$|[?#])/i;
/**
 * Matches an `<a …href="…">…</a>` element. Capture group 1 is the quoted
 * `href` value (single or double quotes), capture group 2 is the raw inner
 * markup up to the first closing `</a>` (lazy, may span newlines).
 *
 * Carries the `g` flag, so calling `exec`/`test` on it directly advances
 * `lastIndex`; `String.prototype.matchAll` works on an internal copy.
 */
export const HREF_RE = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
/** Matches any `<…>` tag; global, intended for use with `String.prototype.replace`. */
export const TAG_RE = /<[^>]+>/g;
 
/**
 * Collect every classifiable download link found in an HTML page.
 *
 * Each `<a href>` matched by `HREF_RE` is HTML-decoded, its inner markup is
 * stripped to plain text, and the pair is classified with
 * `classifyStatskontoretResource`. Anchors with an empty `href` or an
 * `'unknown'` classification are skipped. Surviving links are resolved with
 * `resolveStatskontoretUrl` and enriched with provenance read from the
 * resolved URL's query string, then de-duplicated by resolved URL.
 *
 * Optional attributes (`documentType`, `fileType`, `fileName`, `status`) are
 * only set when the matching query parameter is present and non-empty;
 * `year` / `month` are only set when `parseStatskontoretOptionalInt` yields a
 * value for the `Year` / `month` parameters; `updatedAt` is only set when the
 * page carries a `last-modified` meta tag.
 *
 * @param html - Raw HTML of the page to scan.
 * @param source - Source key copied onto every returned link.
 * @param sourcePage - Page identifier copied onto every returned link.
 * @param baseURL - Base passed to `resolveStatskontoretUrl`; defaults to
 *   `STATSKONTORET_BASE_URL`.
 * @returns Links in document order, first occurrence of each resolved URL only.
 * @throws Presumably when `resolveStatskontoretUrl` rejects an href, and when
 *   its result is not parseable by `URL` — the helper is not visible here, so
 *   confirm against its implementation.
 */
export function extractStatskontoretDownloadLinks(
  html: string,
  source: StatskontoretSourceKey,
  sourcePage: string,
  baseURL: string = STATSKONTORET_BASE_URL,
): StatskontoretDownloadLink[] {
  const links: StatskontoretDownloadLink[] = [];
  const pageUpdatedAt = extractPageLastModified(html);
  for (const match of html.matchAll(HREF_RE)) {
    const href = decodeHtml(match[1] ?? '').trim();
    // Replace tags with a space (not '') so adjacent inline elements do not fuse words.
    const text = normalizeWhitespace(decodeHtml((match[2] ?? '').replace(TAG_RE, ' ')));
    if (!href) continue;
    const resourceType = classifyStatskontoretResource(href, text);
    if (resourceType === 'unknown') continue;
    const url = resolveStatskontoretUrl(href, baseURL);
    // Read each query parameter once; `get` returns null when it is absent.
    const params = new URL(url).searchParams;
    const year = parseStatskontoretOptionalInt(params.get('Year'));
    const month = parseStatskontoretOptionalInt(params.get('month'));
    const documentType = params.get('documentType');
    const fileType = params.get('fileType');
    const fileName = params.get('fileName');
    const status = params.get('status');
    links.push({
      source,
      sourcePage,
      href,
      url,
      text,
      resourceType,
      // Conditional spreads keep absent attributes off the object entirely
      // instead of storing explicit `undefined` values.
      ...(documentType ? { documentType } : {}),
      ...(fileType ? { fileType } : {}),
      ...(fileName ? { fileName } : {}),
      ...(year !== undefined ? { year } : {}),
      ...(month !== undefined ? { month } : {}),
      ...(status ? { status } : {}),
      ...(pageUpdatedAt ? { updatedAt: pageUpdatedAt } : {}),
    });
  }
  return deduplicateLinks(links);
}
 
/**
 * Classify a link by its `href` and visible link text.
 *
 * Rules are evaluated in order; the first match wins:
 * 1. `'excel'` — `filetype=excel` appears in the href or text (any case), the
 *    href ends in `.xlsx` (optionally followed by `?`/`#`), or the text
 *    contains the word "excel".
 * 2. `'csv-zip'` — `filetype=zip` appears in the href or text and the text
 *    contains the word "csv".
 * 3. `'csv-zip'` / `'zip'` — the href ends in `.zip`; `'csv-zip'` when the
 *    text contains the word "csv", otherwise `'zip'`.
 * 4. `'csv-zip'` — the text contains the word "csv" or "zip" and the href
 *    contains `GetFile` (case-sensitive).
 * 5. `'document'` — the href ends in `.docx` or `.pdf`.
 * 6. `'unknown'` — everything else, including `.xls` / `.csv` hrefs and
 *    `GetFile` links that carry no recognisable type hint.
 *
 * @param href - Decoded `href` attribute value.
 * @param text - Plain link text.
 * @returns The resource type, or `'unknown'` when no rule matches.
 */
export function classifyStatskontoretResource(
  href: string,
  text: string,
): StatskontoretResourceType {
  const haystack = `${href} ${text}`.toLowerCase();
  if (
    haystack.includes('filetype=excel') ||
    /\.xlsx(?:$|[?#])/i.test(href) ||
    /\bexcel\b/i.test(text)
  ) {
    return 'excel';
  }
  const mentionsCsv = /\bcsv\b/i.test(text);
  if (haystack.includes('filetype=zip') && mentionsCsv) return 'csv-zip';
  if (/\.zip(?:$|[?#])/i.test(href)) return mentionsCsv ? 'csv-zip' : 'zip';
  if (/\b(csv|zip)\b/i.test(text) && href.includes('GetFile')) return 'csv-zip';
  if (/\.(docx|pdf)(?:$|[?#])/i.test(href)) return 'document';
  // No separate branch for "looks like a file but matched no class": it would
  // return the same value as this fall-through.
  return 'unknown';
}
 
/**
 * Drop links whose resolved `url` has already been seen.
 *
 * The first link for each URL is kept and the input order is preserved; the
 * input array itself is not modified.
 *
 * @param links - Links in discovery order.
 * @returns A new array holding one link per distinct `url`.
 */
function deduplicateLinks(
  links: readonly StatskontoretDownloadLink[],
): StatskontoretDownloadLink[] {
  const knownUrls = new Set<string>();
  return links.filter((candidate) => {
    if (knownUrls.has(candidate.url)) return false;
    knownUrls.add(candidate.url);
    return true;
  });
}
 
/**
 * Read the page's `<meta name="last-modified" content="…">` value.
 *
 * The pattern is case-insensitive and requires `name` to come directly
 * before `content`, separated only by whitespace.
 *
 * @param html - Raw HTML of the page.
 * @returns The HTML-decoded `content` value, or `undefined` when no such meta
 *   tag is found.
 */
function extractPageLastModified(html: string): string | undefined {
  const lastModifiedMeta = /<meta\s+name=["']last-modified["']\s+content=["']([^"']+)["']/i;
  const found = lastModifiedMeta.exec(html);
  if (found === null) return undefined;
  return decodeHtml(found[1] ?? '');
}