/**
* @module scripts/statskontoret/extractors/download-links
* @description Extract downloadable Excel/CSV-ZIP/document links from a
* Statskontoret open-data HTML page, with provenance attributes.
*
* Defensive regex-based scraper — no external HTML parser — matches the
* style of the rest of the Statskontoret client. The classifier mirrors
* the firewall allowlist's file-extension allow-list.
*
* @author Hack23 AB
* @license Apache-2.0
*/
import { STATSKONTORET_BASE_URL } from '../source-registry.js';
import type {
StatskontoretDownloadLink,
StatskontoretResourceType,
StatskontoretSourceKey,
} from '../types.js';
import {
decodeHtml,
normalizeWhitespace,
parseStatskontoretOptionalInt,
} from '../internal/text.js';
import { resolveStatskontoretUrl } from '../internal/url-guard.js';
/**
 * Matches a recognised download extension (`xlsx`, `xls`, `csv`, `zip`,
 * `docx`, `pdf`) when it sits at the very end of the string or is directly
 * followed by `?` or `#`. Case-insensitive.
 */
export const FILE_EXTENSION_RE = /\.(xlsx|xls|csv|zip|docx|pdf)(?:$|[?#])/i;
/**
 * Matches an `<a …>…</a>` element whose `href` attribute is quoted.
 * Capture group 1 is the raw `href` value (it may not contain either quote
 * character); capture group 2 is the element's inner markup, matched lazily
 * up to the first closing `</a>`. Global and case-insensitive.
 */
export const HREF_RE = /<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
/**
 * Matches any single `<…>` tag. Global, so it can strip every tag from a
 * fragment in one `replace` call.
 */
export const TAG_RE = /<[^>]+>/g;
/**
 * Extract classified download links from a Statskontoret open-data HTML page.
 *
 * Every anchor matched by `HREF_RE` is HTML-decoded and classified with
 * `classifyStatskontoretResource`. Anchors with an empty `href` or an
 * `'unknown'` classification are skipped. The remaining hrefs are resolved
 * against `baseURL`, and the provenance query parameters found on the
 * resolved URL (`documentType`, `fileType`, `fileName`, `Year`, `month`,
 * `status`) are copied onto the link when present. The page-level
 * `last-modified` meta value, when the page declares one, is attached to
 * every link as `updatedAt`.
 *
 * Errors thrown by `resolveStatskontoretUrl` or by the `URL` constructor are
 * not caught here and propagate to the caller.
 *
 * @param html - Raw HTML of the page to scan.
 * @param source - Source key copied onto each emitted link.
 * @param sourcePage - Page identifier copied onto each emitted link.
 * @param baseURL - Base used to resolve hrefs; defaults to `STATSKONTORET_BASE_URL`.
 * @returns Links in document order, de-duplicated by resolved URL.
 */
export function extractStatskontoretDownloadLinks(
  html: string,
  source: StatskontoretSourceKey,
  sourcePage: string,
  baseURL: string = STATSKONTORET_BASE_URL,
): StatskontoretDownloadLink[] {
  const links: StatskontoretDownloadLink[] = [];
  const pageUpdatedAt = extractPageLastModified(html);

  for (const match of html.matchAll(HREF_RE)) {
    const href = decodeHtml(match[1] ?? '').trim();
    // Inner markup is flattened to plain text: tags become spaces, entities
    // are decoded, then whitespace is collapsed.
    const text = normalizeWhitespace(decodeHtml((match[2] ?? '').replace(TAG_RE, ' ')));
    if (!href) continue;

    const resourceType = classifyStatskontoretResource(href, text);
    if (resourceType === 'unknown') continue;

    const url = resolveStatskontoretUrl(href, baseURL);
    const params = new URL(url).searchParams;

    // Read each provenance parameter once. Missing or empty values are
    // omitted from the link rather than stored as null / ''.
    const documentType = params.get('documentType');
    const fileType = params.get('fileType');
    const fileName = params.get('fileName');
    // The year parameter is looked up with a capital "Y", unlike the others.
    const year = parseStatskontoretOptionalInt(params.get('Year'));
    const month = parseStatskontoretOptionalInt(params.get('month'));
    const status = params.get('status');

    links.push({
      source,
      sourcePage,
      href,
      url,
      text,
      resourceType,
      ...(documentType ? { documentType } : {}),
      ...(fileType ? { fileType } : {}),
      ...(fileName ? { fileName } : {}),
      ...(year !== undefined ? { year } : {}),
      ...(month !== undefined ? { month } : {}),
      ...(status ? { status } : {}),
      ...(pageUpdatedAt ? { updatedAt: pageUpdatedAt } : {}),
    });
  }

  return deduplicateLinks(links);
}
/**
 * Classify a link by its href and visible link text.
 *
 * Rules are evaluated in order and the first match wins:
 *
 * 1. `'excel'` — href or text contains `filetype=excel` (case-insensitive),
 *    the href ends in `.xlsx` (optionally followed by `?`/`#`), or the text
 *    contains the word "excel".
 * 2. `'csv-zip'` — href or text contains `filetype=zip` and the text
 *    contains the word "csv".
 * 3. href ends in `.zip`: `'csv-zip'` when the text mentions "csv",
 *    otherwise `'zip'`.
 * 4. `'csv-zip'` — the text contains the word "csv" or "zip" and the href
 *    contains `GetFile` (case-sensitive).
 * 5. `'document'` — href ends in `.docx` or `.pdf`.
 * 6. `'unknown'` — everything else. This includes `.xls` / `.csv` hrefs and
 *    bare `GetFile` links that matched none of the rules above.
 *
 * @param href - Decoded href attribute value.
 * @param text - Plain-text link label.
 * @returns The resource type for the link.
 */
export function classifyStatskontoretResource(
  href: string,
  text: string,
): StatskontoretResourceType {
  const haystack = `${href} ${text}`.toLowerCase();

  if (
    haystack.includes('filetype=excel') ||
    /\.xlsx(?:$|[?#])/i.test(href) ||
    /\bexcel\b/i.test(text)
  ) {
    return 'excel';
  }

  if (haystack.includes('filetype=zip') && /\bcsv\b/i.test(text)) return 'csv-zip';
  if (/\.zip(?:$|[?#])/i.test(href)) return /\bcsv\b/i.test(text) ? 'csv-zip' : 'zip';
  if (/\b(csv|zip)\b/i.test(text) && href.includes('GetFile')) return 'csv-zip';
  if (/\.(docx|pdf)(?:$|[?#])/i.test(href)) return 'document';

  // Anything not recognised above is 'unknown'; callers may choose to skip it.
  return 'unknown';
}
/**
 * Drop links whose resolved `url` has already been seen.
 *
 * The first occurrence of each URL is kept and input order is preserved.
 * The input array is not modified; a new array is returned.
 *
 * @param links - Links to filter.
 * @returns A new array with one link per distinct `url`.
 */
function deduplicateLinks(
  links: readonly StatskontoretDownloadLink[],
): StatskontoretDownloadLink[] {
  const knownUrls = new Set<string>();
  return links.filter((candidate) => {
    if (knownUrls.has(candidate.url)) {
      return false;
    }
    knownUrls.add(candidate.url);
    return true;
  });
}
/**
 * Read the value of the page's `<meta name="last-modified" content="…">` tag.
 *
 * Only the first tag is considered. The `name` attribute must come
 * immediately after `<meta` and be followed, after whitespace only, by a
 * non-empty quoted `content` attribute. The value is HTML-decoded.
 *
 * @param html - Raw page HTML.
 * @returns The decoded `content` value, or `undefined` when no such tag exists.
 */
function extractPageLastModified(html: string): string | undefined {
  const lastModifiedMeta = /<meta\s+name=["']last-modified["']\s+content=["']([^"']+)["']/i;
  const found = html.match(lastModifiedMeta);
  if (!found) {
    return undefined;
  }
  return decodeHtml(found[1] ?? '');
}
|