/**
* @module generate-news-enhanced/url-utils
* @description URL parsing and text sanitization utilities for the
* deep-inspection article generator. Provides Riksdag/government URL
* extraction, GitHub raw URL conversion, and XSS-safe text cleaning.
*
* @author Hack23 AB
* @license Apache-2.0
*/
/**
 * Extract a `dok_id` from a Riksdag or data.riksdagen.se document URL.
 *
 * Supported patterns:
 * - `https://riksdagen.se/sv/dokument-och-lagar/dokument/{type}/{dok_id}`
 *   (also with the `www.` host prefix)
 * - `https://data.riksdagen.se/dokument/{dok_id}[.json|.xml|.html|.pdf]`
 *
 * @param url - Absolute URL to inspect.
 * @returns The extracted `dok_id`, or `null` when the input is not a
 *   parseable URL or is not a recognised Riksdag document URL.
 */
export function extractDocIdFromUrl(url: string): string | null {
  try {
    const parsed = new URL(url);
    const hostname = parsed.hostname.toLowerCase();
    const segments = parsed.pathname.split('/').filter(Boolean);

    // Both supported layouts are anchored on the first literal `dokument`
    // path segment; without it nothing can match.
    const dokIdx = segments.indexOf('dokument');
    if (dokIdx < 0) {
      return null;
    }

    // https://riksdagen.se/sv/dokument-och-lagar/dokument/{type}/{dok_id}
    if (hostname === 'riksdagen.se' || hostname === 'www.riksdagen.se') {
      // The id sits two segments after `dokument` (the {type} segment is skipped).
      if (segments.length > dokIdx + 2) {
        return segments[dokIdx + 2];
      }
    }

    // https://data.riksdagen.se/dokument/{dok_id}[.json|.xml|.html|.pdf]
    if (hostname === 'data.riksdagen.se') {
      if (segments.length > dokIdx + 1) {
        // Strip a known file extension so only the bare id remains.
        return segments[dokIdx + 1].replace(/\.(json|xml|html|pdf)$/i, '');
      }
    }

    return null;
  } catch {
    // `new URL()` throws on malformed input; treat that as "not recognised".
    return null;
  }
}
/**
 * Report whether a URL's host is the Swedish government site
 * (`regeringen.se`, with or without the `www.` prefix), i.e. a resource
 * that can be fetched via the get_g0v_document_content MCP tool.
 *
 * @param url - Absolute URL to inspect.
 * @returns `true` for a regeringen.se host; `false` otherwise, including
 *   when the input cannot be parsed as a URL.
 */
export function isGovernmentUrl(url: string): boolean {
  let host: string;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    // Unparseable input is simply "not a government URL".
    return false;
  }
  const governmentHosts = ['regeringen.se', 'www.regeringen.se'];
  return governmentHosts.includes(host);
}
/**
 * Report whether a URL's host is a GitHub repository host
 * (`github.com`, `www.github.com`, or `raw.githubusercontent.com`) whose
 * content can be fetched as raw content.
 *
 * @param url - Absolute URL to inspect.
 * @returns `true` for one of the GitHub hosts above; `false` otherwise,
 *   including when the input cannot be parsed as a URL.
 */
export function isGitHubUrl(url: string): boolean {
  let host: string;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    // Unparseable input is simply "not a GitHub URL".
    return false;
  }
  switch (host) {
    case 'github.com':
    case 'www.github.com':
    case 'raw.githubusercontent.com':
      return true;
    default:
      return false;
  }
}
/**
 * Convert a GitHub blob/tree/raw URL to a raw.githubusercontent.com URL.
 *
 * Handles patterns like:
 * - https://github.com/{owner}/{repo}/blob/{branch}/{path}
 * - https://github.com/{owner}/{repo}/tree/{branch}/{path}
 * - https://github.com/{owner}/{repo}/raw/{branch}/{path}
 * - https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path} (returned as-is)
 *
 * Only the path is carried over when converting a github.com URL; any
 * query string or fragment on the input is dropped.
 *
 * @param url - Absolute URL to convert.
 * @returns The raw URL, or `null` if the URL cannot be parsed, is not on a
 *   GitHub host, or does not follow one of the patterns above.
 */
export function toGitHubRawUrl(url: string): string | null {
  try {
    const parsed = new URL(url);
    const hostname = parsed.hostname.toLowerCase();

    // Already a raw URL — return the caller's string untouched.
    if (hostname === 'raw.githubusercontent.com') {
      return url;
    }
    if (hostname !== 'github.com' && hostname !== 'www.github.com') {
      return null;
    }

    // Path: /{owner}/{repo}/{blob|tree|raw}/{branch}/{...path}
    // At least owner, repo, ref type and branch must be present.
    const segments = parsed.pathname.split('/').filter(Boolean);
    if (segments.length < 4) {
      return null;
    }

    const [owner, repo, refType, ...rest] = segments;
    if (refType !== 'blob' && refType !== 'raw' && refType !== 'tree') {
      return null;
    }

    // rest = [branch, ...pathParts]; the raw host uses the same order
    // without the ref-type segment.
    return `https://raw.githubusercontent.com/${owner}/${repo}/${rest.join('/')}`;
  } catch {
    // `new URL()` throws on malformed input; treat that as "cannot convert".
    return null;
  }
}
/**
 * Compute a short, deterministic hash suffix from a URL path string.
 * Used to build `dok_id` values for documents fetched from government or
 * GitHub URLs.
 *
 * The hash folds each UTF-16 code unit into a 32-bit signed accumulator as
 * `hash * 31 + code` (written as `(hash << 5) - hash + code`), starting
 * from 0. The result is rendered in base-36, and a leading `-` (from a
 * negative accumulator) is replaced with `n`.
 *
 * @param path - The string to hash; an empty string yields `"0"`.
 * @returns The base-36 hash text, never starting with `-`.
 */
export function hashPathSuffix(path: string): string {
  let hash = 0;
  for (let i = 0; i < path.length; i += 1) {
    const code = path.charCodeAt(i);
    // `| 0` truncates to a signed 32-bit integer after every step.
    hash = ((hash << 5) - hash + code) | 0;
  }
  const rendered = hash.toString(36);
  return rendered.replace(/^-/, 'n');
}
/**
 * Remove HTML tags from a user-supplied string to prevent XSS.
 *
 * Every `<...>` span is deleted, and the stripping is repeated until a pass
 * leaves the text unchanged, so tags reassembled from fragments
 * (e.g. `<scr<script>ipt>`) do not survive. The result is **plain text** —
 * callers must apply `escapeHtml()` at their render sites so escaping
 * happens exactly once.
 *
 * @param text - Untrusted input text.
 * @returns The text with all `<...>` spans removed. A `<` with no later
 *   `>` is left in place.
 */
export function sanitizePlainText(text: string): string {
  const tagPattern = /<[^>]*>/g;
  let current = text;
  for (;;) {
    const stripped = current.replace(tagPattern, '');
    if (stripped === current) {
      // Fixed point reached: nothing left to strip.
      return stripped;
    }
    current = stripped;
  }
}
|