generate-news-enhanced url-utils.ts

0% Statements 0/46
0% Branches 0/32
0% Functions 0/7
0% Lines 0/44
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  /**
 * @module generate-news-enhanced/url-utils
 * @description URL parsing and text sanitization utilities for the
 * deep-inspection article generator. Provides Riksdag/government URL
 * extraction, GitHub raw URL conversion, and XSS-safe text cleaning.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
/**
 * Pull the `dok_id` out of a Riksdag document URL.
 *
 * Two URL shapes are recognised:
 * - `https://riksdagen.se/sv/dokument-och-lagar/dokument/{type}/{dok_id}`
 *   (with or without a `www.` prefix) — the id is the second path segment
 *   after `dokument`.
 * - `https://data.riksdagen.se/dokument/{dok_id}[.json|.xml|.html|.pdf]`
 *   — the id is the path segment directly after `dokument`, with a trailing
 *   known file extension removed (case-insensitive).
 *
 * @param url - Absolute URL to inspect.
 * @returns The extracted `dok_id`, or `null` when the URL cannot be parsed,
 *   belongs to a different host, or lacks the expected path segments.
 */
export function extractDocIdFromUrl(url: string): string | null {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    // Not an absolute, parseable URL.
    return null;
  }

  const host = target.hostname.toLowerCase();
  // Empty segments (leading, trailing or doubled slashes) are discarded.
  const parts = target.pathname.split('/').filter(Boolean);

  // Both supported layouts are anchored on a literal `dokument` segment.
  const anchor = parts.indexOf('dokument');
  if (anchor < 0) {
    return null;
  }

  switch (host) {
    case 'riksdagen.se':
    case 'www.riksdagen.se': {
      // .../dokument/{type}/{dok_id}
      const docId = parts[anchor + 2];
      return docId === undefined ? null : docId;
    }
    case 'data.riksdagen.se': {
      // .../dokument/{dok_id}[.ext]
      const fileName = parts[anchor + 1];
      return fileName === undefined
        ? null
        : fileName.replace(/\.(json|xml|html|pdf)$/i, '');
    }
    default:
      return null;
  }
}
 
/**
 * Check whether a URL is hosted on the Swedish government site
 * (`regeringen.se` or `www.regeringen.se`), i.e. a resource intended to be
 * fetched through the `get_g0v_document_content` MCP tool.
 *
 * @param url - Absolute URL to test.
 * @returns `true` only for the two accepted hostnames; `false` for any other
 *   host or when the input is not a parseable absolute URL.
 */
export function isGovernmentUrl(url: string): boolean {
  let host: string;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    // Unparseable input is never a government URL.
    return false;
  }
  return ['regeringen.se', 'www.regeringen.se'].includes(host);
}
 
/**
 * Check whether a URL is hosted on GitHub — `github.com`, `www.github.com`
 * or `raw.githubusercontent.com` — and is therefore a candidate for being
 * fetched as raw content.
 *
 * @param url - Absolute URL to test.
 * @returns `true` for the three accepted hostnames; `false` for any other
 *   host or when the input is not a parseable absolute URL.
 */
export function isGitHubUrl(url: string): boolean {
  let host: string;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    // Unparseable input is never a GitHub URL.
    return false;
  }

  switch (host) {
    case 'github.com':
    case 'www.github.com':
    case 'raw.githubusercontent.com':
      return true;
    default:
      return false;
  }
}
 
/**
 * Translate a GitHub web URL into its `raw.githubusercontent.com` form.
 *
 * Accepted inputs:
 *   - `https://github.com/{owner}/{repo}/blob/{branch}/{path}`
 *   - `https://github.com/{owner}/{repo}/tree/{branch}/{path}`
 *   - `https://github.com/{owner}/{repo}/raw/{branch}/{path}`
 *   - `https://raw.githubusercontent.com/...` — handed back unchanged
 *
 * The `www.github.com` host is treated like `github.com`. Only the path is
 * carried over for converted URLs: any query string or fragment on the
 * input is not part of the result.
 *
 * @param url - Absolute URL to convert.
 * @returns The raw-content URL, or `null` when the input is unparseable, is
 *   on another host, has fewer than four path segments, or its third segment
 *   is not `blob`, `tree` or `raw`.
 */
export function toGitHubRawUrl(url: string): string | null {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    return null;
  }

  const host = target.hostname.toLowerCase();

  // Raw URLs need no conversion; the caller's original string is returned.
  if (host === 'raw.githubusercontent.com') {
    return url;
  }

  const onGitHub = host === 'github.com' || host === 'www.github.com';
  if (!onGitHub) {
    return null;
  }

  // Expected layout: /{owner}/{repo}/{blob|tree|raw}/{branch}/{...path}
  const parts = target.pathname.split('/').filter(Boolean);
  if (parts.length < 4) {
    return null;
  }

  const owner = parts[0];
  const repo = parts[1];
  const viewKind = parts[2];
  if (!['blob', 'raw', 'tree'].includes(viewKind)) {
    return null;
  }

  // Everything after the view kind (branch followed by the file path) is
  // appended verbatim to the raw host.
  const refAndPath = parts.slice(3).join('/');
  return `https://raw.githubusercontent.com/${owner}/${repo}/${refAndPath}`;
}
 
/**
 * Derive a short, deterministic suffix from a URL path string, used when
 * building `dok_id` values for documents fetched from government or GitHub
 * URLs.
 *
 * The hash is a rolling multiply-by-31-and-add over the string's UTF-16 code
 * units (`(h << 5) - h` equals `h * 31`), truncated to a signed 32-bit
 * integer after every step. The result is rendered in base 36; a leading
 * minus sign (negative hash) is replaced by the letter `n` so the suffix
 * contains only `[0-9a-z]`.
 *
 * @param path - Text to hash; the empty string yields `'0'`.
 * @returns The base-36 suffix.
 */
export function hashPathSuffix(path: string): string {
  let hash = 0;
  for (let i = 0; i < path.length; i += 1) {
    // `| 0` wraps the running value back into the int32 range.
    hash = ((hash << 5) - hash + path.charCodeAt(i)) | 0;
  }

  const rendered = hash.toString(36);
  return rendered.replace(/^-/, 'n');
}
 
/**
 * Remove HTML-like tags from user-supplied text as an XSS precaution.
 *
 * Every `<...>` span is deleted, and the deletion is repeated until a pass
 * leaves the string unchanged, so fragments that only look like a tag after
 * an inner tag is removed (e.g. `<scr<script>ipt>`) cannot survive. A `<`
 * with no later `>` is not a tag and is kept as-is.
 *
 * The result is **plain text**, not HTML-escaped: callers are expected to
 * apply `escapeHtml()` at their render sites so escaping happens exactly once.
 *
 * @param text - Raw input text.
 * @returns The text with all tag-shaped spans removed.
 */
export function sanitizePlainText(text: string): string {
  let current = text;
  for (;;) {
    const stripped = current.replace(/<[^>]*>/g, '');
    if (stripped === current) {
      // Fixed point reached: nothing tag-shaped is left.
      return stripped;
    }
    current = stripped;
  }
}