All files / scripts/data-transformers text-cleaner.ts

100% Statements 16/16
100% Branches 9/9
100% Functions 2/2
100% Lines 14/14

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101                                                                        25x 22x     22x           22x 22x 3x         22x                   22x                       22x           22x   22x                 10x 7x 7x            
/**
 * @module data-transformers/text-cleaner
 * @description Prose-hygiene filter for summary text passing through the
 *              article-generation pipeline. Strips the residual extraction
 *              artifacts that the existing {@link stripRiksdagRawDump} pass
 *              does not catch, namely:
 *
 *                - `&nbsp;` entity noise
 *                - `#page_\d+` and `#id_\d+` inline anchors
 *                - Repeated "Proposition Proposition …" stutters (and the
 *                  generic same-word-3-plus-times stutter pattern)
 *                - Leading numeric `<dok-id> HD<...> YYYY/YY NNN (prop|mot|bet) …`
 *                  metadata prefixes that survived upstream stripping
 *                - CSS rule fragments (belt-and-braces on top of
 *                  {@link stripRiksdagRawDump})
 *
 *              This is intentionally a narrow, easily auditable module. For
 *              broader document-text extraction, use
 *              {@link stripRiksdagRawDump} in `./helpers.ts` — this file is
 *              specifically the *final* hygiene pass applied to summary
 *              strings just before they are embedded in article HTML.
 *
 * @see analysis/agentic-workflow-quality-plan §P0-4
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
/**
 * Clean a summary/notis string for display in article HTML.
 *
 * Idempotent: running this twice is equivalent to running it once — every
 * pass either removes its target pattern entirely or collapses it to a form
 * the same pass no longer matches.
 *
 * @param text - Raw or partially-cleaned summary text (nullish input yields `''`)
 * @returns Cleaned text safe to embed (after HTML-escaping) in article output
 */
export function cleanSummaryForDisplay(text: string | null | undefined): string {
  if (!text) return '';
  let s = String(text);

  // 1. Decode common HTML entities so downstream regexes see real whitespace.
  s = s.replace(/&nbsp;/gi, ' ');

  // 2. Strip Riksdag dok-id / metadata prefix that may still precede real prose.
  //    Shape: `5287561 HD03242 2025/26 242 prop prop prop Proposition 2025/26:242`
  //    We strip up to — but not including — the human-readable token that
  //    follows the metadata cluster. `replace` is a no-op when the pattern does
  //    not match, so no `test()` guard is needed (the previous guard only
  //    doubled the regex work on matching input).
  const DOK_PREFIX = /^\s*\d{6,}\s+HD\S+\s+\d{4}\/\d{2}\s+\d+\s+(?:[a-zäöå]{2,4}\s+){1,10}/i;
  s = s.replace(DOK_PREFIX, '');

  // 3. Remove inline `#page_\d+` and `#id_\d+` anchors that the PDF→text
  //    extractor leaves embedded in prose.
  s = s.replace(/#(?:page|id)_\d+\b/gi, ' ');

  // 4. Strip CSS rule fragments on a best-effort basis. Bounded quantifier
  //    guarantees linear-time matching. Delegates the heavy CSS work to
  //    {@link stripRiksdagRawDump}, but catches residual single rules like
  //    `.p436{text-align:center}` that leak into summary fields directly.
  //    The `\s*` between selector and `{` handles both the compact shape
  //    (`.p436{…}`) produced by Riksdag's HTML-to-text extractor and the
  //    expanded shape (`.page { margin: 0; }`) occasionally present when the
  //    upstream HTML was pretty-printed.
  s = s.replace(/\.[a-z_][a-z0-9_-]{0,80}\s*\{[^{}]{0,400}\}/gi, ' ');

  // 5. Collapse same-word stutters repeated ≥ 3 times ("Proposition Proposition
  //    Proposition Utr…" → "Proposition Utr…"). Guards against replacing
  //    legitimate prose — we only collapse when the word repeats back-to-back
  //    three or more times. The token class uses Unicode property escapes:
  //      - `\p{L}` matches any Unicode letter (covers Swedish å/ä/ö and
  //        Latin-script accents)
  //      - `\p{M}` matches combining marks (accents that live in separate
  //        code points from their base letter, e.g. NFD-normalised input)
  //    Punctuation and digits are excluded so the match never crosses a
  //    sentence boundary or hyphenated number. Note the `i` flag makes the
  //    `\1` backreference case-insensitive, so mixed-case stutters collapse
  //    to their first occurrence.
  s = s.replace(
    /\b([\p{L}][\p{L}\p{M}]{1,40})(?:\s+\1\b){2,}/giu,
    '$1'
  );

  // 6. Collapse whitespace and trim.
  s = s.replace(/\s+/g, ' ').trim();

  return s;
}
 
/**
 * Convenience predicate: does the text still look like a raw Riksdag dump
 * after cleaning? Callers can use this to decide whether to fall back to a
 * metadata-generated summary instead of emitting the residue.
 *
 * @param text - Candidate summary text (nullish input yields `false`)
 * @returns `true` when any raw-dump fingerprint is still present
 */
export function looksLikeRawDump(text: string | null | undefined): boolean {
  if (!text) return false;
  const candidate = String(text);

  // Each pattern mirrors one removal pass in {@link cleanSummaryForDisplay}.
  const rawDumpSignals: readonly RegExp[] = [
    /^\s*\d{6,}\s+HD\S+\s+\d{4}\/\d{2}\s/,          // dok-id metadata prefix
    /\.[a-z_][a-z0-9_-]{0,80}\s*\{[^{}]{0,400}\}/i, // residual CSS rule fragment
    /#(?:page|id)_\d+\b/i,                          // inline page/id anchor
  ];

  return rawDumpSignals.some((signal) => signal.test(candidate));
}