All files / scripts/data-transformers text-cleaner.ts

100% Statements 16/16
100% Branches 9/9
100% Functions 2/2
100% Lines 14/14

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101                                                                        25x 22x     22x           22x 22x 3x         22x                   22x                       22x           22x   22x                 10x 7x 7x            
/**
 * @module data-transformers/text-cleaner
 * @description Prose-hygiene filter for summary text passing through the
 *              article-generation pipeline. Strips the residual extraction
 *              artifacts that the existing {@link stripRiksdagRawDump} pass
 *              does not catch, namely:
 *
 *                - `&nbsp;` entity noise
 *                - `#page_\d+` and `#id_\d+` inline anchors
 *                - Repeated "Proposition Proposition …" stutters (and the
 *                  generic same-word-3-plus-times stutter pattern)
 *                - Leading numeric `<dok-id> HD<...> YYYY/YY NNN (prop|mot|bet) …`
 *                  metadata prefixes that survived upstream stripping
 *                - CSS rule fragments (belt-and-braces on top of
 *                  {@link stripRiksdagRawDump})
 *
 *              This is intentionally a narrow, easily auditable module. For
 *              broader document-text extraction, use
 *              {@link stripRiksdagRawDump} in `./helpers.ts` — this file is
 *              specifically the *final* hygiene pass applied to summary
 *              strings just before they are embedded in article HTML.
 *
 * @see analysis/agentic-workflow-quality-plan §P0-4
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
/**
 * Clean a summary/notis string for display in article HTML.
 *
 * Idempotent: running this twice is equivalent to running it once — every
 * pass either removes its target pattern entirely or collapses it to a form
 * the same pass no longer matches.
 *
 * @param text - Raw or partially-cleaned summary text (nullish input yields `''`)
 * @returns Cleaned text safe to embed (after HTML-escaping) in article output
 */
export function cleanSummaryForDisplay(text: string | null | undefined): string {
  if (!text) return '';
  let s = String(text);

  // 1. Decode common HTML entities so downstream regexes see real whitespace.
  s = s.replace(/&nbsp;/gi, ' ');

  // 2. Strip Riksdag dok-id / metadata prefix that may still precede real prose.
  //    Shape: `5287561 HD03242 2025/26 242 prop prop prop Proposition 2025/26:242`
  //    We strip up to — but not including — the human-readable token that
  //    follows the metadata cluster. `replace` is a no-op when the pattern does
  //    not match, so no `test()` guard is needed (the previous guard only
  //    doubled the regex work on matching input).
  const DOK_PREFIX = /^\s*\d{6,}\s+HD\S+\s+\d{4}\/\d{2}\s+\d+\s+(?:[a-zäöå]{2,4}\s+){1,10}/i;
  s = s.replace(DOK_PREFIX, '');

  // 3. Remove inline `#page_\d+` and `#id_\d+` anchors that the PDF→text
  //    extractor leaves embedded in prose.
  s = s.replace(/#(?:page|id)_\d+\b/gi, ' ');

  // 4. Strip CSS rule fragments on a best-effort basis. Bounded quantifier
  //    guarantees linear-time matching. Delegates the heavy CSS work to
  //    {@link stripRiksdagRawDump}, but catches residual single rules like
  //    `.p436{text-align:center}` that leak into summary fields directly.
  //    The `\s*` between selector and `{` handles both the compact shape
  //    (`.p436{…}`) produced by Riksdag's HTML-to-text extractor and the
  //    expanded shape (`.page { margin: 0; }`) occasionally present when the
  //    upstream HTML was pretty-printed.
  s = s.replace(/\.[a-z_][a-z0-9_-]{0,80}\s*\{[^{}]{0,400}\}/gi, ' ');

  // 5. Collapse same-word stutters repeated ≥ 3 times ("Proposition Proposition
  //    Proposition Utr…" → "Proposition Utr…"). Guards against replacing
  //    legitimate prose — we only collapse when the word repeats back-to-back
  //    three or more times. The token class uses Unicode property escapes:
  //      - `\p{L}` matches any Unicode letter (covers Swedish å/ä/ö and
  //        Latin-script accents)
  //      - `\p{M}` matches combining marks (accents that live in separate
  //        code points from their base letter, e.g. NFD-normalised input)
  //    Punctuation and digits are excluded so the match never crosses a
  //    sentence boundary or hyphenated number. Note the `i` flag makes the
  //    `\1` backreference case-insensitive, so mixed-case stutters collapse
  //    to their first occurrence.
  s = s.replace(
    /\b([\p{L}][\p{L}\p{M}]{1,40})(?:\s+\1\b){2,}/giu,
    '$1'
  );

  // 6. Collapse whitespace and trim.
  s = s.replace(/\s+/g, ' ').trim();

  return s;
}
 
/**
 * Convenience predicate: does the text still look like a raw Riksdag dump
 * after cleaning? Callers can use this to decide whether to fall back to a
 * metadata-generated summary instead of emitting the residue.
 *
 * @param text - Candidate summary text (nullish input yields `false`)
 * @returns `true` when any raw-dump fingerprint is still present
 */
export function looksLikeRawDump(text: string | null | undefined): boolean {
  if (!text) return false;
  const candidate = String(text);

  // Each pattern mirrors one removal pass in {@link cleanSummaryForDisplay}.
  const rawDumpSignals: readonly RegExp[] = [
    /^\s*\d{6,}\s+HD\S+\s+\d{4}\/\d{2}\s/,          // dok-id metadata prefix
    /\.[a-z_][a-z0-9_-]{0,80}\s*\{[^{}]{0,400}\}/i, // residual CSS rule fragment
    /#(?:page|id)_\d+\b/i,                          // inline page/id anchor
  ];

  return rawDumpSignals.some((signal) => signal.test(candidate));
}