All files / scripts html-utils.ts

91.3% Statements 21/23
75% Branches 6/8
100% Functions 6/6
90.47% Lines 19/21

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95                  42x               42x                   61454x 61363x   84014x         42x                                                     175568x   169170x     169170x 320x 320x 320x             169170x 46x 46x 46x             169170x   871x     169170x    
/**
 * @module Infrastructure/HTMLSanitization
 * @description XSS-safe HTML entity escaping and decoding utilities.
 * Bounded context: Infrastructure / Security
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
/** Replacement entity for each character that is significant in HTML markup or attribute values. */
const HTML_ENTITY_MAP: Readonly<Record<string, string>> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#039;',
} as const;

/** Matches every character that has an entry in HTML_ENTITY_MAP. */
const HTML_ESCAPE_PATTERN = /[&<>"']/g;

/**
 * Escape HTML special characters for safe inclusion in HTML/JSON-LD.
 * Prevents XSS by converting &, <, >, ", ' to their HTML entity equivalents.
 *
 * Numbers are stringified before escaping, so `0` yields `'0'`.
 *
 * @param text - Raw text (or number) to escape
 * @returns Escaped text safe for HTML insertion; empty string for `null`,
 *   `undefined`, `''` and `NaN`
 */
export function escapeHtml(text: string | null | undefined | number): string {
  // A plain truthiness check would also discard the number 0, which is a
  // legitimate value to render; every other falsy input maps to ''.
  if (!text && text !== 0) return '';
  return String(text).replace(
    HTML_ESCAPE_PATTERN,
    // The fallback is unreachable for the pattern above but keeps the lookup total.
    (m: string): string => HTML_ENTITY_MAP[m] ?? m,
  );
}
 
/** Map of named HTML entities to their UTF-8 characters. */
const NAMED_ENTITY_MAP: Readonly<Record<string, string>> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&apos;': "'",
  '&nbsp;': '\u00a0',
  '&mdash;': '—',
  '&ndash;': '–',
  '&lsquo;': '\u2018',
  '&rsquo;': '\u2019',
  '&ldquo;': '\u201c',
  '&rdquo;': '\u201d',
  '&bull;': '•',
} as const;

/**
 * Matches a single entity of any supported form:
 * decimal (`&#228;`, capture group 1), hex (`&#x00E4;`, capture group 2),
 * or a named reference (`&amp;`, no capture group).
 */
const HTML_ENTITY_PATTERN = /&(?:#(\d+)|#x([0-9a-fA-F]+)|[a-zA-Z][a-zA-Z0-9]*);/g;

/**
 * Convert a numeric code point to its character.
 *
 * @param codePoint - Parsed code point value
 * @param fallback - Text returned when the code point is out of range
 * @returns The character, or `fallback` if `String.fromCodePoint` rejects the value
 */
function codePointToString(codePoint: number, fallback: string): string {
  try {
    return String.fromCodePoint(codePoint);
  } catch {
    return fallback; // Keep invalid entities as-is
  }
}

/**
 * Decode HTML numeric and named entities to their UTF-8 characters.
 * Converts `&#228;` → `ä`, `&#x00E4;` → `ä`, `&amp;` → `&`, etc.
 *
 * Decoding is done in a single pass, so each entity is decoded exactly once:
 * `&#38;lt;` becomes the text `&lt;`, never `<`. Entities that are unknown
 * or have an out-of-range code point are left unchanged.
 *
 * Use this to normalize text extracted from HTML before further processing,
 * preventing double-escaping when the text is later passed through escapeHtml().
 *
 * @param text - Text potentially containing HTML entities
 * @returns Text with entities decoded to UTF-8; empty string for falsy input
 */
export function decodeHtmlEntities(text: string | null | undefined): string {
  if (!text) return '';

  return String(text).replace(
    HTML_ENTITY_PATTERN,
    (match: string, decimal: string | undefined, hex: string | undefined): string => {
      if (decimal !== undefined) {
        return codePointToString(parseInt(decimal, 10), match);
      }
      if (hex !== undefined) {
        return codePointToString(parseInt(hex, 16), match);
      }
      // Named reference: map keys include the leading '&' and trailing ';'.
      return NAMED_ENTITY_MAP[match] ?? match;
    },
  );
}