parsers xlsx.ts

93.05% Statements 67/72
62.5% Branches 30/48
100% Functions 9/9
100% Lines 62/62
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5x
5x
5x
5x
 
 
5x
5x
5x
 
5x
6x
6x
6x
6x
6x
 
 
5x
 
 
 
5x
5x
5x
6x
6x
6x
6x
 
5x
 
 
 
5x
5x
5x
6x
6x
6x
6x
 
5x
 
 
 
5x
4x
4x
4x
33x
 
4x
 
 
 
6x
6x
6x
20x
20x
20x
70x
70x
70x
70x
 
71x
 
6x
 
 
 
 
 
 
 
70x
70x
70x
70x
28x
 
 
 
70x
70x
70x
70x
70x
 
70x
 
 
 
20x
20x
20x
 
  /**
 * @module scripts/statskontoret/parsers/xlsx
 * @description Defensive XLSX workbook parser for Statskontoret workbooks.
 *
 * Operates directly on the OOXML zip envelope so the client doesn't depend
 * on a full SpreadsheetML library — only `jszip` is required.  Isolating
 * the parser makes it easy to add fuzz tests against malformed workbooks.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import JSZip from 'jszip';
 
import { StatskontoretError } from '../errors.js';
import type { StatskontoretSheet, StatskontoretWorkbook } from '../types.js';
import {
  decodeXml,
  extractTextNodes,
  firstXmlTagValue,
  parseXmlAttributes,
} from '../internal/text.js';
 
/**
 * Parses an XLSX workbook directly from its zip container.
 *
 * Reads `xl/workbook.xml` for the sheet list, `xl/_rels/workbook.xml.rels` to
 * resolve each sheet's relationship id to a part path, and the optional
 * `xl/sharedStrings.xml` table, then converts every resolvable worksheet
 * into rows of strings.
 *
 * @param input - Raw bytes of the `.xlsx` file.
 * @returns The workbook's sheets, in the order they are listed in
 *   `xl/workbook.xml`. Sheets whose relationship id has no entry in the
 *   relationships part are skipped.
 * @throws StatskontoretError when `xl/workbook.xml`, the relationships part,
 *   or a referenced worksheet part is missing from the archive.
 */
export async function parseStatskontoretXlsx(
  input: ArrayBuffer | Uint8Array,
): Promise<StatskontoretWorkbook> {
  const zip = await JSZip.loadAsync(input);
  const workbookXml = await readZipText(zip, 'xl/workbook.xml');
  const workbookRelsXml = await readZipText(zip, 'xl/_rels/workbook.xml.rels');
  // The shared-strings part is optional; a workbook without it has no string table.
  const sharedStringsXml = zip.file('xl/sharedStrings.xml')
    ? await readZipText(zip, 'xl/sharedStrings.xml')
    : '';
  const sharedStrings = parseSharedStrings(sharedStringsXml);
  const rels = parseWorkbookRelationships(workbookRelsXml);
  const sheets: StatskontoretSheet[] = [];

  for (const sheet of parseWorkbookSheets(workbookXml)) {
    const target = rels.get(sheet.relationshipId);
    // No relationship entry means there is no part to read for this sheet.
    if (!target) continue;
    // Targets starting with '/' are archive-absolute; others are relative to `xl/`.
    const sheetPath = target.startsWith('/') ? target.slice(1) : `xl/${target}`;
    // Collapse any "/./" segments so the path matches the zip entry name.
    const sheetXml = await readZipText(zip, sheetPath.replace(/\/\.\//g, '/'));
    sheets.push({ name: sheet.name, rows: parseWorksheetRows(sheetXml, sharedStrings) });
  }

  return { sheets };
}
 
/**
 * Lists the sheets declared by self-closing `<sheet .../>` elements in
 * `xl/workbook.xml`.
 *
 * @param xml - Contents of the workbook part.
 * @returns One entry per `<sheet>` that has both a `name` and a relationship
 *   id (`r:id`, falling back to `id`), in document order. The name is
 *   XML-decoded; elements missing either attribute are ignored.
 */
function parseWorkbookSheets(xml: string): Array<{ name: string; relationshipId: string }> {
  const sheets: Array<{ name: string; relationshipId: string }> = [];
  const sheetRe = /<sheet\b([^>]*)\/>/gi;
  for (const match of xml.matchAll(sheetRe)) {
    const attrs = parseXmlAttributes(match[1] ?? '');
    const name = attrs.get('name');
    const relationshipId = attrs.get('r:id') ?? attrs.get('id');
    if (name && relationshipId) sheets.push({ name: decodeXml(name), relationshipId });
  }
  return sheets;
}
 
/**
 * Builds a relationship-id → target lookup from the self-closing
 * `<Relationship .../>` elements of `xl/_rels/workbook.xml.rels`.
 *
 * @param xml - Contents of the workbook relationships part.
 * @returns Map from each relationship's `Id` to its `Target`. Elements
 *   missing either attribute are ignored; a repeated `Id` keeps the last
 *   target seen.
 */
function parseWorkbookRelationships(xml: string): Map<string, string> {
  const rels = new Map<string, string>();
  const relRe = /<Relationship\b([^>]*)\/>/gi;
  for (const match of xml.matchAll(relRe)) {
    const attrs = parseXmlAttributes(match[1] ?? '');
    const id = attrs.get('Id');
    const target = attrs.get('Target');
    if (id && target) rels.set(id, target);
  }
  return rels;
}
 
/**
 * Reads the shared-string table from `xl/sharedStrings.xml`.
 *
 * @param xml - Contents of the shared-strings part, or `''` when the
 *   workbook has none.
 * @returns The text of each `<si>` item in document order, so a cell's
 *   shared-string index can be used directly as an array index.
 */
function parseSharedStrings(xml: string): string[] {
  if (xml) {
    const itemPattern = /<si\b[^>]*>([\s\S]*?)<\/si>/gi;
    // Each <si> body may hold several text runs; extractTextNodes flattens them.
    return Array.from(xml.matchAll(itemPattern), (item) => extractTextNodes(item[1] ?? ''));
  }
  return [];
}
 
/**
 * Extracts every `<row>` of a worksheet part as a dense array of cell strings.
 *
 * Cells are placed by the column letters of their `r` attribute (`C7` lands
 * at index 2). A cell without a usable reference is appended at the row's
 * current length. Gaps between populated columns are filled with `''`.
 * Rows are returned in document order; the row's own `r` attribute is not
 * consulted.
 *
 * Self-closing elements (`<c r="A1" s="3"/>`, `<row r="2"/>`) are matched as
 * complete, empty elements. Treating them as opening tags would swallow the
 * following sibling up to its closing tag and put that sibling's value in
 * the wrong column (or merge two rows).
 *
 * @param xml - Contents of a worksheet part.
 * @param sharedStrings - Shared-string table used to resolve `t="s"` cells.
 * @returns One string array per `<row>` element.
 */
function parseWorksheetRows(xml: string, sharedStrings: readonly string[]): string[][] {
  // Group 1 is the row body; it is undefined for a self-closing <row/>.
  const rowRe = /<row\b[^>]*?(?:\/>|>([\s\S]*?)<\/row>)/gi;
  // Group 1 is the attribute text, group 2 the cell body (undefined for <c .../>).
  const cellRe = /<c\b([^>]*?)(?:\/>|>([\s\S]*?)<\/c>)/gi;

  const rows: string[][] = [];
  for (const rowMatch of xml.matchAll(rowRe)) {
    const row: string[] = [];
    for (const cellMatch of (rowMatch[1] ?? '').matchAll(cellRe)) {
      const attrs = parseXmlAttributes(cellMatch[1] ?? '');
      const ref = attrs.get('r') ?? '';
      const cellIndex = cellRefToColumnIndex(ref) ?? row.length;
      row[cellIndex] = parseCellValue(cellMatch[2] ?? '', attrs.get('t'), sharedStrings);
    }
    // Assigning by index can leave holes; materialise them as empty strings.
    rows.push(Array.from({ length: row.length }, (_, i) => row[i] ?? ''));
  }
  return rows;
}
 
/**
 * Resolves the display string of a single worksheet cell.
 *
 * @param xml - Inner XML of the `<c>` element.
 * @param type - The cell's `t` attribute, if present.
 * @param sharedStrings - Shared-string table indexed by `t="s"` cells.
 * @returns For `inlineStr` cells, the text nodes of the cell body. Otherwise
 *   the first `<v>` value: looked up in `sharedStrings` when `type` is `'s'`
 *   (an unparseable or out-of-range index yields `''`), or XML-decoded for
 *   every other type. A cell with no `<v>` yields `''`.
 */
function parseCellValue(
  xml: string,
  type: string | undefined,
  sharedStrings: readonly string[],
): string {
  // Inline strings keep their text inside the cell body rather than in <v>.
  if (type === 'inlineStr') return extractTextNodes(xml);
  const value = firstXmlTagValue(xml, 'v');
  if (value === undefined) return '';
  if (type === 's') return sharedStrings[Number.parseInt(value, 10)] ?? '';
  return decodeXml(value);
}
 
/**
 * Converts the column letters of an A1-style cell reference to a zero-based
 * column index (`A` → 0, `Z` → 25, `AA` → 26).
 *
 * @param ref - Cell reference such as `B12`; letter case is ignored.
 * @returns The zero-based column index, or `undefined` when `ref` does not
 *   start with at least one ASCII letter.
 */
function cellRefToColumnIndex(ref: string): number | undefined {
  const letters = ref.match(/^[A-Z]+/i)?.[0];
  if (!letters) return undefined;
  let index = 0;
  for (const char of letters.toUpperCase()) {
    // Bijective base-26: 'A' is digit 1, 'Z' is digit 26 (char code 65 is 'A').
    index = index * 26 + (char.charCodeAt(0) - 65 + 1);
  }
  // Shift from the 1-based column number to a 0-based array index.
  return index - 1;
}
 
/**
 * Reads one entry of the workbook archive as text.
 *
 * @param zip - The opened workbook archive.
 * @param path - Entry name inside the archive, e.g. `xl/workbook.xml`.
 * @returns The entry's content as a string.
 * @throws StatskontoretError (code `'workbook'`) when the archive has no
 *   entry at `path`.
 */
async function readZipText(zip: JSZip, path: string): Promise<string> {
  const file = zip.file(path);
  if (!file) throw new StatskontoretError(`Statskontoret workbook missing ${path}`, 'workbook');
  return file.async('string');
}