backfill-lib report-writer.ts

100% Statements 28/28
63.63% Branches 14/22
100% Functions 4/4
100% Lines 27/27
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357x
357x
357x
23x
 
 
 
 
35x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11x
11x
11x
15x
15x
1x
 
 
 
 
 
 
 
 
 
 
 
1x
 
14x
40x
 
 
 
 
 
 
 
 
 
 
 
 
 
11x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8x
8x
8x
8x
8x
8x
34x
34x
 
 
8x
 
8x
 
 
1x
 
 
  /**
 * @module Infrastructure/BackfillLib/ReportWriter
 * @category Intelligence Operations / Supporting Infrastructure
 * @name RFC 4180 CSV writer for the metadata backfill diff report
 *
 * @description
 * Emits the CSV contract documented in the PR 2 issue:
 *
 * ```
 * file_path,date,subfolder,lang,tier,field,violation_code,before,after,reason
 * ```
 *
 * Each row represents one (tier, violation) pair. An article with `T`
 * qualifying tiers and `V` violations emits `T * V` rows; an article
 * with `T` tiers and **zero** violations still emits `T` rows (one
 * tier-only row per tier, with `field` / `violation_code` / `before`
 * blank) so reviewers can audit the tier assignment for green
 * articles. An article with **no** qualifying tier is still reported:
 * once per violation (or once in total when it has none), with `tier`
 * blank. `after` is intentionally blank in PR 2 — PRs 3/4/5 will
 * populate it with the planned post-backfill value when they run.
 *
 * Quoting follows RFC 4180 §2 (rules 6 and 7): fields containing `,`,
 * `"`, CR or LF are double-quoted, and embedded `"` is doubled to `""`.
 * Line endings are `\n` rather than the CRLF the RFC specifies, to
 * match the rest of the repo.
 *
 * @author Hack23 AB (Infrastructure Team)
 * @license Apache-2.0
 */
 
import fs from 'fs';
import path from 'path';
 
import type { ContractViolation } from './contract-checker.js';
import type {
  ArticleFingerprint,
  ClassificationResult,
  Tier,
} from './classifier.js';
 
/**
 * CSV column order — keep in sync with the issue spec.
 *
 * {@link writeReport} joins these names with `,` to form the header
 * line, and {@link serialiseRow} emits its fields in this same order, so
 * a change here must be mirrored there (and in {@link ReportRow}).
 * `as const` keeps the array a readonly tuple of string literals.
 */
export const CSV_COLUMNS = [
  'file_path',
  'date',
  'subfolder',
  'lang',
  'tier',
  'field',
  'violation_code',
  'before',
  'after',
  'reason',
] as const;
 
/**
 * One logical report row — serialised to CSV by {@link writeReport}.
 *
 * Fields are declared in the same order as {@link CSV_COLUMNS}; every
 * field is a string, with `''` standing in for "not applicable".
 */
export interface ReportRow {
  /** `file_path` column; {@link rowsForArticle} copies the fingerprint's `relPath`. */
  readonly filePath: string;
  /** `date` column; `''` when the fingerprint carries no date. */
  readonly date: string;
  /** `subfolder` column; `''` when the fingerprint carries no subfolder. */
  readonly subfolder: string;
  /** `lang` column, copied from the fingerprint. */
  readonly lang: string;
  /** `tier` column; `''` when the classification produced no tiers. */
  readonly tier: Tier | '';
  /** `field` column: the violating field, or `''` on a tier-only row. */
  readonly field: string;
  /** `violation_code` column: the violation's code, or `''` on a tier-only row. */
  readonly violationCode: string;
  /** `before` column: the violation's reported value, or `''` on a tier-only row. */
  readonly before: string;
  /** `after` column; {@link rowsForArticle} always leaves it `''`. */
  readonly after: string;
  /** `reason` column: tier reason, violation message, or both joined by `; `. */
  readonly reason: string;
}
 
/**
 * Render one value as an RFC 4180 CSV field.
 *
 * @param value - Raw cell text; `null` and `undefined` are rendered as
 *   the empty field.
 * @returns The text unchanged when it contains no comma, double quote,
 *   CR or LF; otherwise the text wrapped in double quotes with every
 *   embedded `"` doubled.
 */
export function quoteField(value: string | null | undefined): string {
  const text = value ?? '';
  const isPlain = !/[",\r\n]/.test(text);
  // Splitting on `"` and re-joining with `""` doubles every embedded quote.
  return isPlain ? text : '"' + text.split('"').join('""') + '"';
}
 
/**
 * Turn one report row into a single CSV line.
 *
 * Cells are emitted in {@link CSV_COLUMNS} order, each passed through
 * {@link quoteField}, and joined with `,`.
 *
 * @param row - The row to serialise.
 * @returns The CSV line, without a trailing newline.
 */
export function serialiseRow(row: ReportRow): string {
  // Order here defines the column order of the output line.
  const cells: readonly string[] = [
    row.filePath,
    row.date,
    row.subfolder,
    row.lang,
    row.tier,
    row.field,
    row.violationCode,
    row.before,
    row.after,
    row.reason,
  ];
  return cells.map((cell) => quoteField(cell)).join(',');
}
 
/**
 * Expand one article into its report rows.
 *
 * Rows are produced tier by tier, and within a tier in violation order:
 *
 * - with violations, every tier yields one row per violation, so each
 *   violation can be traced to the tier that will fix it;
 * - without violations, every tier yields one tier-only row (`field`,
 *   `violationCode` and `before` blank) so the tier assignment can
 *   still be audited;
 * - when the classification has no tiers, the article is treated as
 *   having a single blank tier, so it is never dropped from the report.
 *
 * That gives `max(1, tiers.length) * max(1, violations.length)` rows.
 *
 * @param fp - Fingerprint supplying the path, date, subfolder and
 *   language columns; a missing date or subfolder becomes `''`.
 * @param classification - Tier list plus the per-tier reason lookup.
 * @param violations - Contract violations found for the article.
 * @returns The rows for this article; `after` is always `''`.
 */
export function rowsForArticle(
  fp: ArticleFingerprint,
  classification: ClassificationResult,
  violations: readonly ContractViolation[],
): readonly ReportRow[] {
  // Columns that are identical on every row of this article.
  const articleColumns = {
    filePath: fp.relPath,
    date: fp.date ?? '',
    subfolder: fp.subfolder ?? '',
    lang: fp.lang,
  };

  // An unclassified article is reported under a single blank tier.
  const tierNames: readonly string[] =
    classification.tiers.length === 0 ? [''] : classification.tiers;

  return tierNames.flatMap((name): ReportRow[] => {
    const tier = name as Tier | '';

    // The blank tier has no reason; a real tier may also lack one.
    let tierReason = '';
    if (name) {
      tierReason = classification.reasons[name as Tier] ?? '';
    }

    if (violations.length === 0) {
      const tierOnlyRow: ReportRow = {
        ...articleColumns,
        tier,
        field: '',
        violationCode: '',
        before: '',
        after: '',
        reason: tierReason,
      };
      return [tierOnlyRow];
    }

    return violations.map(
      (violation): ReportRow => ({
        ...articleColumns,
        tier,
        field: violation.field,
        violationCode: violation.code,
        before: violation.value,
        after: '',
        // Prefix the tier's reason when there is one.
        reason: tierReason
          ? `${tierReason}; ${violation.message}`
          : violation.message,
      }),
    );
  });
}
 
/**
 * Write the complete CSV report to `outputPath`.
 *
 * Missing parent directories are created first, and the file is opened
 * with the `'w'` flag, so an existing report at that path is replaced.
 * The header comes from {@link CSV_COLUMNS}; each row is rendered by
 * {@link serialiseRow}. Every line, header included, is terminated by
 * `\n`.
 *
 * Lines are written one at a time through a synchronous file
 * descriptor rather than being concatenated first, so the serialised
 * report is never held in memory as one string. The descriptor is
 * closed in a `finally` block, including when a write throws.
 *
 * @param outputPath - Destination file path.
 * @param rows - Rows to write, in output order.
 * @returns The number of data rows written (the header is not counted).
 */
export function writeReport(
  outputPath: string,
  rows: readonly ReportRow[],
): number {
  const parentDir = path.dirname(outputPath);
  fs.mkdirSync(parentDir, { recursive: true });

  const handle = fs.openSync(outputPath, 'w');
  // Writes a line followed by its terminator.
  const emitLine = (line: string): void => {
    fs.writeSync(handle, line);
    fs.writeSync(handle, '\n');
  };

  try {
    emitLine(CSV_COLUMNS.join(','));
    rows.forEach((row) => emitLine(serialiseRow(row)));
  } finally {
    fs.closeSync(handle);
  }

  return rows.length;
}
 
/**
 * Bundle that re-exports {@link CSV_COLUMNS} under a `__test__` key —
 * presumably a hook for unit tests (confirm against the test suite);
 * it adds nothing beyond the named export.
 */
export const __test__ = {
  CSV_COLUMNS,
};