rules per-document.ts

96.55% Statements 28/29
90% Branches 18/20
100% Functions 2/2
100% Lines 27/27
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2x
 
 
 
 
 
 
 
 
 
2x
2x
2x
2x
2x
2x
 
2x
2x
2x
2x
3x
3x
3x
3x
3x
3x
1x
1x
 
2x
 
 
 
2x
 
 
 
 
 
1x
1x
1x
1x
1x
 
 
 
 
 
 
1x
 
  /**
 * @module scripts/validators/article/rules/per-document
 * @description Per-document subsection extractor + minimum dok_id
 *              citation threshold.
 *
 *              Rule census: extracted from
 *              `scripts/validate-article.ts` lines 108
 *              (`MIN_PER_DOC_DOK_ID_HITS`) and 187–209
 *              (`extractPerDocumentSections`). Logic is byte-identical
 *              to the original.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
/**
 * Minimum number of dok_id-style tokens a per-document section body must
 * contain. `checkPerDocument` reports a violation for any section whose
 * hit count is below this value.
 */
export const MIN_PER_DOC_DOK_ID_HITS = 1;
 
/**
 * Extract every per-document subsection from an article.
 *
 * The search region starts right after the `## Per-document intelligence`
 * header line and ends at the next level-2 heading (`## …`) or at the end
 * of the article. Inside that region, each `### <dok_id>` heading opens a
 * section whose body runs up to the next such heading (or the end of the
 * region). `###` headings that do not look like a dok_id are not section
 * boundaries; they stay inside the preceding section's body.
 *
 * This function only extracts; the "at least one dok_id per body" rule is
 * applied by the caller.
 *
 * @param article Full article markdown.
 * @returns Sections in document order; `id` is the heading's identifier and
 *          `body` is the raw text following the heading line (leading
 *          newline included). Empty when the header is absent.
 */
export function extractPerDocumentSections(article: string): Array<{ id: string; body: string }> {
  const start = article.match(/^##\s+Per-document intelligence\s*$/m);
  if (!start || start.index === undefined) return [];

  // Clip to the text between the header line and the next level-2 heading.
  const tail = article.slice(start.index + start[0].length);
  const stop = tail.search(/^##\s+\S/m);
  const region = stop === -1 ? tail : tail.slice(0, stop);

  // A single global pass finds every dok_id heading; each body is the text
  // between the end of one heading and the start of the next.
  const DOK_ID_HEADING = /^###\s+(H[A-Z0-9]{6,10}|[A-ZÅÄÖ]{1,4}\d{4,8})\s*$/gm;
  const headings = [...region.matchAll(DOK_ID_HEADING)];
  return headings.map((heading, i) => {
    const bodyStart = (heading.index ?? 0) + heading[0].length;
    const bodyEnd = headings[i + 1]?.index ?? region.length;
    // Capture group 1 always participates when the pattern matches.
    return { id: heading[1]!, body: region.slice(bodyStart, bodyEnd) };
  });
}
 
/**
 * Regex matching a dok_id token. Exported because the orchestrator counts hits per section body.
 *
 * Accepts either `H` followed by 6–10 uppercase letters/digits, or 1–4
 * uppercase letters (including Å/Ä/Ö) followed by 4–8 digits. The pattern
 * carries no flags, so callers that need every hit must build a global copy
 * from `.source`.
 *
 * NOTE(review): `\b` in ECMAScript is defined over ASCII word characters
 * only, so Å/Ä/Ö count as non-word characters. An identifier that *starts*
 * with one of them (e.g. `Ö1234` after whitespace) is therefore not matched,
 * even though the character class allows it — confirm whether such ids
 * occur in practice before relying on that branch.
 */
export const DOK_ID_TOKEN_RE = /\b(?:H[A-Z0-9]{6,10}|[A-ZÅÄÖ]{1,4}\d{4,8})\b/;
 
import type { ArticleViolation } from '../types.js';
 
/**
 * Per-document `dok_id` citation rule.
 *
 * Runs `extractPerDocumentSections` over the article and emits one
 * `per-doc-missing-dok_id` violation for every section whose body contains
 * fewer than `MIN_PER_DOC_DOK_ID_HITS` matches of `DOK_ID_TOKEN_RE`.
 *
 * @param rel  Path of the article, copied verbatim into each violation's `file`.
 * @param text Full article markdown.
 * @returns Violations in section order; empty when every section passes or
 *          the article has no per-document sections.
 */
export function checkPerDocument(rel: string, text: string): ArticleViolation[] {
  // Global copy of the token pattern so `match` returns all hits. Built once
  // per call rather than per section; `String.prototype.match` resets
  // `lastIndex` for global regexes, so reusing it across sections is safe.
  const tokenRe = new RegExp(DOK_ID_TOKEN_RE.source, 'g');
  const out: ArticleViolation[] = [];
  for (const section of extractPerDocumentSections(text)) {
    const hits = section.body.match(tokenRe) ?? [];
    if (hits.length < MIN_PER_DOC_DOK_ID_HITS) {
      // NOTE(review): the `FiU17` example below does not itself match
      // DOK_ID_TOKEN_RE (lowercase letter, only two digits) — confirm the
      // wording with whoever owns the message text.
      out.push({
        file: rel,
        code: 'per-doc-missing-dok_id',
        message: `Per-document section "${section.id}" cites zero dok_id-style codes — minimum is ${MIN_PER_DOC_DOK_ID_HITS}. Every per-document subsection must trace back to at least one primary-source identifier (e.g. HD12345, FiU17).`,
      });
    }
  }
  return out;
}