All files / scripts/validators/article/rules per-document.ts

96.55% Statements 28/29
90% Branches 18/20
100% Functions 2/2
100% Lines 27/27

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69                              2x                   2x 2x 2x 2x 2x 2x   2x 2x 2x 2x 3x 3x 3x 3x 3x 3x 1x 1x   2x       2x           1x 1x 1x 1x 1x             1x    
/**
 * @module scripts/validators/article/rules/per-document
 * @description Per-document subsection extractor + minimum dok_id
 *              citation threshold.
 *
 *              Rule census: extracted from
 *              `scripts/validate-article.ts` lines 108
 *              (`MIN_PER_DOC_DOK_ID_HITS`) and 187–209
 *              (`extractPerDocumentSections`). Logic is byte-identical
 *              to the original.
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
/**
 * Minimum number of dok_id-style tokens a per-document section body must
 * contain. `checkPerDocument` reports a violation for any section whose hit
 * count is below this value.
 */
export const MIN_PER_DOC_DOK_ID_HITS = 1;
 
/**
 * Find every per-document subsection (`### HD12345`-style heading) that sits
 * beneath the `## Per-document intelligence` header and return each one's id
 * and body.
 *
 * The scanned region starts right after the `## Per-document intelligence`
 * line and ends at the next level-2 (`## …`) heading, or at the end of the
 * article when there is none. A section body is the text between its heading
 * line and the next recognised per-document heading (or the end of the
 * region); the heading line itself is not part of the body.
 *
 * This function only extracts. The "at least one dok_id per section" rule is
 * enforced separately by `checkPerDocument`.
 *
 * @param article Full article markdown.
 * @returns Sections in document order; an empty array when the
 *          `## Per-document intelligence` header is absent or has no
 *          recognised `###` subsections.
 */
export function extractPerDocumentSections(article: string): Array<{ id: string; body: string }> {
  const start = article.match(/^##\s+Per-document intelligence\s*$/m);
  if (!start || start.index === undefined) return [];

  // Restrict the scan to the text between the header and the next `## ` heading.
  const tail = article.slice(start.index + start[0].length);
  const stop = tail.search(/^##\s+\S/m);
  const region = stop === -1 ? tail : tail.slice(0, stop);

  // One global pass over the region. The regex is created per call, so its
  // `lastIndex` state is never shared between invocations.
  const headingRe = /^###\s+(H[A-Z0-9]{6,10}|[A-ZÅÄÖ]{1,4}\d{4,8})\s*$/gm;
  const headings: Array<{ id: string; at: number; bodyStart: number }> = [];
  for (let m = headingRe.exec(region); m !== null; m = headingRe.exec(region)) {
    // Group 1 is mandatory in the pattern, so it is always set on a match.
    headings.push({ id: m[1]!, at: m.index, bodyStart: m.index + m[0].length });
  }

  // Each body runs up to the next heading; `slice` treats an undefined end as
  // "to the end of the region" for the last section.
  return headings.map((heading, i) => ({
    id: heading.id,
    body: region.slice(heading.bodyStart, headings[i + 1]?.at),
  }));
}
 
/**
 * Regex matching a single dok_id token: either `H` followed by 6–10
 * uppercase letters/digits, or 1–4 uppercase letters (A–Z, Å, Ä, Ö) followed
 * by 4–8 digits, delimited by word boundaries.
 *
 * Exported because the orchestrator counts hits per section body. The pattern
 * deliberately carries no `g` flag, so it holds no `lastIndex` state; callers
 * that need every match build a global copy from `.source`.
 *
 * NOTE(review): JavaScript's `\b` is defined over ASCII word characters only,
 * so a token that *starts* with Å/Ä/Ö directly after whitespace or
 * punctuation has no leading word boundary and will not match — confirm
 * whether such ids can occur.
 */
export const DOK_ID_TOKEN_RE = /\b(?:H[A-Z0-9]{6,10}|[A-ZÅÄÖ]{1,4}\d{4,8})\b/;
 
import type { ArticleViolation } from '../types.js';
 
/**
 * Per-document `dok_id` citation rule.
 *
 * Runs `extractPerDocumentSections` over the article and emits one
 * `per-doc-missing-dok_id` violation for every section whose body contains
 * fewer than `MIN_PER_DOC_DOK_ID_HITS` matches of `DOK_ID_TOKEN_RE`. The
 * section heading is not part of the body, so the id in the heading does not
 * count as a citation.
 *
 * @param rel  Path of the article, copied verbatim into each violation's `file`.
 * @param text Full article markdown.
 * @returns Violations in section order; empty when every section passes or
 *          the article has no per-document sections.
 */
export function checkPerDocument(rel: string, text: string): ArticleViolation[] {
  const out: ArticleViolation[] = [];
  // Global copy of the token pattern, built once per call instead of once per
  // section. `String.prototype.match` resets `lastIndex` for global regexes,
  // so reusing the instance across sections is safe.
  const tokenRe = new RegExp(DOK_ID_TOKEN_RE.source, 'g');
  for (const section of extractPerDocumentSections(text)) {
    const hits = section.body.match(tokenRe) ?? [];
    if (hits.length < MIN_PER_DOC_DOK_ID_HITS) {
      // NOTE(review): the "FiU17" example in the message below does not match
      // DOK_ID_TOKEN_RE (lowercase letter, only two digits) — confirm whether
      // the example or the pattern should change.
      out.push({
        file: rel,
        code: 'per-doc-missing-dok_id',
        message: `Per-document section "${section.id}" cites zero dok_id-style codes — minimum is ${MIN_PER_DOC_DOK_ID_HITS}. Every per-document subsection must trace back to at least one primary-source identifier (e.g. HD12345, FiU17).`,
      });
    }
  }
  return out;
}