All files / scripts catalog-downloaded-data.ts

81.01% Statements 64/79
68% Branches 34/50
85.71% Functions 6/7
81.69% Lines 58/71
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182x
182x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14x
 
14x
182x
 
182x
 
170x
170x
 
 
 
 
18x
 
18x
 
 
20x
20x
20x
 
20x
20x
3x
3x
 
1x
 
 
 
20x
 
20x
 
20x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14x
14x
 
20x
20x
20x
20x
19x
1x
 
1x
1x
1x
1x
 
 
 
 
 
14x
 
 
19x
14x
 
 
14x
2x
 
 
 
 
14x
8x
5x
1x
1x
 
 
 
14x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22x
 
22x
22x
 
 
 
22x
29x
29x
4x
 
25x
 
 
 
20x
 
 
22x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
 
 
  #!/usr/bin/env tsx
/**
 * @module catalog-downloaded-data
 * @description Scans `analysis/data/` and produces a JSON catalog of all
 * downloaded MCP data files with their metadata.  This catalog is consumed
 * by the AI agent during agentic workflows so it can perform per-file
 * political intelligence analysis.
 *
 * The output is a JSON object ("data catalog") with overall metadata and an
 * `entries` array describing each data file.
 *
 * Top-level catalog fields:
 * - `generatedAt`        – ISO 8601 timestamp when the catalog was generated
 * - `dataRoot`           – root directory that was scanned (e.g. "analysis/data")
 * - `totalFiles`         – total number of discovered data files
 * - `pendingAnalysis`    – number of files without analysis
 * - `completedAnalysis`  – number of files with existing analysis
 * - `entries`            – array of per-file catalog entries
 *
 * Each item in `entries` has:
 * - `id`            – document / record identifier (filename without `.json`)
 * - `type`          – persistence document type (e.g. "propositions", "mps")
 * - `path`          – path to the data file relative to repo root
 * - `analysisPath`  – expected path for the per-file analysis markdown
 * - `hasAnalysis`   – whether the analysis markdown already exists
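 * - `sizeBytes`     – file size in bytes
 * - `meta`          – parsed contents of the sibling `.meta.json` file, or `null` if it is missing or unparseable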
 *
 * Usage:
 *   npx tsx scripts/catalog-downloaded-data.ts [--data-root <path>] [--type <type>] [--pending-only]
 *
 * Options:
 *   --data-root <path>   Override analysis/data root (default: analysis/data)
 *   --type <type>        Filter to a specific document type
 *   --pending-only       Only list files that do NOT yet have analysis
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import * as fs from 'node:fs';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
 
/**
 * Stable repo root derived from script location.
 *
 * Paths emitted in the catalog are made relative to this directory, so the
 * output does not depend on the current working directory of the process.
 */
// Directory containing this module, reconstructed from `import.meta.url`.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// The repo root is taken to be the parent of the directory holding this script.
const REPO_ROOT = path.resolve(__dirname, '..');
 
/* ------------------------------------------------------------------ */
/*  Types                                                              */
/* ------------------------------------------------------------------ */
 
/** A single entry in the data catalog, describing one downloaded data file. */
export interface CatalogEntry {
  /** File path relative to its type directory, without `.json`, using `/` separators (e.g. "P1" or "ind1/SE"). */
  id: string;
  /** Document type: the last segment of the scanned subdirectory (e.g. "propositions", "mps"). */
  type: string;
  /** Path to the data file relative to the repo root, using `/` separators. */
  path: string;
  /** Expected path of the per-file analysis markdown (data path with `.json` replaced by `.analysis.md`). */
  analysisPath: string;
  /** Whether the file at `analysisPath` existed when the catalog was built. */
  hasAnalysis: boolean;
  /** Size of the data file in bytes. */
  sizeBytes: number;
  /** Parsed contents of the sibling `.meta.json` file, or `null` when it is missing or cannot be parsed. */
  meta: Record<string, unknown> | null;
}
 
/** Full catalog output. */
export interface DataCatalog {
  /** ISO 8601 timestamp of when the catalog was generated. */
  generatedAt: string;
  /** Scanned data root, relative to the repo root, using `/` separators. */
  dataRoot: string;
  /** Number of data files found after de-duplication (not reduced by the pending-only filter). */
  totalFiles: number;
  /** Number of de-duplicated files without an analysis file (computed before the pending-only filter). */
  pendingAnalysis: number;
  /** Number of de-duplicated files with an analysis file (computed before the pending-only filter). */
  completedAnalysis: number;
  /** Catalog entries sorted by `type` then `id`; restricted to pending files when pending-only is requested. */
  entries: CatalogEntry[];
}
 
/* ------------------------------------------------------------------ */
/*  Catalog builder (exported for testing)                             */
/* ------------------------------------------------------------------ */
 
/**
 * Subdirectories of the data root that are scanned for data files, in scan
 * order.  The document type of each is its last path segment, so both
 * `documents/votes` and `votes` yield the type "votes".
 */
const DATA_SUBDIRS = [
  'documents/propositions',
  'documents/motions',
  'documents/committeeReports',
  'documents/speeches',
  'documents/questions',
  'documents/interpellations',
  'documents/votes',
  'votes',
  'events',
  'mps',
  'worldbank',
  'scb',
  'mcp-responses',
] as const;
 
/**
 * Map a data subdirectory to its document `type`, i.e. the text following
 * the last `/` in the path (the whole string when there is no `/`).
 * e.g. "documents/propositions" → "propositions", "mps" → "mps"
 */
function typeFromSubdir(subdir: string): string {
  const lastSlash = subdir.lastIndexOf('/');
  return subdir.slice(lastSlash + 1);
}
 
/**
 * Build the catalog by scanning `dataRoot`.
 *
 * Every subdirectory listed in `DATA_SUBDIRS` that exists under `dataRoot`
 * is scanned recursively for `*.json` data files (`*.meta.json` sidecars are
 * excluded).  Vote files that appear more than once are de-duplicated, the
 * totals are computed, the optional pending-only filter is applied, and the
 * resulting entries are sorted by `type` and then `id`.
 *
 * @param dataRoot - absolute or relative path to `analysis/data`
 * @param filterType - optional type filter
 * @param pendingOnly - if true, only include files without analysis
 * @returns the catalog; `totalFiles`, `pendingAnalysis` and
 *   `completedAnalysis` always describe the full de-duplicated scan, even
 *   when `pendingOnly` narrows `entries`
 * @throws if a discovered data file cannot be stat'ed (e.g. it was removed
 *   between the directory scan and the size lookup)
 */
export function buildCatalog(
  dataRoot: string,
  filterType?: string,
  pendingOnly = false,
): DataCatalog {
  // Repo-relative path with forward slashes, so output is identical on all platforms.
  const toRepoRelative = (target: string): string =>
    path.relative(REPO_ROOT, target).split(path.sep).join('/');

  const allEntries: CatalogEntry[] = [];

  for (const subdir of DATA_SUBDIRS) {
    const docType = typeFromSubdir(subdir);

    if (filterType && docType !== filterType) continue;

    const dirPath = path.join(dataRoot, subdir);
    if (!fs.existsSync(dirPath)) continue;

    // The directory may contain date subdirectories (e.g. votes/2026-03-28/)
    // or direct JSON files.  We scan recursively but only pick up *.json
    // files that are NOT .meta.json.
    const jsonFiles = collectJsonFiles(dirPath);

    for (const filePath of jsonFiles) {
      // Use path relative to the type directory (sans .json) as id.
      // For flat dirs: "P1".  For nested dirs: "ind1/SE".
      const id = path.relative(dirPath, filePath).replace(/\.json$/, '').split(path.sep).join('/');
      const metaPath = filePath.replace(/\.json$/, '.meta.json');
      const analysisPath = filePath.replace(/\.json$/, '.analysis.md');

      // Sidecar metadata is best-effort: a missing, unreadable or malformed
      // file yields `null` rather than aborting the whole scan.
      let meta: Record<string, unknown> | null = null;
      if (fs.existsSync(metaPath)) {
        try {
          const parsed: unknown = JSON.parse(fs.readFileSync(metaPath, 'utf-8'));
          // Only a plain JSON object satisfies the declared `meta` type;
          // arrays and primitives are treated like an unparseable sidecar.
          if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
            meta = parsed as Record<string, unknown>;
          }
        } catch {
          meta = null;
        }
      }

      const hasAnalysis = fs.existsSync(analysisPath);
      const stat = fs.statSync(filePath);

      allEntries.push({
        id,
        type: docType,
        path: toRepoRelative(filePath),
        analysisPath: toRepoRelative(analysisPath),
        hasAnalysis,
        sizeBytes: stat.size,
        meta,
      });
    }
  }

  // De-duplicate vote entries: when the same vote file appears in both
  // documents/votes and votes/YYYY-MM-DD, prefer the date-stamped path.
  // Only votes are scoped for dedup (the only type with two scan dirs).
  // Vote ids use the basename portion for matching since documents/votes/
  // stores files flat while votes/YYYY-MM-DD/ nests under date dirs.
  const datedVotePath = /votes\/\d{4}-\d{2}-\d{2}\//;
  const bestByKey = new Map<string, CatalogEntry>();
  for (const e of allEntries) {
    // For votes, use basename as dedup key (ignores date-dir nesting)
    const idPart = e.type === 'votes' ? e.id.slice(e.id.lastIndexOf('/') + 1) : e.id;
    const key = `${e.type}::${idPart}`;
    const existing = bestByKey.get(key);
    if (!existing) {
      bestByKey.set(key, e);
    } else if (
      e.type === 'votes' &&
      datedVotePath.test(e.path) &&
      !datedVotePath.test(existing.path)
    ) {
      // Prefer the entry whose path contains a date directory (votes/YYYY-MM-DD)
      bestByKey.set(key, e);
    }
    // Otherwise keep existing (first-seen or already date-stamped).
    // Non-vote duplicates: keep first-seen (shouldn't occur with current DATA_SUBDIRS)
  }
  const dedupedEntries = [...bestByKey.values()];

  // Compute totals from the full scan (before pendingOnly filter)
  const totalCompleted = dedupedEntries.filter((e) => e.hasAnalysis).length;
  const totalPending = dedupedEntries.length - totalCompleted;

  // Apply pendingOnly filter after computing totals
  const entries = pendingOnly
    ? dedupedEntries.filter((e) => !e.hasAnalysis)
    : dedupedEntries;

  // Ensure deterministic ordering across platforms/filesystems.
  // Use simple < / > string compare (locale-independent) for stable collation.
  entries.sort((a, b) => {
    if (a.type < b.type) return -1;
    if (a.type > b.type) return 1;
    if (a.id < b.id) return -1;
    if (a.id > b.id) return 1;
    return 0;
  });

  return {
    generatedAt: new Date().toISOString(),
    dataRoot: toRepoRelative(dataRoot),
    totalFiles: dedupedEntries.length,
    pendingAnalysis: totalPending,
    completedAnalysis: totalCompleted,
    entries,
  };
}
 
/* ------------------------------------------------------------------ */
/*  Helpers                                                            */
/* ------------------------------------------------------------------ */
 
/**
 * Walk `dir` depth-first and return the paths of every `*.json` file found,
 * skipping `*.meta.json` sidecars.  A directory that cannot be listed
 * contributes no results instead of raising an error.
 */
function collectJsonFiles(dir: string): string[] {
  let listing: fs.Dirent[];
  try {
    listing = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    return [];
  }

  return listing.flatMap((item) => {
    const target = path.join(dir, item.name);
    if (item.isDirectory()) {
      // Descend; nested results keep their position in the listing order.
      return collectJsonFiles(target);
    }
    const isDataJson =
      item.isFile() &&
      item.name.endsWith('.json') &&
      !item.name.endsWith('.meta.json');
    return isDataJson ? [target] : [];
  });
}
 
/* ------------------------------------------------------------------ */
/*  CLI entry point                                                    */
/* ------------------------------------------------------------------ */
 
/**
 * Parse the command-line options of the catalog script.
 *
 * Scanning starts at index 2 of `argv`.  Recognised options are
 * `--data-root <path>`, `--type <type>` and `--pending-only`; anything else,
 * including a value-taking option with no value after it, is ignored.
 *
 * @param argv - full argument vector, e.g. `process.argv`
 * @returns the resolved data root (defaults to `analysis/data` under the
 *   repo root), the optional type filter, and the pending-only flag
 */
function parseArgs(argv: string[]) {
  let dataRoot = path.join(REPO_ROOT, 'analysis/data');
  let filterType: string | undefined;
  let pendingOnly = false;

  let idx = 2;
  while (idx < argv.length) {
    const flag = argv[idx];
    const value = argv[idx + 1];

    switch (flag) {
      case '--data-root':
        if (value) {
          dataRoot = path.resolve(value);
          idx++; // the value has been consumed
        }
        break;
      case '--type':
        if (value) {
          filterType = value;
          idx++; // the value has been consumed
        }
        break;
      case '--pending-only':
        pendingOnly = true;
        break;
      default:
        break;
    }
    idx++;
  }

  return { dataRoot, filterType, pendingOnly };
}
 
/**
 * CLI entry point: builds the catalog from the command-line options, prints
 * a human-readable summary to stderr and the catalog JSON to stdout.
 */
/* istanbul ignore next -- CLI wrapper */
function main() {
  const cli = parseArgs(process.argv);
  const catalog = buildCatalog(cli.dataRoot, cli.filterType, cli.pendingOnly);

  // Summary goes to stderr so that stdout carries nothing but the JSON.
  const summaryLines = [
    `╔══════════════════════════════════════════════════════════════╗`,
    `║   📋 Analysis Data Catalog                                  ║`,
    `╚══════════════════════════════════════════════════════════════╝`,
    `   📂 Data root: ${catalog.dataRoot}`,
    `   📄 Total files: ${catalog.totalFiles}`,
    `   ✅ Analyzed: ${catalog.completedAnalysis}`,
    `   ⏳ Pending: ${catalog.pendingAnalysis}`,
  ];
  for (const line of summaryLines) {
    console.error(line);
  }
  console.error();

  // Write catalog JSON to stdout for piping
  const serialized = JSON.stringify(catalog, null, 2);
  console.log(serialized);
}
 
// Run the CLI only when this file is the process entry point: the module's
// own resolved path must equal the resolved script path in process.argv[1].
// A plain import of the module therefore does not trigger main().
if (path.resolve(fileURLToPath(import.meta.url)) === path.resolve(process.argv[1] ?? '')) {
  main();
}