pre-article-analysis pdf-converter.ts

65.11% Statements 28/43
69.56% Branches 16/23
60% Functions 3/5
65% Lines 26/40
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
 
 
 
 
 
 
3x
2x
2x
2x
 
2x
 
2x
 
 
 
 
 
 
2x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1x
 
1x
 
 
 
 
 
 
 
 
 
7x
 
6x
6x
 
6x
22x
 
 
22x
8x
3x
 
8x
 
 
 
14x
 
 
 
 
 
4x
4x
4x
4x
 
 
10x
 
 
6x
 
  /**
 * @module pre-article-analysis/pdf-converter
 * @description Converts binary document formats (PDF) to text or markdown.
 *
 * Conversion strategy:
 *   1. `pdftotext` (poppler-utils) — used when installed; run with `-layout` to preserve layout
 *   2. Otherwise a failed result is returned whose error carries install instructions
 *
 * This module only returns converted text — callers are responsible for
 * persisting the output (e.g. as `.txt` or `.md` files).
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { execFileSync } from 'node:child_process';
import crypto from 'node:crypto';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
 
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
 
/**
 * Outcome of a PDF-to-text conversion attempt.
 *
 * `convertPdfToText` reports missing files, a missing converter and converter
 * failures through this object (`success: false` plus `error`), so callers
 * should check `success` before using `text`.
 */
export interface ConversionResult {
  /** Whether conversion succeeded. */
  success: boolean;
  /** Converted text content (empty string on failure). */
  text: string;
  /**
   * Tool used for conversion: `'pdftotext'` when that tool was invoked (whether
   * it succeeded or failed), `'none'` when no tool was run (input file missing
   * or no converter installed).
   */
  tool: 'pdftotext' | 'none';
  /** Human-readable error message; only set when conversion failed. */
  error?: string;
}
 
// ---------------------------------------------------------------------------
// Tool detection
// ---------------------------------------------------------------------------
 
/** Memoised probe result; `null` until the first availability check has run. */
let _pdftotextAvailable: boolean | null = null;

/**
 * Report whether the `pdftotext` binary (poppler-utils) can be executed.
 *
 * The first call runs `pdftotext -v` with a 5 second timeout; any error thrown
 * by that invocation is treated as "not available". The outcome is memoised,
 * so later calls return the stored answer without spawning a process.
 *
 * @returns `true` when the probe command completed without throwing.
 */
export function isPdfToTextAvailable(): boolean {
  if (_pdftotextAvailable === null) {
    let probeSucceeded = true;
    try {
      execFileSync('pdftotext', ['-v'], { stdio: 'pipe', timeout: 5000 });
    } catch {
      // Any failure to run the probe counts as "tool not usable".
      probeSucceeded = false;
    }
    _pdftotextAvailable = probeSucceeded;
  }
  return _pdftotextAvailable;
}
 
/**
 * Reset the cached availability check (for testing).
 *
 * Puts the module-level cache back to `null`, so the next call to
 * `isPdfToTextAvailable()` probes the system again instead of returning the
 * previously stored result.
 */
export function resetPdfToolCache(): void {
  _pdftotextAvailable = null; // `null` marks the check as "not performed yet"
}
 
// ---------------------------------------------------------------------------
// Conversion
// ---------------------------------------------------------------------------
 
/**
 * Extract the text of a PDF file on disk with `pdftotext`.
 *
 * Outcomes are always reported through the returned object:
 *   - file does not exist           → failure, `tool: 'none'`
 *   - `pdftotext` is not installed  → failure, `tool: 'none'`, install hint in `error`
 *   - `pdftotext` ran but threw     → failure, `tool: 'pdftotext'`
 *   - otherwise                     → success with the trimmed stdout as `text`
 *
 * @param pdfPath - Absolute path to the PDF file.
 * @returns Conversion result with text content.
 */
export function convertPdfToText(pdfPath: string): ConversionResult {
  // Guard: nothing to convert.
  if (!fs.existsSync(pdfPath)) {
    return { success: false, text: '', tool: 'none', error: `File not found: ${pdfPath}` };
  }

  // Guard: no converter installed on this system.
  if (!isPdfToTextAvailable()) {
    return {
      success: false,
      text: '',
      tool: 'none',
      error: 'No PDF conversion tool available. Install poppler-utils: apt-get install poppler-utils',
    };
  }

  // Trailing '-' makes pdftotext write the extracted text to stdout.
  const args = ['-layout', '-enc', 'UTF-8', pdfPath, '-'];
  try {
    const stdout = execFileSync('pdftotext', args, {
      encoding: 'utf8',
      timeout: 30_000,
      maxBuffer: 10 * 1024 * 1024, // 10 MB
    });
    return { success: true, text: stdout.trim(), tool: 'pdftotext' };
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    return { success: false, text: '', tool: 'pdftotext', error: `pdftotext failed: ${reason}` };
  }
}
 
/**
 * Extract the text of a PDF that is held in memory.
 *
 * The buffer is written to a uniquely named scratch file
 * (`riksdag-pdf-<uuid>.pdf`) inside `tempDir`, that file is handed to
 * `convertPdfToText`, and the scratch file is removed afterwards whether or
 * not conversion succeeded. Removal errors are ignored.
 *
 * @param pdfBuffer - PDF content as a Buffer.
 * @param tempDir   - Directory for temporary file storage (defaults to the OS temp dir).
 * @returns Conversion result with text content.
 * @throws Propagates any error raised while writing the scratch file.
 */
export function convertPdfBufferToText(
  pdfBuffer: Buffer,
  tempDir: string = os.tmpdir(),
): ConversionResult {
  const scratchName = `riksdag-pdf-${crypto.randomUUID()}.pdf`;
  const scratchPath = path.join(tempDir, scratchName);

  const discardScratchFile = (): void => {
    try {
      fs.unlinkSync(scratchPath);
    } catch {
      /* temp file cleanup is best-effort */
    }
  };

  try {
    fs.writeFileSync(scratchPath, pdfBuffer);
    return convertPdfToText(scratchPath);
  } finally {
    discardScratchFile();
  }
}
 
/**
 * Exclusive lower bound on heading length: a line must be strictly longer than
 * this many characters to be considered a heading candidate.
 */
const MIN_HEADING_LENGTH = 3;
/**
 * Exclusive upper bound on heading length: a line must be strictly shorter than
 * this many characters (longer lines are likely paragraphs).
 */
const MAX_HEADING_LENGTH = 120;

/**
 * Convert PDF text output to a simple markdown format.
 * Applies basic heuristics:
 *   - Lines that look like headings (ALL CAPS, short) become `## ` headings,
 *     separated from surrounding content by exactly one blank line
 *   - Paragraph breaks are preserved; runs of blank lines collapse to one
 *   - Leading/trailing whitespace is stripped from every line and from the result
 *
 * @param text - Plain text, e.g. as produced by `pdftotext`.
 * @returns Markdown text, or an empty string for empty input.
 */
export function textToMarkdown(text: string): string {
  if (!text) return '';

  const result: string[] = [];
  // At least one (Swedish) uppercase letter is required so that lines made only
  // of digits/punctuation — which also equal their own upper-case form — are
  // not mistaken for headings.
  const hasUppercaseLetter = /[A-ZÅÄÖ]/;

  // Append a blank separator unless the output is still empty or already ends
  // with one. Used for both paragraph breaks and heading spacing so the two
  // can never stack into consecutive blank lines.
  const pushBlank = (): void => {
    if (result.length > 0 && result[result.length - 1] !== '') {
      result.push('');
    }
  };

  for (const line of text.split('\n')) {
    const trimmed = line.trim();

    // Empty line: keep a single paragraph break.
    if (!trimmed) {
      pushBlank();
      continue;
    }

    // Heuristic: short ALL CAPS lines are likely headings in Swedish parliamentary PDFs
    const looksLikeHeading =
      trimmed.length > MIN_HEADING_LENGTH &&
      trimmed.length < MAX_HEADING_LENGTH &&
      trimmed === trimmed.toUpperCase() &&
      hasUppercaseLetter.test(trimmed);

    if (looksLikeHeading) {
      pushBlank();
      result.push(`## ${trimmed}`);
      result.push('');
      continue;
    }

    result.push(trimmed);
  }

  // Drops the trailing separator left behind by a final heading or blank line.
  return result.join('\n').trim();
}