Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | 1x 1x 3x 2x 2x 2x 2x 2x 2x 1x 1x 7x 6x 6x 6x 22x 22x 8x 3x 8x 14x 4x 4x 4x 4x 10x 6x | /**
* @module pre-article-analysis/pdf-converter
* @description Converts binary document formats (PDF) to text or markdown.
*
* Uses system tools when available:
* 1. `pdftotext` (poppler-utils) — preferred, preserves layout
* 2. Returns an error with install instructions when no system tools are found
*
* This module only returns converted text — callers are responsible for
* persisting the output (e.g. as `.txt` or `.md` files).
*
* @author Hack23 AB
* @license Apache-2.0
*/
import { execFileSync } from 'node:child_process';
import crypto from 'node:crypto';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export interface ConversionResult {
  /** `true` when text was extracted, `false` on every failure path. */
  success: boolean;
  /** The extracted text; the empty string whenever `success` is `false`. */
  text: string;
  /** Converter that handled the request, or `'none'` when no converter was run. */
  tool: 'pdftotext' | 'none';
  /** Human-readable reason for the failure; only set when conversion failed. */
  error?: string;
}
// ---------------------------------------------------------------------------
// Tool detection
// ---------------------------------------------------------------------------
/** Memoised probe outcome; stays `null` until the first probe has run. */
let _pdftotextAvailable: boolean | null = null;

/**
 * Report whether the `pdftotext` binary (poppler-utils) can be executed.
 *
 * The first call runs `pdftotext -v` with a 5 second timeout; the outcome is
 * stored at module level and returned directly by every later call.
 *
 * @returns `true` if the probe command completed without throwing, otherwise `false`.
 */
export function isPdfToTextAvailable(): boolean {
  if (_pdftotextAvailable === null) {
    let probeSucceeded = true;
    try {
      execFileSync('pdftotext', ['-v'], { stdio: 'pipe', timeout: 5000 });
    } catch {
      // Any failure (missing binary, non-zero exit, timeout) counts as "not available".
      probeSucceeded = false;
    }
    _pdftotextAvailable = probeSucceeded;
  }
  return _pdftotextAvailable;
}
/**
 * Forget the memoised `pdftotext` availability so the next call to
 * `isPdfToTextAvailable()` probes the system again. Intended for tests.
 */
export function resetPdfToolCache(): void {
  _pdftotextAvailable = null;
}
// ---------------------------------------------------------------------------
// Conversion
// ---------------------------------------------------------------------------
/**
 * Extract the text of a PDF file on disk.
 *
 * Checks are performed in this order:
 * 1. the file must exist, otherwise a `'none'` failure is returned;
 * 2. `pdftotext` must be available, otherwise a `'none'` failure with
 *    install instructions is returned;
 * 3. `pdftotext -layout -enc UTF-8 <file> -` is run (30 s timeout, 10 MB
 *    output cap) and its trimmed stdout is returned.
 *
 * Failures are reported through the result object rather than thrown.
 *
 * @param pdfPath - Absolute path to the PDF file.
 * @returns Conversion result with text content.
 */
export function convertPdfToText(pdfPath: string): ConversionResult {
  // Single place that shapes every failure result.
  const failure = (tool: ConversionResult['tool'], error: string): ConversionResult => ({
    success: false,
    text: '',
    tool,
    error,
  });

  if (!fs.existsSync(pdfPath)) {
    return failure('none', `File not found: ${pdfPath}`);
  }

  if (!isPdfToTextAvailable()) {
    return failure(
      'none',
      'No PDF conversion tool available. Install poppler-utils: apt-get install poppler-utils',
    );
  }

  // Trailing '-' makes pdftotext write the text to stdout instead of a file.
  const args = ['-layout', '-enc', 'UTF-8', pdfPath, '-'];
  try {
    const stdout = execFileSync('pdftotext', args, {
      encoding: 'utf8',
      timeout: 30_000,
      maxBuffer: 10 * 1024 * 1024, // 10 MB
    });
    return { success: true, text: stdout.trim(), tool: 'pdftotext' };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure('pdftotext', `pdftotext failed: ${reason}`);
  }
}
/**
 * Extract the text of a PDF held in memory.
 *
 * The buffer is written to a uniquely named scratch file
 * (`riksdag-pdf-<uuid>.pdf`) inside `tempDir`, handed to `convertPdfToText`,
 * and the scratch file is removed again whether or not conversion succeeded.
 * An error while writing the scratch file propagates to the caller.
 *
 * @param pdfBuffer - PDF content as a Buffer.
 * @param tempDir - Directory for the scratch file; defaults to the OS temp directory.
 * @returns Conversion result with text content.
 */
export function convertPdfBufferToText(
  pdfBuffer: Buffer,
  tempDir: string = os.tmpdir(),
): ConversionResult {
  const scratchName = `riksdag-pdf-${crypto.randomUUID()}.pdf`;
  const scratchPath = path.join(tempDir, scratchName);

  let outcome: ConversionResult;
  try {
    fs.writeFileSync(scratchPath, pdfBuffer);
    outcome = convertPdfToText(scratchPath);
  } finally {
    try {
      fs.unlinkSync(scratchPath);
    } catch {
      /* temp file cleanup is best-effort */
    }
  }
  return outcome;
}
/**
 * Exclusive lower bound on heading length: a line must be strictly longer
 * than this many characters to be considered a heading candidate.
 */
const MIN_HEADING_LENGTH = 3;
/**
 * Exclusive upper bound on heading length: a line must be strictly shorter
 * than this many characters (longer lines are likely paragraphs).
 */
const MAX_HEADING_LENGTH = 120;

/**
 * Convert PDF text output to a simple markdown format.
 *
 * Heuristics applied line by line:
 * - every line is trimmed;
 * - runs of empty lines collapse to a single blank line (paragraph break);
 * - a line that is longer than `MIN_HEADING_LENGTH`, shorter than
 *   `MAX_HEADING_LENGTH`, unchanged by `toUpperCase()` and contains at least
 *   one of `A-Z`, `Å`, `Ä`, `Ö` becomes a `## ` heading surrounded by exactly
 *   one blank line on each side;
 * - leading and trailing blank lines are removed from the result.
 *
 * @param text - Raw text, typically the output of `pdftotext`.
 * @returns Markdown text; the empty string when `text` is empty.
 */
export function textToMarkdown(text: string): string {
  if (!text) return '';

  // Hoisted so the pattern is not re-created for every line.
  const uppercaseLetter = /[A-ZÅÄÖ]/;
  const result: string[] = [];

  // Append a blank separator unless the output is empty or already ends with
  // one. Using this for both paragraph breaks and heading padding guarantees
  // the output never contains two consecutive blank lines.
  const pushBlank = (): void => {
    if (result.length > 0 && result[result.length - 1] !== '') {
      result.push('');
    }
  };

  for (const line of text.split('\n')) {
    const trimmed = line.trim();

    // Empty line: keep at most one as a paragraph break.
    if (!trimmed) {
      pushBlank();
      continue;
    }

    // Heuristic: short ALL CAPS lines are likely headings in Swedish parliamentary PDFs.
    // The letter test keeps purely numeric/punctuation lines from becoming headings.
    const isHeading =
      trimmed.length > MIN_HEADING_LENGTH &&
      trimmed.length < MAX_HEADING_LENGTH &&
      trimmed === trimmed.toUpperCase() &&
      uppercaseLetter.test(trimmed);

    if (isHeading) {
      pushBlank();
      result.push(`## ${trimmed}`);
      result.push('');
      continue;
    }

    result.push(trimmed);
  }

  // Drops the blank line left behind by a trailing heading or trailing empty lines.
  return result.join('\n').trim();
}
|