All files / scripts/pre-article-analysis pdf-converter.ts

65.9% Statements 29/44
69.56% Branches 16/23
60% Functions 3/5
65.85% Lines 27/41

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172                              1x                                                 1x             3x 2x 2x 2x   2x   2x             2x                                                                                                                               1x   1x                   7x   6x 6x   6x 22x     22x 8x 3x   8x       14x           4x 4x 4x 4x     10x     6x    
/**
 * @module pre-article-analysis/pdf-converter
 * @description Converts binary document formats (PDF) to text or markdown.
 *
 * Uses system tools when available:
 *   1. `pdftotext` (poppler-utils) — preferred, preserves layout
 *   2. Returns an error with install instructions when no system tools are found
 *
 * This module only returns converted text — callers are responsible for
 * persisting the output (e.g. as `.txt` or `.md` files).
 *
 * @author Hack23 AB
 * @license Apache-2.0
 */
 
import { execFileSync } from 'node:child_process';
import crypto from 'node:crypto';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
 
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
 
/**
 * Outcome of a PDF-to-text conversion attempt.
 *
 * Conversion functions in this module report expected failures (missing file,
 * missing tool, tool error) through this object rather than by throwing.
 */
export interface ConversionResult {
  /** `true` when the converter ran and its output was captured; `false` otherwise. */
  success: boolean;
  /** Converted text with surrounding whitespace trimmed; an empty string on failure. */
  text: string;
  /**
   * Tool that was invoked for the conversion. `'none'` means no tool was run,
   * either because the input file was missing or because no converter is installed.
   */
  tool: 'pdftotext' | 'none';
  /** Human-readable failure description; only set when `success` is `false`. */
  error?: string;
}
 
// ---------------------------------------------------------------------------
// Tool detection
// ---------------------------------------------------------------------------
 
/** Memoised probe outcome: `null` until the first probe has run. */
let _pdftotextAvailable: boolean | null = null;

/**
 * Report whether the `pdftotext` binary (poppler-utils) can be executed.
 *
 * The first call probes the system by running `pdftotext -v` with a 5 second
 * timeout; any failure of that call (including a missing binary, a non-zero
 * exit status or a timeout) counts as "not available". The outcome is
 * remembered, so later calls return immediately without spawning a process.
 *
 * @returns `true` when the probe command completed successfully.
 */
export function isPdfToTextAvailable(): boolean {
  if (_pdftotextAvailable === null) {
    let probeSucceeded = true;
    try {
      execFileSync('pdftotext', ['-v'], { stdio: 'pipe', timeout: 5000 });
    } catch {
      probeSucceeded = false;
    }
    _pdftotextAvailable = probeSucceeded;
  }
  return _pdftotextAvailable;
}
 
/**
 * Forget the remembered `pdftotext` availability so that the next
 * availability check probes the system again. Intended for tests.
 */
export function resetPdfToolCache(): void {
  _pdftotextAvailable = null;
}
 
// ---------------------------------------------------------------------------
// Conversion
// ---------------------------------------------------------------------------
 
/**
 * Extract the text of a PDF file on disk.
 *
 * Runs `pdftotext -layout -enc UTF-8 <pdfPath> -` and captures its standard
 * output (30 second timeout, 10 MB output limit). Failures are reported
 * through the returned object instead of being thrown:
 *   - the path does not exist            → `tool: 'none'`
 *   - `pdftotext` is not available       → `tool: 'none'`
 *   - running `pdftotext` raised an error → `tool: 'pdftotext'`
 *
 * @param pdfPath - Absolute path to the PDF file.
 * @returns Conversion result; `text` is the trimmed tool output on success.
 */
export function convertPdfToText(pdfPath: string): ConversionResult {
  // Single place that shapes every failure result.
  const failure = (tool: ConversionResult['tool'], error: string): ConversionResult => ({
    success: false,
    text: '',
    tool,
    error,
  });

  if (!fs.existsSync(pdfPath)) {
    return failure('none', `File not found: ${pdfPath}`);
  }

  if (!isPdfToTextAvailable()) {
    return failure(
      'none',
      'No PDF conversion tool available. Install poppler-utils: apt-get install poppler-utils',
    );
  }

  try {
    // The trailing '-' makes pdftotext write the text to stdout.
    const stdout = execFileSync('pdftotext', ['-layout', '-enc', 'UTF-8', pdfPath, '-'], {
      encoding: 'utf8',
      timeout: 30_000,
      maxBuffer: 10 * 1024 * 1024, // 10 MB
    });
    return { success: true, text: stdout.trim(), tool: 'pdftotext' };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure('pdftotext', `pdftotext failed: ${reason}`);
  }
}
 
/**
 * Extract the text of a PDF held in memory.
 *
 * The buffer is written to a uniquely named temporary file
 * (`riksdag-pdf-<uuid>.pdf`) inside `tempDir`, converted with
 * {@link convertPdfToText}, and the temporary file is removed afterwards
 * whether or not the conversion succeeded. An error raised while writing the
 * temporary file is not caught here and propagates to the caller.
 *
 * @param pdfBuffer - PDF content as a Buffer.
 * @param tempDir   - Directory for the temporary file; defaults to the OS temp directory.
 * @returns Conversion result with text content.
 */
export function convertPdfBufferToText(
  pdfBuffer: Buffer,
  tempDir: string = os.tmpdir(),
): ConversionResult {
  const fileName = `riksdag-pdf-${crypto.randomUUID()}.pdf`;
  const tempFile = path.join(tempDir, fileName);

  // Cleanup is best-effort: a failed delete must never mask the conversion outcome.
  const discardTempFile = (): void => {
    try {
      fs.unlinkSync(tempFile);
    } catch {
      /* temp file cleanup is best-effort */
    }
  };

  try {
    fs.writeFileSync(tempFile, pdfBuffer);
    return convertPdfToText(tempFile);
  } finally {
    discardTempFile();
  }
}
 
/** Exclusive lower bound: a heading candidate must be longer than this many characters. */
const MIN_HEADING_LENGTH = 3;
/** Exclusive upper bound: a heading candidate must be shorter than this (longer lines are likely paragraphs). */
const MAX_HEADING_LENGTH = 120;

/**
 * Convert PDF text output to a simple markdown format.
 *
 * Heuristics applied line by line:
 *   - Every line is trimmed.
 *   - A line that equals its own upper-cased form, contains at least one of
 *     `A-Z`, `Å`, `Ä`, `Ö`, and whose length lies strictly between
 *     {@link MIN_HEADING_LENGTH} and {@link MAX_HEADING_LENGTH} becomes a
 *     `## ` heading surrounded by blank lines.
 *   - Runs of blank lines collapse to a single blank line, including the
 *     separators placed around headings, so the output never contains two
 *     consecutive blank lines.
 *
 * @param text - Plain text, typically the output of a PDF-to-text conversion.
 * @returns Markdown text with leading and trailing whitespace removed; an
 *          empty string when `text` is empty.
 */
export function textToMarkdown(text: string): string {
  if (!text) return '';

  const result: string[] = [];
  const uppercaseLetter = /[A-ZÅÄÖ]/;

  // Append a paragraph separator unless the output is empty or already ends
  // with one; this keeps blank lines from stacking up.
  const pushBlank = (): void => {
    if (result.length > 0 && result[result.length - 1] !== '') {
      result.push('');
    }
  };

  for (const line of text.split('\n')) {
    const trimmed = line.trim();

    // Blank input lines only mark paragraph breaks.
    if (!trimmed) {
      pushBlank();
      continue;
    }

    // Heuristic: short ALL CAPS lines are likely headings in Swedish parliamentary PDFs.
    // The letter test keeps digit/punctuation-only lines (page numbers, dates) out.
    const isHeading =
      trimmed.length > MIN_HEADING_LENGTH &&
      trimmed.length < MAX_HEADING_LENGTH &&
      trimmed === trimmed.toUpperCase() &&
      uppercaseLetter.test(trimmed);

    if (isHeading) {
      // Separate the heading from the preceding content without duplicating
      // a blank line that is already there.
      pushBlank();
      result.push(`## ${trimmed}`);
      pushBlank();
      continue;
    }

    result.push(trimmed);
  }

  return result.join('\n').trim();
}