import json
import os
import shutil
import tempfile
from pathlib import Path

from pdf2image import convert_from_path


class PDFProcessor:
    """Render the first page of every PDF in a directory to a JPEG image.

    Images are written to a freshly created temporary directory
    (``self.temp_dir``); a JSON report of the run is written to
    ``output_dir / 'processing_results.json'``.

    The temporary directory is NOT removed automatically, because the
    returned results (and the JSON report) contain paths into it. Call
    :meth:`cleanup`, or use the instance as a context manager, once the
    images are no longer needed.
    """

    def __init__(self, input_dir, output_dir):
        """Set up the input/output locations and the image scratch directory.

        Args:
            input_dir: Directory that is searched for ``*.pdf`` files.
            output_dir: Directory for the JSON report; created (including
                parents) if it does not exist.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Create temp directory for images. mkdtemp() leaves deletion to the
        # caller, hence cleanup() below.
        self.temp_dir = Path(tempfile.mkdtemp())

    def __enter__(self):
        """Return the processor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Remove the temporary image directory when the ``with`` block ends."""
        self.cleanup()

    def cleanup(self):
        """Delete the temporary image directory and everything in it.

        Safe to call more than once; errors during removal are ignored.
        After this call the ``image_path`` entries of earlier results no
        longer point at existing files.
        """
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def process_pdfs(self):
        """Process all PDFs in the input directory.

        Every ``*.pdf`` directly inside ``input_dir`` is handed to
        :meth:`process_single_pdf`. A failure on one file is printed and
        recorded as an ``'error'`` entry; it does not stop the run.

        Returns:
            A list with one result dict per PDF, in sorted path order. The
            same list is written to ``processing_results.json`` in
            ``output_dir``.
        """
        results = []

        # glob() yields entries in arbitrary order; sort so that the report
        # is reproducible between runs.
        for pdf_path in sorted(self.input_dir.glob('*.pdf')):
            try:
                result = self.process_single_pdf(pdf_path)
            except Exception as e:
                # Per-file boundary: record the failure and keep going.
                print(f"Error processing {pdf_path}: {e}")
                result = {
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e)
                }
            results.append(result)

        # Save results to JSON
        results_path = self.output_dir / 'processing_results.json'
        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)

        return results

    def process_single_pdf(self, pdf_path):
        """Process a single PDF file.

        Converts page 1 of the PDF to an image and saves it as
        ``<stem>_page1.jpg`` inside ``self.temp_dir``.

        Args:
            pdf_path: Path (``str`` or ``Path``) of the PDF to process.

        Returns:
            A dict with the keys ``'pdf_path'``, ``'image_path'`` and
            ``'status'`` (always ``'image_extracted'``).

        Raises:
            RuntimeError: If the conversion produced no image.
        """
        # Accept plain strings as well as Path objects; .stem is needed below.
        pdf_path = Path(pdf_path)
        print(f"Processing: {pdf_path}")

        # Convert first page to image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)

        if not images:
            raise RuntimeError("Could not extract first page")

        # Save first page image
        first_page = images[0]
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        first_page.save(str(image_path), 'JPEG')

        # TODO: This is where we'll integrate with the vision analysis
        # For now, we'll just return the paths
        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path),
            'status': 'image_extracted'
        }


def main():
    """Run the processor on example directories and print a short summary."""
    # Example usage
    input_dir = "path/to/pdfs"
    output_dir = "path/to/output"

    # The extracted images are intentionally left in processor.temp_dir:
    # the JSON report refers to them. Call processor.cleanup() when done.
    processor = PDFProcessor(input_dir, output_dir)
    results = processor.process_pdfs()

    print(f"\nProcessed {len(results)} PDF files")
    print(f"Results saved to: {processor.output_dir}/processing_results.json")


if __name__ == "__main__":
    main()