"""Render the first page of each PDF in a directory and queue it for analysis.

The actual metadata analysis is not implemented in this file; every PDF that
is prepared successfully is recorded with placeholder metadata and the status
``pending_analysis``.
"""

import base64
import json
import os
import shutil
import tempfile
from pathlib import Path

from pdf2image import convert_from_path


class PDFProcessor:
    """Prepare the first page of every PDF in a directory for analysis.

    Attributes are set in ``__init__``:
        input_dir: directory that is scanned for ``*.pdf`` files.
        output_dir: directory that receives ``processing_results.json``.
        temp_dir: freshly created temporary directory for first-page JPEGs.
        batch_size: number of prepared PDFs handed to ``process_batch`` at once.
    """

    def __init__(self, input_dir, output_dir):
        """Store the directories, create ``output_dir`` and a temp directory.

        Args:
            input_dir: path (str or Path) of the directory containing PDFs.
            output_dir: path (str or Path) for the results file; created,
                including missing parents, if it does not exist.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # mkdtemp() never deletes the directory itself; see cleanup().
        self.temp_dir = Path(tempfile.mkdtemp())
        self.batch_size = 5  # Number of images to process at once

    def cleanup(self):
        """Delete the temporary directory holding the first-page images.

        This is never called automatically, because the ``image_path`` values
        stored in the results point into ``temp_dir``. Call it once those
        images are no longer needed.
        """
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def encode_image(self, image_path):
        """Return the content of the file at ``image_path`` as base64 text.

        Args:
            image_path: path of the file to read (opened in binary mode).

        Returns:
            The base64 encoding of the file's bytes, as a ``str``.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def process_pdfs(self):
        """Process all PDFs in the input directory.

        Each PDF is prepared with ``prepare_single_pdf`` and the prepared
        items are passed to ``process_batch`` in groups of ``batch_size``.
        A failure while handling one file is printed and recorded as an error
        entry; the remaining files are still processed. The collected results
        are written to ``processing_results.json`` in ``output_dir``.

        Returns:
            The list of result dicts (one per PDF).
        """
        # Sorted so that the order of the results does not depend on the
        # order in which the filesystem happens to list the directory.
        pdf_files = sorted(self.input_dir.glob('*.pdf'))
        results = []
        current_batch = []

        for pdf_path in pdf_files:
            try:
                current_batch.append(self.prepare_single_pdf(pdf_path))

                # Process batch when it reaches batch_size
                if len(current_batch) >= self.batch_size:
                    # Detach the batch *before* processing it: if
                    # process_batch raises, the same items must not be
                    # submitted again on the next iteration.
                    full_batch, current_batch = current_batch, []
                    self.process_batch(full_batch, results)
            except Exception as e:
                # Deliberately broad: one bad file must not stop the run.
                print(f"Error processing {pdf_path}: {str(e)}")
                results.append({
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e)
                })

        # Process remaining files in the last batch
        if current_batch:
            self.process_batch(current_batch, results)

        # Save final results to JSON
        results_file = self.output_dir / 'processing_results.json'
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

        return results

    def prepare_single_pdf(self, pdf_path):
        """Render the first page of one PDF to a JPEG in ``temp_dir``.

        Args:
            pdf_path: path (str or Path) of the PDF file.

        Returns:
            A dict with the keys ``pdf_path`` and ``image_path`` (both str).

        Raises:
            RuntimeError: if the conversion yields no page image.
        """
        pdf_path = Path(pdf_path)  # also accept plain strings
        print(f"Preparing: {pdf_path}")

        # Convert first page to image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            raise RuntimeError("Could not extract first page")

        # Save first page image
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        images[0].save(str(image_path), 'JPEG')

        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path)
        }

    def process_batch(self, batch_items, results):
        """Process a batch of prepared PDFs, appending to ``results``.

        Args:
            batch_items: iterable of dicts as returned by
                ``prepare_single_pdf`` (keys ``pdf_path`` and ``image_path``).
            results: list that receives one entry per item. An item whose
                image cannot be read gets an error entry instead of aborting
                the rest of the batch.
        """
        print(f"\nProcessing batch of {len(batch_items)} files...")

        for item in batch_items:
            image_path = item['image_path']
            pdf_path = item['pdf_path']

            try:
                # Convert image to base64
                image_data = self.encode_image(image_path)
            except OSError as e:
                print(f"Error processing {pdf_path}: {str(e)}")
                results.append({
                    'pdf_path': pdf_path,
                    'status': 'error',
                    'error': str(e)
                })
                continue

            results.append({
                'pdf_path': pdf_path,
                'image_path': str(image_path),
                'metadata': self._analyze_image(image_data),
                'status': 'pending_analysis'
            })

    def _analyze_image(self, image_data):
        """Return metadata for one base64-encoded page image.

        Stub: no analysis is implemented yet, so ``image_data`` is ignored
        and placeholder metadata is returned. This is the place to plug in
        the real image analysis.
        """
        return {
            'title': None,
            'date': None,
            'publisher': None,
            'issue_number': None,
            'confidence': 'pending_analysis'
        }

    def save_metadata(self, results):
        """Save the extracted metadata back to PDFs or to a database."""
        # TODO: Implement metadata saving functionality
        pass


def main():
    """Run the processor on example directories and print a summary."""
    # Example usage
    input_dir = "path/to/pdfs"
    output_dir = "path/to/output"

    processor = PDFProcessor(input_dir, output_dir)
    results = processor.process_pdfs()

    print(f"\nProcessed {len(results)} PDF files")
    print(f"Results saved to: {processor.output_dir}/processing_results.json")


if __name__ == "__main__":
    main()