diff --git a/pdf_processor.py b/pdf_processor.py index 2871bbf..325d0a5 100644 --- a/pdf_processor.py +++ b/pdf_processor.py @@ -3,25 +3,37 @@ import os import json from pathlib import Path import tempfile +import base64 class PDFProcessor: def __init__(self, input_dir, output_dir): self.input_dir = Path(input_dir) self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) - - # Create temp directory for images self.temp_dir = Path(tempfile.mkdtemp()) + self.batch_size = 5 # Number of images to process at once + + def encode_image(self, image_path): + """Convert image to base64 for analysis""" + with open(image_path, 'rb') as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') def process_pdfs(self): """Process all PDFs in the input directory""" pdf_files = list(self.input_dir.glob('*.pdf')) results = [] + current_batch = [] for pdf_path in pdf_files: try: - result = self.process_single_pdf(pdf_path) - results.append(result) + batch_item = self.prepare_single_pdf(pdf_path) + current_batch.append(batch_item) + + # Process batch when it reaches batch_size + if len(current_batch) >= self.batch_size: + self.process_batch(current_batch, results) + current_batch = [] + except Exception as e: print(f"Error processing {pdf_path}: {str(e)}") results.append({ @@ -30,15 +42,19 @@ class PDFProcessor: 'error': str(e) }) - # Save results to JSON - with open(self.output_dir / 'processing_results.json', 'w') as f: - json.dump(results, f, indent=4) + # Process remaining files in the last batch + if current_batch: + self.process_batch(current_batch, results) + + # Save final results to JSON + with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f: + json.dump(results, f, indent=4, ensure_ascii=False) return results - def process_single_pdf(self, pdf_path): - """Process a single PDF file""" - print(f"Processing: {pdf_path}") + def prepare_single_pdf(self, pdf_path): + """Prepare a single PDF file for analysis""" + print(f"Preparing: {pdf_path}") # Convert first page to image images = convert_from_path(pdf_path, first_page=1, last_page=1) @@ -50,14 +66,46 @@ class PDFProcessor: image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg" first_page.save(str(image_path), 'JPEG') - # TODO: This is where we'll integrate with the vision analysis - # For now, we'll just return the paths return { 'pdf_path': str(pdf_path), - 'image_path': str(image_path), - 'status': 'image_extracted' + 'image_path': str(image_path) } + def process_batch(self, batch_items, results): + """Process a batch of prepared PDFs""" + print(f"\nProcessing batch of {len(batch_items)} files...") + + # Here you would interact with me (Claude) to analyze the images + # For each image in the batch: + for item in batch_items: + image_path = item['image_path'] + pdf_path = item['pdf_path'] + + # Convert image to base64 + image_data = self.encode_image(image_path) + + # You would need to ask me to analyze this image + # For now, we'll save placeholder metadata + metadata = { + 'title': None, + 'date': None, + 'publisher': None, + 'issue_number': None, + 'confidence': 'pending_analysis' + } + + results.append({ + 'pdf_path': pdf_path, + 'image_path': str(image_path), + 'metadata': metadata, + 'status': 'pending_analysis' + }) + + def save_metadata(self, results): + """Save the extracted metadata back to PDFs or to a database""" + # TODO: Implement metadata saving functionality + pass + def main(): # Example usage input_dir = "path/to/pdfs"