pdf_processor.py aktualisiert

Entwicklung zur vision capability Nutzung
2025-02-19 21:38:51 +00:00 · 2025-02-19 21:38:51 +00:00 · df4e34ce8f
commit df4e34ce8f
parent 512852f8ef
1 changed files with 62 additions and 14 deletions
--- a/pdf_processor.py
+++ b/pdf_processor.py
@@ -3,25 +3,37 @@ import os
 import json
 from pathlib import Path
 import tempfile
+import base64

 class PDFProcessor:
    def __init__(self, input_dir, output_dir):
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
-        
-        # Create temp directory for images
        self.temp_dir = Path(tempfile.mkdtemp())
+        self.batch_size = 5  # Number of images to process at once
+
+    def encode_image(self, image_path):
+        """Convert image to base64 for analysis"""
+        with open(image_path, 'rb') as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')

    def process_pdfs(self):
        """Process all PDFs in the input directory"""
        pdf_files = list(self.input_dir.glob('*.pdf'))
        results = []
+        current_batch = []
        
        for pdf_path in pdf_files:
            try:
-                result = self.process_single_pdf(pdf_path)
-                results.append(result)
+                batch_item = self.prepare_single_pdf(pdf_path)
+                current_batch.append(batch_item)
+                
+                # Process batch when it reaches batch_size
+                if len(current_batch) >= self.batch_size:
+                    self.process_batch(current_batch, results)
+                    current_batch = []
+                    
            except Exception as e:
                print(f"Error processing {pdf_path}: {str(e)}")
                results.append({
@@ -30,15 +42,19 @@ class PDFProcessor:
                    'error': str(e)
                })
        
-        # Save results to JSON
-        with open(self.output_dir / 'processing_results.json', 'w') as f:
-            json.dump(results, f, indent=4)
+        # Process remaining files in the last batch
+        if current_batch:
+            self.process_batch(current_batch, results)
+        
+        # Save final results to JSON
+        with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f:
+            json.dump(results, f, indent=4, ensure_ascii=False)
        
        return results

-    def process_single_pdf(self, pdf_path):
-        """Process a single PDF file"""
-        print(f"Processing: {pdf_path}")
+    def prepare_single_pdf(self, pdf_path):
+        """Prepare a single PDF file for analysis"""
+        print(f"Preparing: {pdf_path}")
        
        # Convert first page to image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
@@ -50,14 +66,46 @@ class PDFProcessor:
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        first_page.save(str(image_path), 'JPEG')
        
-        # TODO: This is where we'll integrate with the vision analysis
-        # For now, we'll just return the paths
        return {
            'pdf_path': str(pdf_path),
-            'image_path': str(image_path),
-            'status': 'image_extracted'
+            'image_path': str(image_path)
        }

+    def process_batch(self, batch_items, results):
+        """Process a batch of prepared PDFs"""
+        print(f"\nProcessing batch of {len(batch_items)} files...")
+        
+        # Placeholder: the vision-model analysis of the images belongs here.
+        # For each image in the batch:
+        for item in batch_items:
+            image_path = item['image_path']
+            pdf_path = item['pdf_path']
+            
+            # Convert image to base64
+            image_data = self.encode_image(image_path)
+            
+            # The encoded image is not used yet; the analysis call still has to be added here.
+            # For now, we'll save placeholder metadata
+            metadata = {
+                'title': None,
+                'date': None,
+                'publisher': None,
+                'issue_number': None,
+                'confidence': 'pending_analysis'
+            }
+            
+            results.append({
+                'pdf_path': pdf_path,
+                'image_path': str(image_path),
+                'metadata': metadata,
+                'status': 'pending_analysis'
+            })
+
+    def save_metadata(self, results):
+        """Save the extracted metadata back to PDFs or to a database"""
+        # TODO: Implement metadata saving functionality
+        pass
+
 def main():
    # Example usage
    input_dir = "path/to/pdfs"