pdf_processor.py aktualisiert

Including Anthropic API
2025-02-19 21:42:34 +00:00 · 2025-02-19 21:42:34 +00:00 · d611992eed
commit d611992eed
parent 2a3e445c23
1 changed files with 85 additions and 61 deletions
--- a/pdf_processor.py
+++ b/pdf_processor.py
@@ -4,36 +4,90 @@ import json
 from pathlib import Path
 import tempfile
 import base64
+import anthropic
+from typing import List, Dict
+import time

 class PDFProcessor:
-    def __init__(self, input_dir, output_dir):
+    def __init__(self, input_dir: str, output_dir: str, api_key: str):
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir = Path(tempfile.mkdtemp())
-        self.batch_size = 5  # Number of images to process at once
+        self.client = anthropic.Client(api_key=api_key)
        
-    def encode_image(self, image_path):
-        """Convert image to base64 for analysis"""
+    def encode_image(self, image_path: str) -> str:
+        """Convert image to base64 for API"""
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

-    def process_pdfs(self):
+    def analyze_image(self, image_path: str) -> Dict:
+        """Analyze a single image using Claude Vision API"""
+        try:
+            with open(image_path, 'rb') as img:
+                message = self.client.messages.create(
+                    model="claude-3-opus-20240229",
+                    max_tokens=1000,
+                    messages=[{
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": """Analyze this magazine cover and extract the following metadata:
+                                1. Magazine Title
+                                2. Issue Date/Publication Date
+                                3. Publisher
+                                4. Issue Number
+                                
+                                Format your response as JSON with these exact keys:
+                                {
+                                    "title": string,
+                                    "date": string,
+                                    "publisher": string,
+                                    "issue_number": string,
+                                    "confidence": "high|medium|low"
+                                }
+                                
+                                If any field cannot be determined, use null. Set confidence based on how clear the information is."""
+                            },
+                            {
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": "image/jpeg",
+                                    "data": self.encode_image(image_path)
+                                }
+                            }
+                        ]
+                    }]
+                )
+                
+                # Parse the JSON response
+                response_text = message.content[0].text
+                metadata = json.loads(response_text)
+                return metadata
+
+        except Exception as e:
+            print(f"Error analyzing image {image_path}: {str(e)}")
+            return {
+                "title": None,
+                "date": None,
+                "publisher": None,
+                "issue_number": None,
+                "confidence": "error"
+            }
+
+    def process_pdfs(self) -> List[Dict]:
        """Process all PDFs in the input directory"""
        pdf_files = list(self.input_dir.glob('*.pdf'))
        results = []
-        current_batch = []
        
        for pdf_path in pdf_files:
            try:
-                batch_item = self.prepare_single_pdf(pdf_path)
-                current_batch.append(batch_item)
-                
-                # Process batch when it reaches batch_size
-                if len(current_batch) >= self.batch_size:
-                    self.process_batch(current_batch, results)
-                    current_batch = []
-                    
+                result = self.process_single_pdf(pdf_path)
+                results.append(result)
+                # Small delay to respect API rate limits
+                time.sleep(1)
            except Exception as e:
                print(f"Error processing {pdf_path}: {str(e)}")
                results.append({
@@ -42,19 +96,15 @@ class PDFProcessor:
                    'error': str(e)
                })
        
-        # Process remaining files in the last batch
-        if current_batch:
-            self.process_batch(current_batch, results)
-        
-        # Save final results to JSON
+        # Save results to JSON
        with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)
        
        return results

-    def prepare_single_pdf(self, pdf_path):
-        """Prepare a single PDF file for analysis"""
-        print(f"Preparing: {pdf_path}")
+    def process_single_pdf(self, pdf_path: Path) -> Dict:
+        """Process a single PDF file"""
+        print(f"Processing: {pdf_path}")
        
        # Convert first page to image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
@@ -66,52 +116,26 @@ class PDFProcessor:
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        first_page.save(str(image_path), 'JPEG')
        
+        # Analyze the image
+        metadata = self.analyze_image(str(image_path))
+        
        return {
            'pdf_path': str(pdf_path),
-            'image_path': str(image_path)
+            'image_path': str(image_path),
+            'metadata': metadata,
+            'status': 'completed'
        }

-    def process_batch(self, batch_items, results):
-        """Process a batch of prepared PDFs"""
-        print(f"\nProcessing batch of {len(batch_items)} files...")
-        
-        # Here you would interact with me (Claude) to analyze the images
-        # For each image in the batch:
-        for item in batch_items:
-            image_path = item['image_path']
-            pdf_path = item['pdf_path']
-            
-            # Convert image to base64
-            image_data = self.encode_image(image_path)
-            
-            # You would need to ask me to analyze this image
-            # For now, we'll save placeholder metadata
-            metadata = {
-                'title': None,
-                'date': None,
-                'publisher': None,
-                'issue_number': None,
-                'confidence': 'pending_analysis'
-            }
-            
-            results.append({
-                'pdf_path': pdf_path,
-                'image_path': str(image_path),
-                'metadata': metadata,
-                'status': 'pending_analysis'
-            })
-
-    def save_metadata(self, results):
-        """Save the extracted metadata back to PDFs or to a database"""
-        # TODO: Implement metadata saving functionality
-        pass
-
 def main():
-    # Example usage
+    # Get API key from environment variable
+    api_key = os.getenv('ANTHROPIC_API_KEY')
+    if not api_key:
+        raise ValueError("ANTHROPIC_API_KEY environment variable not set")
+    
    input_dir = "path/to/pdfs"
    output_dir = "path/to/output"
    
-    processor = PDFProcessor(input_dir, output_dir)
+    processor = PDFProcessor(input_dir, output_dir, api_key)
    results = processor.process_pdfs()
    
    print(f"\nProcessed {len(results)} PDF files")