From d611992eed57089db9c9c7f3fcfcb7b7e5eb7286 Mon Sep 17 00:00:00 2001
From: sebastian
Date: Wed, 19 Feb 2025 21:42:34 +0000
Subject: [PATCH] pdf_processor.py updated

Including Anthropic API
---
 pdf_processor.py | 146 +++++++++++++++++++++++++++--------------------
 1 file changed, 85 insertions(+), 61 deletions(-)

diff --git a/pdf_processor.py b/pdf_processor.py
index 325d0a5..e936e80 100644
--- a/pdf_processor.py
+++ b/pdf_processor.py
@@ -4,36 +4,90 @@ import json
 from pathlib import Path
 import tempfile
 import base64
+import anthropic
+from typing import List, Dict
+import time
 
 class PDFProcessor:
-    def __init__(self, input_dir, output_dir):
+    def __init__(self, input_dir: str, output_dir: str, api_key: str):
         self.input_dir = Path(input_dir)
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
         self.temp_dir = Path(tempfile.mkdtemp())
-        self.batch_size = 5  # Number of images to process at once
-
-    def encode_image(self, image_path):
-        """Convert image to base64 for analysis"""
+        self.client = anthropic.Client(api_key=api_key)
+
+    def encode_image(self, image_path: str) -> str:
+        """Convert image to base64 for API"""
         with open(image_path, 'rb') as image_file:
             return base64.b64encode(image_file.read()).decode('utf-8')
 
-    def process_pdfs(self):
+    def analyze_image(self, image_path: str) -> Dict:
+        """Analyze a single image using Claude Vision API"""
+        try:
+            with open(image_path, 'rb') as img:
+                message = self.client.messages.create(
+                    model="claude-3-opus-20240229",
+                    max_tokens=1000,
+                    messages=[{
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": """Analyze this magazine cover and extract the following metadata:
+                                1. Magazine Title
+                                2. Issue Date/Publication Date
+                                3. Publisher
+                                4. Issue Number
+
+                                Format your response as JSON with these exact keys:
+                                {
+                                    "title": string,
+                                    "date": string,
+                                    "publisher": string,
+                                    "issue_number": string,
+                                    "confidence": "high|medium|low"
+                                }
+
+                                If any field cannot be determined, use null. Set confidence based on how clear the information is."""
+                            },
+                            {
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": "image/jpeg",
+                                    "data": self.encode_image(image_path)
+                                }
+                            }
+                        ]
+                    }]
+                )
+
+            # Parse the JSON response
+            response_text = message.content[0].text
+            metadata = json.loads(response_text)
+            return metadata
+
+        except Exception as e:
+            print(f"Error analyzing image {image_path}: {str(e)}")
+            return {
+                "title": None,
+                "date": None,
+                "publisher": None,
+                "issue_number": None,
+                "confidence": "error"
+            }
+
+    def process_pdfs(self) -> List[Dict]:
         """Process all PDFs in the input directory"""
         pdf_files = list(self.input_dir.glob('*.pdf'))
         results = []
-        current_batch = []
 
         for pdf_path in pdf_files:
             try:
-                batch_item = self.prepare_single_pdf(pdf_path)
-                current_batch.append(batch_item)
-
-                # Process batch when it reaches batch_size
-                if len(current_batch) >= self.batch_size:
-                    self.process_batch(current_batch, results)
-                    current_batch = []
-
+                result = self.process_single_pdf(pdf_path)
+                results.append(result)
+                # Small delay to respect API rate limits
+                time.sleep(1)
             except Exception as e:
                 print(f"Error processing {pdf_path}: {str(e)}")
                 results.append({
@@ -42,19 +96,15 @@ class PDFProcessor:
                     'error': str(e)
                 })
 
-        # Process remaining files in the last batch
-        if current_batch:
-            self.process_batch(current_batch, results)
-
-        # Save final results to JSON
+        # Save results to JSON
         with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f:
             json.dump(results, f, indent=4, ensure_ascii=False)
 
         return results
 
-    def prepare_single_pdf(self, pdf_path):
-        """Prepare a single PDF file for analysis"""
-        print(f"Preparing: {pdf_path}")
+    def process_single_pdf(self, pdf_path: Path) -> Dict:
+        """Process a single PDF file"""
+        print(f"Processing: {pdf_path}")
 
         # Convert first page to image
         images = convert_from_path(pdf_path, first_page=1, last_page=1)
@@ -66,52 +116,26 @@ class PDFProcessor:
         image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
         first_page.save(str(image_path), 'JPEG')
 
+        # Analyze the image
+        metadata = self.analyze_image(str(image_path))
+
         return {
             'pdf_path': str(pdf_path),
-            'image_path': str(image_path)
+            'image_path': str(image_path),
+            'metadata': metadata,
+            'status': 'completed'
         }
 
-    def process_batch(self, batch_items, results):
-        """Process a batch of prepared PDFs"""
-        print(f"\nProcessing batch of {len(batch_items)} files...")
-
-        # Here you would interact with me (Claude) to analyze the images
-        # For each image in the batch:
-        for item in batch_items:
-            image_path = item['image_path']
-            pdf_path = item['pdf_path']
-
-            # Convert image to base64
-            image_data = self.encode_image(image_path)
-
-            # You would need to ask me to analyze this image
-            # For now, we'll save placeholder metadata
-            metadata = {
-                'title': None,
-                'date': None,
-                'publisher': None,
-                'issue_number': None,
-                'confidence': 'pending_analysis'
-            }
-
-            results.append({
-                'pdf_path': pdf_path,
-                'image_path': str(image_path),
-                'metadata': metadata,
-                'status': 'pending_analysis'
-            })
-
-    def save_metadata(self, results):
-        """Save the extracted metadata back to PDFs or to a database"""
-        # TODO: Implement metadata saving functionality
-        pass
-
 def main():
-    # Example usage
+    # Get API key from environment variable
+    api_key = os.getenv('ANTHROPIC_API_KEY')
+    if not api_key:
+        raise ValueError("ANTHROPIC_API_KEY environment variable not set")
+
     input_dir = "path/to/pdfs"
     output_dir = "path/to/output"
 
-    processor = PDFProcessor(input_dir, output_dir)
+    processor = PDFProcessor(input_dir, output_dir, api_key)
     results = processor.process_pdfs()
 
     print(f"\nProcessed {len(results)} PDF files")
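
A minimal usage sketch of the patched class follows; it is illustrative only and not part of the patch. It assumes pdf_processor.py is importable from the working directory, reuses the placeholder paths from main(), and expects the ANTHROPIC_API_KEY environment variable to be exported beforehand, since the patched main() reads the key only from the environment.

    # Hypothetical usage sketch (not part of the patch) -- mirrors the patched main().
    import os

    from pdf_processor import PDFProcessor  # assumes pdf_processor.py is on the import path

    api_key = os.getenv('ANTHROPIC_API_KEY')  # same lookup as the patched main()
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable not set")

    processor = PDFProcessor(
        input_dir="path/to/pdfs",      # directory scanned for *.pdf files
        output_dir="path/to/output",   # processing_results.json is written here
        api_key=api_key,
    )
    results = processor.process_pdfs()

    # Successful entries carry 'metadata' and 'status'; failed ones carry 'error'.
    for entry in results:
        print(entry.get('metadata') or entry.get('error'))

Because analyze_image swallows API failures and returns a confidence of "error", and process_pdfs catches per-file exceptions, the results list (and processing_results.json) always contains one entry per input PDF.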