import base64
import json
import os
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List

import anthropic
from pdf2image import convert_from_path

# Instruction text sent with every cover image.
# NOTE(review): the whitespace inside this prompt is kept exactly as found;
# the original line layout could not be recovered -- confirm before reflowing.
_METADATA_PROMPT = (
    'Analyze this magazine cover and extract the following metadata: '
    '1. Magazine Title 2. Issue Date/Publication Date 3. Publisher '
    '4. Issue Number '
    'Format your response as JSON with these exact keys: '
    '{ "title": string, "date": string, "publisher": string, '
    '"issue_number": string, "confidence": "high|medium|low" } '
    'If any field cannot be determined, use null. \n'
    'Set confidence based on how clear the information is.'
)


class PDFProcessor:
    """Extract cover metadata from PDFs by sending page 1 to a vision model.

    For each ``*.pdf`` in ``input_dir`` the first page is rendered to a JPEG
    in a temporary directory, sent to the Anthropic Messages API, and the
    JSON answer is collected into ``processing_results.json`` in
    ``output_dir``.
    """

    def __init__(self, input_dir: str, output_dir: str, api_key: str):
        """Set up directories and the API client.

        Args:
            input_dir: Directory that is searched for ``*.pdf`` files.
            output_dir: Directory for the results file; created if missing.
            api_key: Key passed to ``anthropic.Client``.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Rendered page images are written here. The directory is not removed
        # by this class; returned results reference files stored in it.
        self.temp_dir = Path(tempfile.mkdtemp())
        self.client = anthropic.Client(api_key=api_key)

    def encode_image(self, image_path: str) -> str:
        """Return the file at *image_path* as a base64-encoded ASCII string."""
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _parse_metadata(response_text: str) -> Dict[str, Any]:
        """Decode the model's reply into a metadata dict.

        The reply is parsed as JSON directly; if that fails, the text between
        the first ``{`` and the last ``}`` is parsed instead, so a reply that
        wraps the object in prose or a Markdown code fence is still accepted.

        Raises:
            json.JSONDecodeError: If no parseable JSON is found.
            ValueError: If the decoded JSON is not an object.
        """
        try:
            parsed = json.loads(response_text)
        except json.JSONDecodeError:
            start = response_text.find('{')
            end = response_text.rfind('}')
            if start == -1 or end <= start:
                raise
            parsed = json.loads(response_text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("Model response is not a JSON object")
        return parsed

    def analyze_image(self, image_path: str) -> Dict:
        """Analyze a single image using the Claude vision API.

        Args:
            image_path: Path of a JPEG image to send.

        Returns:
            The decoded JSON object from the model's reply. If anything
            fails (reading the file, the API call, or decoding the reply),
            the error is printed and a dict with every metadata field set to
            ``None`` and ``"confidence": "error"`` is returned instead.
        """
        try:
            message = self.client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=1000,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": _METADATA_PROMPT,
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": self.encode_image(image_path),
                            },
                        },
                    ],
                }],
            )
            return self._parse_metadata(message.content[0].text)
        except Exception as e:
            # Deliberate best-effort boundary: one bad image must not abort
            # the batch, so report the failure and return a placeholder.
            print(f"Error analyzing image {image_path}: {e}")
            return {
                "title": None,
                "date": None,
                "publisher": None,
                "issue_number": None,
                "confidence": "error",
            }

    def process_pdfs(self) -> List[Dict]:
        """Process all PDFs in the input directory.

        Files are handled in sorted path order so the results file is
        reproducible between runs. A failure on one PDF is recorded as an
        entry with ``'status': 'error'`` and processing continues.

        Returns:
            One result dict per PDF; the same list is written to
            ``processing_results.json`` in the output directory.
        """
        pdf_files = sorted(self.input_dir.glob('*.pdf'))
        results = []

        for pdf_path in pdf_files:
            try:
                result = self.process_single_pdf(pdf_path)
                results.append(result)
                # Small delay to respect API rate limits
                time.sleep(1)
            except Exception as e:
                print(f"Error processing {pdf_path}: {e}")
                results.append({
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e),
                })

        # Save results to JSON
        results_file = self.output_dir / 'processing_results.json'
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

        return results

    def process_single_pdf(self, pdf_path: Path) -> Dict:
        """Render the first page of *pdf_path* and analyze it.

        Args:
            pdf_path: PDF file to process.

        Returns:
            A dict with ``pdf_path``, ``image_path`` (the rendered JPEG in
            the temporary directory), ``metadata`` and ``status``.

        Raises:
            RuntimeError: If the conversion yields no page image.
        """
        print(f"Processing: {pdf_path}")

        # Convert first page to image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            raise RuntimeError("Could not extract first page")

        # Save first page image
        first_page = images[0]
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        first_page.save(str(image_path), 'JPEG')

        # Analyze the image
        metadata = self.analyze_image(str(image_path))

        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path),
            'metadata': metadata,
            'status': 'completed',
        }


def main():
    """Run the processor over the configured directories.

    Raises:
        ValueError: If ``ANTHROPIC_API_KEY`` is not set in the environment.
    """
    # Get API key from environment variable
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable not set")

    input_dir = "path/to/pdfs"
    output_dir = "path/to/output"

    processor = PDFProcessor(input_dir, output_dir, api_key)
    results = processor.process_pdfs()

    print(f"\nProcessed {len(results)} PDF files")
    print(f"Results saved to: {processor.output_dir}/processing_results.json")


if __name__ == "__main__":
    main()