diff --git a/pdf_processor.py b/pdf_processor.py
index c9d4848..41138ef 100644
--- a/pdf_processor.py
+++ b/pdf_processor.py
@@ -9,8 +9,183 @@
 from typing import List, Dict
 import time
 import argparse
 import sys
+from tqdm import tqdm
+import logging
+from datetime import datetime
 
 class PDFProcessor:
+    def __init__(self, input_dir: str, output_dir: str, api_key: str, logger: logging.Logger = None):
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.temp_dir = Path(tempfile.mkdtemp())
+        self.client = anthropic.Client(api_key=api_key)
+        self.logger = logger or self._setup_default_logger()
+
+    def _setup_default_logger(self) -> logging.Logger:
+        """Set up a default logger if none is provided"""
+        logger = logging.getLogger('PDFProcessor')
+        logger.setLevel(logging.INFO)
+        return logger
+
+    def encode_image(self, image_path: str) -> str:
+        """Convert an image to base64 for the API"""
+        with open(image_path, 'rb') as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def analyze_image(self, image_path: str) -> Dict:
+        """Analyze a single image using the Claude Vision API"""
+        try:
+            self.logger.debug(f"Analyzing image: {image_path}")
+            message = self.client.messages.create(
+                model="claude-3-opus-20240229",
+                max_tokens=1000,
+                messages=[{
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": """Analyze this magazine cover and extract the following metadata:
+                            1. Magazine Title
+                            2. Issue Date/Publication Date
+                            3. Publisher
+                            4. Issue Number
+
+                            Format your response as JSON with these exact keys:
+                            {
+                                "title": string,
+                                "date": string,
+                                "publisher": string,
+                                "issue_number": string,
+                                "confidence": "high|medium|low"
+                            }
+
+                            If any field cannot be determined, use null.
+                            Set confidence based on how clear the information is."""
+                        },
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": "image/jpeg",
+                                "data": self.encode_image(image_path)
+                            }
+                        }
+                    ]
+                }]
+            )
+
+            # Parse the JSON response
+            response_text = message.content[0].text
+            metadata = json.loads(response_text)
+            self.logger.debug(f"Successfully extracted metadata from {image_path}")
+            return metadata
+
+        except Exception as e:
+            self.logger.error(f"Error analyzing image {image_path}: {str(e)}")
+            return {
+                "title": None,
+                "date": None,
+                "publisher": None,
+                "issue_number": None,
+                "confidence": "error"
+            }
+
+    def process_pdfs(self) -> List[Dict]:
+        """Process all PDFs in the input directory"""
+        pdf_files = list(self.input_dir.glob('*.pdf'))
+        results = []
+
+        # Set up progress bar
+        pbar = tqdm(pdf_files, desc="Processing PDFs", unit="file")
+
+        for pdf_path in pbar:
+            try:
+                # Update progress bar description
+                pbar.set_description(f"Processing {pdf_path.name}")
+
+                result = self.process_single_pdf(pdf_path)
+                results.append(result)
+
+                # Update progress bar postfix with confidence
+                confidence = result.get('metadata', {}).get('confidence', 'unknown')
+                pbar.set_postfix(confidence=confidence)
+
+                # Small delay to respect API rate limits
+                time.sleep(1)
+
+            except Exception as e:
+                self.logger.error(f"Error processing {pdf_path}: {str(e)}")
+                results.append({
+                    'pdf_path': str(pdf_path),
+                    'status': 'error',
+                    'error': str(e)
+                })
+
+        # Save results to JSON
+        results_file = self.output_dir / 'processing_results.json'
+        with open(results_file, 'w', encoding='utf-8') as f:
+            json.dump(results, f, indent=4, ensure_ascii=False)
+
+        self.logger.info(f"Results saved to {results_file}")
+        return results
+
+    def process_single_pdf(self, pdf_path: Path) -> Dict:
+        """Process a single PDF file"""
+        self.logger.info(f"Processing: {pdf_path}")
+
+        # Convert first page to image
+        self.logger.debug(f"Converting first page of {pdf_path} to image")
+        images = convert_from_path(pdf_path, first_page=1, last_page=1)
+        if not images:
+            raise Exception("Could not extract first page")
+
+        # Save first page image
+        first_page = images[0]
+        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
+        first_page.save(str(image_path), 'JPEG')
+        self.logger.debug(f"Saved first page as image: {image_path}")
+
+        # Analyze the image
+        metadata = self.analyze_image(str(image_path))
+
+        return {
+            'pdf_path': str(pdf_path),
+            'image_path': str(image_path),
+            'metadata': metadata,
+            'status': 'completed',
+            'processed_at': datetime.now().isoformat()
+        }
+
+def setup_logging(output_dir: Path, debug: bool = False) -> logging.Logger:
+    """Set up logging configuration"""
+    logger = logging.getLogger('PDFProcessor')
+    logger.setLevel(logging.DEBUG if debug else logging.INFO)
+
+    # Create handlers
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    file_handler = logging.FileHandler(
+        output_dir / f'pdf_processor_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
+    )
+    file_handler.setLevel(logging.DEBUG)
+
+    # Create formatters
+    console_formatter = logging.Formatter('%(message)s')
+    file_formatter = logging.Formatter(
+        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+
+    # Set formatters
+    console_handler.setFormatter(console_formatter)
+    file_handler.setFormatter(file_formatter)
+
+    # Add handlers
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+
+    return logger
+
     def __init__(self, input_dir: str, output_dir: str, api_key: str):
         self.input_dir = Path(input_dir)
         self.output_dir = Path(output_dir)
@@ -172,11 +347,17 @@ Examples:
                         action='store_true',
                         help='Keep temporary image files after processing')
 
+    parser.add_argument('--debug',
+                        action='store_true',
+                        help='Enable debug logging')
+
     args = parser.parse_args()
 
     # Validate directories
     input_dir = Path(args.input_dir)
     output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    logger = setup_logging(output_dir, args.debug)
 
     if not input_dir.exists():
         print(f"Error: Input directory '{input_dir}' does not exist")
@@ -190,43 +371,44 @@ Examples:
         sys.exit(1)
 
     try:
+        logger.info("Starting PDF processing...")
         # Initialize processor
        processor = PDFProcessor(str(input_dir), str(output_dir), api_key)
 
         # Modify glob pattern if in test mode
         if args.test:
-            print("Running in test mode - will process only first PDF file")
+            logger.info("Running in test mode - will process only first PDF file")
             pdf_files = list(input_dir.glob(args.pattern))[:1]
         else:
             pdf_files = list(input_dir.glob(args.pattern))
 
         if not pdf_files:
-            print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
+            logger.error(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
             sys.exit(1)
 
-        print(f"Found {len(pdf_files)} PDF files to process")
+        logger.info(f"Found {len(pdf_files)} PDF files to process")
 
         # Process files
         results = processor.process_pdfs()
 
         # Cleanup temporary files unless --no-cleanup was specified
         if not args.no_cleanup:
-            print("Cleaning up temporary files...")
+            logger.info("Cleaning up temporary files...")
             for result in results:
                 if 'image_path' in result:
                     try:
                         Path(result['image_path']).unlink()
                     except Exception as e:
-                        print(f"Warning: Could not delete temporary file {result['image_path']}: {e}")
+                        logger.warning(f"Could not delete temporary file {result['image_path']}: {e}")
 
-        print(f"\nProcessed {len(results)} PDF files")
-        print(f"Results saved to: {processor.output_dir}/processing_results.json")
+        logger.info(f"Processed {len(results)} PDF files")
+        logger.info(f"Results saved to: {processor.output_dir}/processing_results.json")
 
     except KeyboardInterrupt:
-        print("\nOperation cancelled by user")
+        logger.info("Operation cancelled by user")
         sys.exit(1)
     except Exception as e:
-        print(f"Error: {str(e)}")
+        logger.error(f"Error: {str(e)}")
         sys.exit(1)
 
 if __name__ == "__main__":
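For reference, a minimal sketch of how the pieces introduced in this diff fit together when driven from Python instead of the CLI. The directory names and the ANTHROPIC_API_KEY environment variable are illustrative assumptions, not part of this change; setup_logging() and the PDFProcessor(..., logger=...) parameter are the additions above.

# Hypothetical driver script; the paths and the env var name are assumptions.
import os
from pathlib import Path

from pdf_processor import PDFProcessor, setup_logging

output_dir = Path("output")                     # assumed output directory
output_dir.mkdir(parents=True, exist_ok=True)   # setup_logging() writes its log file here

logger = setup_logging(output_dir, debug=True)  # console handler + timestamped file handler
processor = PDFProcessor(
    "input_pdfs",                               # assumed input directory of PDFs
    str(output_dir),
    api_key=os.environ["ANTHROPIC_API_KEY"],    # assumed env var holding the API key
    logger=logger,
)
results = processor.process_pdfs()              # also writes processing_results.json to output_dir
logger.info("Processed %d PDFs", len(results))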