Debugging & Log Handling added
More robustness
This commit is contained in:
parent
8b27d17f44
commit
c9d2551b89
1 changed file with 191 additions and 9 deletions
200
pdf_processor.py
200
pdf_processor.py
|
@ -9,8 +9,183 @@ from typing import List, Dict
|
|||
import time
|
||||
import argparse
|
||||
import sys
|
||||
from tqdm import tqdm
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
class PDFProcessor:
    """Extract magazine metadata from PDF cover pages via the Claude Vision API.

    For each PDF, the first page is rendered to a JPEG in a per-instance
    temp directory, sent to the API for metadata extraction, and the parsed
    JSON results are aggregated into ``processing_results.json``.
    """

    def __init__(self, input_dir: str, output_dir: str, api_key: str, logger: logging.Logger = None):
        """Prepare directories, a temp workspace, the API client, and a logger.

        Args:
            input_dir: Directory containing the PDFs to process.
            output_dir: Directory for results; created if missing.
            api_key: Anthropic API key.
            logger: Optional pre-configured logger; a default is created if omitted.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Scratch directory for rendered page images.
        # NOTE(review): this directory itself is never removed by the class;
        # only individual images are cleaned up by the CLI — confirm intent.
        self.temp_dir = Path(tempfile.mkdtemp())
        self.client = anthropic.Client(api_key=api_key)
        self.logger = logger or self._setup_default_logger()

    def _setup_default_logger(self) -> logging.Logger:
        """Setup default logger if none provided."""
        logger = logging.getLogger('PDFProcessor')
        logger.setLevel(logging.INFO)
        return logger

    def encode_image(self, image_path: str) -> str:
        """Convert image to base64 for API."""
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def analyze_image(self, image_path: str) -> Dict:
        """Analyze a single image using Claude Vision API.

        Returns:
            Dict with keys title/date/publisher/issue_number/confidence.
            On any failure all value fields are None and confidence is "error".
        """
        try:
            self.logger.debug(f"Analyzing image: {image_path}")
            # BUG FIX: the original wrapped this call in
            # `with open(image_path, 'rb') as img:` but never used the handle
            # (encode_image reopens the file itself), keeping a file descriptor
            # open for the whole API round-trip. The redundant open is removed.
            message = self.client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=1000,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Analyze this magazine cover and extract the following metadata:
1. Magazine Title
2. Issue Date/Publication Date
3. Publisher
4. Issue Number

Format your response as JSON with these exact keys:
{
    "title": string,
    "date": string,
    "publisher": string,
    "issue_number": string,
    "confidence": "high|medium|low"
}

If any field cannot be determined, use null. Set confidence based on how clear the information is."""
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": self.encode_image(image_path)
                            }
                        }
                    ]
                }]
            )

            # Parse the JSON response; raises if the model returned non-JSON,
            # which falls through to the error dict below.
            response_text = message.content[0].text
            metadata = json.loads(response_text)
            self.logger.debug(f"Successfully extracted metadata from {image_path}")
            return metadata

        except Exception as e:
            # Best-effort: a failed analysis must not abort the whole batch,
            # so degrade to a sentinel dict the caller can recognize.
            self.logger.error(f"Error analyzing image {image_path}: {str(e)}")
            return {
                "title": None,
                "date": None,
                "publisher": None,
                "issue_number": None,
                "confidence": "error"
            }

    def process_pdfs(self) -> List[Dict]:
        """Process all PDFs in the input directory.

        Returns:
            One result dict per PDF (completed or error); the full list is
            also written to ``<output_dir>/processing_results.json``.
        """
        pdf_files = list(self.input_dir.glob('*.pdf'))
        results = []

        # Progress bar doubles as the status display for the current file.
        pbar = tqdm(pdf_files, desc="Processing PDFs", unit="file")

        for pdf_path in pbar:
            try:
                pbar.set_description(f"Processing {pdf_path.name}")

                result = self.process_single_pdf(pdf_path)
                results.append(result)

                # Surface extraction confidence next to the progress bar.
                confidence = result.get('metadata', {}).get('confidence', 'unknown')
                pbar.set_postfix(confidence=confidence)

                # Small delay to respect API rate limits.
                time.sleep(1)

            except Exception as e:
                # Record the failure and keep going with the remaining files.
                self.logger.error(f"Error processing {pdf_path}: {str(e)}")
                results.append({
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e)
                })

        # Persist the aggregate results as UTF-8 JSON.
        results_file = self.output_dir / 'processing_results.json'
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

        self.logger.info(f"Results saved to {results_file}")
        return results

    def process_single_pdf(self, pdf_path: Path) -> Dict:
        """Process a single PDF file: render page 1, analyze it, return metadata.

        Raises:
            RuntimeError: if the first page cannot be rendered (still an
                Exception subclass, so existing callers' handlers are unaffected).
        """
        self.logger.info(f"Processing: {pdf_path}")

        # Convert first page to image (only the cover is needed).
        self.logger.debug(f"Converting first page of {pdf_path} to image")
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            # Was a bare `Exception`; narrowed while remaining catchable.
            raise RuntimeError("Could not extract first page")

        # Save first page image into the temp workspace.
        first_page = images[0]
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        first_page.save(str(image_path), 'JPEG')
        self.logger.debug(f"Saved first page as image: {image_path}")

        # Analyze the image.
        metadata = self.analyze_image(str(image_path))

        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path),
            'metadata': metadata,
            'status': 'completed',
            'processed_at': datetime.now().isoformat()
        }
|
||||
|
||||
def setup_logging(output_dir: Path, debug: bool = False) -> logging.Logger:
    """Setup logging configuration.

    Creates a console handler (INFO) and a timestamped file handler (DEBUG)
    on the shared 'PDFProcessor' logger.

    Args:
        output_dir: Directory where the timestamped log file is written.
        debug: When True, the logger level is DEBUG instead of INFO.

    Returns:
        The configured 'PDFProcessor' logger.
    """
    logger = logging.getLogger('PDFProcessor')
    logger.setLevel(logging.DEBUG if debug else logging.INFO)

    # BUG FIX: getLogger returns a process-wide singleton, so every call to
    # this function used to stack another console+file handler pair and each
    # message was then emitted multiple times. Remove stale handlers first.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    # Create handlers: console stays at INFO, file captures full DEBUG detail.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    file_handler = logging.FileHandler(
        output_dir / f'pdf_processor_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
    )
    file_handler.setLevel(logging.DEBUG)

    # Create formatters: terse for the console, timestamped for the file.
    console_formatter = logging.Formatter('%(message)s')
    file_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Set formatters
    console_handler.setFormatter(console_formatter)
    file_handler.setFormatter(file_formatter)

    # Add handlers
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger
|
||||
def __init__(self, input_dir: str, output_dir: str, api_key: str):
|
||||
self.input_dir = Path(input_dir)
|
||||
self.output_dir = Path(output_dir)
|
||||
|
@ -172,11 +347,17 @@ Examples:
|
|||
action='store_true',
|
||||
help='Keep temporary image files after processing')
|
||||
|
||||
parser.add_argument('--debug',
|
||||
action='store_true',
|
||||
help='Enable debug logging')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate directories
|
||||
input_dir = Path(args.input_dir)
|
||||
output_dir = Path(args.output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
logger = setup_logging(output_dir, args.debug)
|
||||
|
||||
if not input_dir.exists():
|
||||
print(f"Error: Input directory '{input_dir}' does not exist")
|
||||
|
@ -190,43 +371,44 @@ Examples:
|
|||
sys.exit(1)
|
||||
|
||||
try:
|
||||
logger.info("Starting PDF processing...")
|
||||
# Initialize processor
|
||||
processor = PDFProcessor(str(input_dir), str(output_dir), api_key)
|
||||
|
||||
# Modify glob pattern if in test mode
|
||||
if args.test:
|
||||
print("Running in test mode - will process only first PDF file")
|
||||
logger.info("Running in test mode - will process only first PDF file")
|
||||
pdf_files = list(input_dir.glob(args.pattern))[:1]
|
||||
else:
|
||||
pdf_files = list(input_dir.glob(args.pattern))
|
||||
|
||||
if not pdf_files:
|
||||
print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
|
||||
logger.info(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Found {len(pdf_files)} PDF files to process")
|
||||
logger.info(f"Found {len(pdf_files)} PDF files to process")
|
||||
|
||||
# Process files
|
||||
results = processor.process_pdfs()
|
||||
|
||||
# Cleanup temporary files unless --no-cleanup was specified
|
||||
if not args.no_cleanup:
|
||||
print("Cleaning up temporary files...")
|
||||
logger.info("Cleaning up temporary files...")
|
||||
for result in results:
|
||||
if 'image_path' in result:
|
||||
try:
|
||||
Path(result['image_path']).unlink()
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not delete temporary file {result['image_path']}: {e}")
|
||||
logger.info(f"Warning: Could not delete temporary file {result['image_path']}: {e}")
|
||||
|
||||
print(f"\nProcessed {len(results)} PDF files")
|
||||
print(f"Results saved to: {processor.output_dir}/processing_results.json")
|
||||
logger.info(f"\nProcessed {len(results)} PDF files")
|
||||
logger.info(f"Results saved to: {processor.output_dir}/processing_results.json")
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nOperation cancelled by user")
|
||||
logger.info("\nOperation cancelled by user")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
logger.error(f"Error: {str(e)}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Add table
Reference in a new issue