Added debugging and log handling.

Improved robustness.
This commit is contained in:
Sebastian Mondial 2025-02-19 21:49:18 +00:00
parent 8b27d17f44
commit c9d2551b89

View file

@ -9,8 +9,183 @@ from typing import List, Dict
import time
import argparse
import sys
from tqdm import tqdm
import logging
from datetime import datetime
class PDFProcessor:
    """Extract magazine-cover metadata from PDF files via the Claude Vision API.

    For each PDF in ``input_dir``, the first page is rendered to a JPEG in a
    private temp directory, sent to the API for analysis, and the parsed
    metadata is aggregated into ``processing_results.json`` in ``output_dir``.
    """

    def __init__(self, input_dir: str, output_dir: str, api_key: str,
                 logger: logging.Logger = None):
        """Initialize directories, API client, and logger.

        Args:
            input_dir: Directory containing the PDF files to process.
            output_dir: Directory for results; created if it does not exist.
            api_key: Anthropic API key.
            logger: Optional logger; a default INFO logger is created if None.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Scratch space for per-PDF first-page images. Not deleted here;
        # the CLI cleans individual images unless --no-cleanup is given.
        self.temp_dir = Path(tempfile.mkdtemp())
        self.client = anthropic.Client(api_key=api_key)
        self.logger = logger or self._setup_default_logger()

    def _setup_default_logger(self) -> logging.Logger:
        """Return a basic INFO-level logger when none was provided."""
        logger = logging.getLogger('PDFProcessor')
        logger.setLevel(logging.INFO)
        return logger

    def encode_image(self, image_path: str) -> str:
        """Return the file at *image_path* base64-encoded as a UTF-8 string."""
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def analyze_image(self, image_path: str) -> Dict:
        """Analyze a single cover image using the Claude Vision API.

        Returns:
            The metadata dict parsed from the model's JSON response. On any
            failure (API error, unparsable response), returns a dict with all
            fields null and ``confidence`` set to ``"error"`` so batch
            processing can continue with the remaining files.
        """
        try:
            self.logger.debug(f"Analyzing image: {image_path}")
            # Fixed: the original wrapped this call in an unused
            # `with open(image_path, 'rb') as img:` block; encode_image()
            # opens the file itself, so that handle was dead weight.
            message = self.client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=1000,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Analyze this magazine cover and extract the following metadata:
1. Magazine Title
2. Issue Date/Publication Date
3. Publisher
4. Issue Number
Format your response as JSON with these exact keys:
{
"title": string,
"date": string,
"publisher": string,
"issue_number": string,
"confidence": "high|medium|low"
}
If any field cannot be determined, use null. Set confidence based on how clear the information is."""
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": self.encode_image(image_path)
                            }
                        }
                    ]
                }]
            )
            # The model is instructed to reply with bare JSON; parse it.
            response_text = message.content[0].text
            metadata = json.loads(response_text)
            self.logger.debug(f"Successfully extracted metadata from {image_path}")
            return metadata
        except Exception as e:
            # Best-effort per-file handling: log and return an error sentinel
            # rather than aborting the whole batch.
            self.logger.error(f"Error analyzing image {image_path}: {str(e)}")
            return {
                "title": None,
                "date": None,
                "publisher": None,
                "issue_number": None,
                "confidence": "error"
            }

    def process_pdfs(self) -> List[Dict]:
        """Process all PDFs in the input directory.

        Returns:
            A list of per-file result dicts; also written to
            ``processing_results.json`` in the output directory.
        """
        pdf_files = list(self.input_dir.glob('*.pdf'))
        results = []
        pbar = tqdm(pdf_files, desc="Processing PDFs", unit="file")
        for pdf_path in pbar:
            try:
                pbar.set_description(f"Processing {pdf_path.name}")
                result = self.process_single_pdf(pdf_path)
                results.append(result)
                # Surface extraction confidence in the progress bar.
                confidence = result.get('metadata', {}).get('confidence', 'unknown')
                pbar.set_postfix(confidence=confidence)
                # Small delay to respect API rate limits.
                time.sleep(1)
            except Exception as e:
                self.logger.error(f"Error processing {pdf_path}: {str(e)}")
                results.append({
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e)
                })
        # Persist the aggregated results.
        results_file = self.output_dir / 'processing_results.json'
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)
        self.logger.info(f"Results saved to {results_file}")
        return results

    def process_single_pdf(self, pdf_path: Path) -> Dict:
        """Render page 1 of *pdf_path* to JPEG and analyze it.

        Raises:
            Exception: If no page image could be extracted from the PDF.
        """
        self.logger.info(f"Processing: {pdf_path}")
        self.logger.debug(f"Converting first page of {pdf_path} to image")
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            raise Exception("Could not extract first page")
        # Save the first page into the temp directory for analysis.
        first_page = images[0]
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        first_page.save(str(image_path), 'JPEG')
        self.logger.debug(f"Saved first page as image: {image_path}")
        metadata = self.analyze_image(str(image_path))
        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path),
            'metadata': metadata,
            'status': 'completed',
            'processed_at': datetime.now().isoformat()
        }
def setup_logging(output_dir: Path, debug: bool = False) -> logging.Logger:
    """Configure and return the shared ``'PDFProcessor'`` logger.

    Console output stays at INFO; a timestamped log file in *output_dir*
    captures DEBUG detail (effective only when *debug* raises the logger
    level to DEBUG).

    Args:
        output_dir: Directory for the timestamped log file (must exist).
        debug: If True, set the logger level to DEBUG instead of INFO.

    Returns:
        The configured logger instance.
    """
    logger = logging.getLogger('PDFProcessor')
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    # Fixed: drop handlers left by any previous call. logging.getLogger()
    # returns a process-wide singleton, so without this every re-invocation
    # appended two more handlers and duplicated every log line.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
    # Console handler: plain messages at INFO.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter('%(message)s'))
    # File handler: full timestamped records at DEBUG.
    file_handler = logging.FileHandler(
        output_dir / f'pdf_processor_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    return logger
def __init__(self, input_dir: str, output_dir: str, api_key: str):
self.input_dir = Path(input_dir)
self.output_dir = Path(output_dir)
@ -172,11 +347,17 @@ Examples:
action='store_true',
help='Keep temporary image files after processing')
parser.add_argument('--debug',
action='store_true',
help='Enable debug logging')
args = parser.parse_args()
# Validate directories
input_dir = Path(args.input_dir)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
logger = setup_logging(output_dir, args.debug)
if not input_dir.exists():
print(f"Error: Input directory '{input_dir}' does not exist")
@ -190,43 +371,44 @@ Examples:
sys.exit(1)
try:
logger.info("Starting PDF processing...")
# Initialize processor
processor = PDFProcessor(str(input_dir), str(output_dir), api_key)
# Modify glob pattern if in test mode
if args.test:
print("Running in test mode - will process only first PDF file")
logger.info("Running in test mode - will process only first PDF file")
pdf_files = list(input_dir.glob(args.pattern))[:1]
else:
pdf_files = list(input_dir.glob(args.pattern))
if not pdf_files:
print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
logger.info(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
sys.exit(1)
print(f"Found {len(pdf_files)} PDF files to process")
logger.info(f"Found {len(pdf_files)} PDF files to process")
# Process files
results = processor.process_pdfs()
# Cleanup temporary files unless --no-cleanup was specified
if not args.no_cleanup:
print("Cleaning up temporary files...")
logger.info("Cleaning up temporary files...")
for result in results:
if 'image_path' in result:
try:
Path(result['image_path']).unlink()
except Exception as e:
print(f"Warning: Could not delete temporary file {result['image_path']}: {e}")
logger.info(f"Warning: Could not delete temporary file {result['image_path']}: {e}")
print(f"\nProcessed {len(results)} PDF files")
print(f"Results saved to: {processor.output_dir}/processing_results.json")
logger.info(f"\nProcessed {len(results)} PDF files")
logger.info(f"Results saved to: {processor.output_dir}/processing_results.json")
except KeyboardInterrupt:
print("\nOperation cancelled by user")
logger.info("\nOperation cancelled by user")
sys.exit(1)
except Exception as e:
print(f"Error: {str(e)}")
logger.error(f"Error: {str(e)}")
sys.exit(1)
if __name__ == "__main__":