import logging
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional

from PyPDF2 import PdfReader, PdfWriter


class PDFMetadataWriter:
    """Write extracted metadata into the document-information of PDF files."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Store *logger*, or fall back to the 'PDFMetadataWriter' logger."""
        self.logger = logger or logging.getLogger('PDFMetadataWriter')

    @staticmethod
    def _as_text(value) -> str:
        """Return *value* as text, mapping ``None`` to an empty string."""
        return '' if value is None else str(value)

    def _build_info(self, metadata: Dict) -> Dict[str, str]:
        """Translate *metadata* into the info entries handed to PyPDF2.

        Every value is converted to ``str`` because ``PdfWriter.add_metadata``
        rejects non-string values (e.g. a numeric confidence score).
        """
        title = self._as_text(metadata.get('title'))
        issue = self._as_text(metadata.get('issue_number'))
        confidence = metadata.get('confidence')

        info = {
            '/Title': title,
            '/Subject': f"Issue {issue}",
            '/Publisher': self._as_text(metadata.get('publisher')),
            '/Keywords': f"Magazine, {title}, Issue {issue}",
        }

        # Only emit /CreationDate when a date was actually supplied; an empty
        # date entry is worse than none. The value is passed through as given.
        # NOTE(review): PDF date strings are normally "D:YYYYMMDDHHmmSS" --
        # confirm the format callers put in metadata['date'].
        date_text = self._as_text(metadata.get('date'))
        if date_text:
            info['/CreationDate'] = date_text

        # Custom entries carrying the raw extracted data.
        info['/ExtractedMetadata'] = str(metadata)
        info['/ExtractionConfidence'] = (
            'unknown' if confidence is None else str(confidence)
        )
        info['/ProcessedDate'] = datetime.now().isoformat()
        return info

    def write_metadata(self, pdf_path: str, metadata: Dict, backup: bool = True) -> bool:
        """
        Write metadata to PDF file.

        The pages of the PDF are copied into a new document, the metadata
        entries are attached, the result is written to a sibling
        ``<stem>_temp<suffix>`` file, and that file then replaces the original.

        Args:
            pdf_path: Path to the PDF file
            metadata: Dictionary containing metadata. The keys read are
                'title', 'issue_number', 'publisher', 'date' and 'confidence'.
            backup: Whether to create a timestamped backup copy
                (``<stem>_backup_<YYYYmmdd_HHMMSS><suffix>``) next to the
                original before modifying it

        Returns:
            bool: True if successful, False otherwise. Any exception is
            logged and reported as False rather than raised.
        """
        temp_path = None
        try:
            # Build the entries first so that invalid metadata (e.g. None)
            # fails before a backup is created or any file is touched.
            info = self._build_info(metadata)

            pdf_path = Path(pdf_path)

            # Create backup if requested
            if backup:
                stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                backup_path = pdf_path.parent / f"{pdf_path.stem}_backup_{stamp}{pdf_path.suffix}"
                shutil.copy2(pdf_path, backup_path)
                self.logger.info("Created backup at: %s", backup_path)

            # Read existing PDF and copy all pages into a fresh writer
            reader = PdfReader(pdf_path)
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)

            writer.add_metadata(info)

            # Write to a temporary sibling file, then swap it into place so
            # the original is never left half-written.
            temp_path = pdf_path.parent / f"{pdf_path.stem}_temp{pdf_path.suffix}"
            with open(temp_path, 'wb') as output_file:
                writer.write(output_file)
            temp_path.replace(pdf_path)

            self.logger.info("Successfully updated metadata for: %s", pdf_path)
            return True

        except Exception as e:
            self.logger.error("Error writing metadata to %s: %s", pdf_path, e)
            # Do not leave a partially written temp file behind.
            if temp_path is not None:
                try:
                    temp_path.unlink()
                except OSError:
                    pass  # never created, or already moved into place
            return False

    def batch_write_metadata(self, results: list, backup: bool = True) -> Dict:
        """
        Process a batch of results and write metadata to corresponding PDFs.

        Results whose 'status' is not 'completed' are skipped with a warning
        and are counted neither as success nor as failure.

        Args:
            results: List of processing results; each is a dict with the keys
                'status', 'pdf_path' and 'metadata'. ``None`` is treated as
                an empty batch.
            backup: Whether to create backups of original files

        Returns:
            Dict with the keys 'success' and 'failure' (lists of PDF paths)
            and 'success_count' and 'failure_count' (their lengths).
        """
        stats = {
            'success': [],
            'failure': [],
            'success_count': 0,
            'failure_count': 0,
        }

        for result in results or []:
            pdf_path = result.get('pdf_path')

            if result.get('status') != 'completed':
                self.logger.warning("Skipping incomplete result for %s", pdf_path)
                continue

            outcome = 'success' if self.write_metadata(
                pdf_path, result.get('metadata'), backup
            ) else 'failure'
            stats[outcome].append(pdf_path)
            stats[f'{outcome}_count'] += 1

        return stats