# Source listing header: Python, 103 lines, 3.7 KiB, no EOL at end of file.
import logging
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional

from PyPDF2 import PdfReader, PdfWriter


class PDFMetadataWriter:
    """Write extracted metadata into the document-information fields of PDFs.

    Each PDF is rewritten page by page through PyPDF2 with a new set of Info
    entries, optionally after a timestamped backup copy has been made next to
    the original file.
    """

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Remember the logger used for progress and error reporting.

        Args:
            logger: Logger to report to. When omitted, the logger named
                'PDFMetadataWriter' is used.
        """
        self.logger = logger or logging.getLogger('PDFMetadataWriter')

    @staticmethod
    def _as_text(value, default: str = '') -> str:
        """Return ``value`` as a string, substituting ``default`` for None."""
        return default if value is None else str(value)

    def _build_info(self, metadata: Dict) -> Dict[str, str]:
        """Build the PDF Info entries for ``metadata`` with string-only values.

        PyPDF2's ``add_metadata`` rejects values that are not text, so every
        field is coerced here (e.g. a numeric confidence score).
        """
        text = self._as_text
        title = text(metadata.get('title', ''))
        issue = text(metadata.get('issue_number', ''))
        return {
            '/Title': title,
            '/Subject': f"Issue {issue}",
            '/Publisher': text(metadata.get('publisher', '')),
            '/Keywords': f"Magazine, {title}, Issue {issue}",
            # NOTE(review): 'date' is written exactly as supplied; PDF date
            # strings conventionally look like D:YYYYMMDDHHmmSS -- confirm
            # the format produced by the caller.
            '/CreationDate': text(metadata.get('date', '')),
            # Custom entries carrying the raw extracted data.
            '/ExtractedMetadata': str(metadata),
            '/ExtractionConfidence': text(metadata.get('confidence'), 'unknown'),
            '/ProcessedDate': datetime.now().isoformat(),
        }

    def write_metadata(self, pdf_path: str, metadata: Dict, backup: bool = True) -> bool:
        """Write metadata to a PDF file, replacing the file in place.

        The pages are copied into a new document, the Info entries are set,
        the result is written to a sibling ``<stem>_temp<suffix>`` file and
        that file is then moved over the original.

        Args:
            pdf_path: Path to the PDF file.
            metadata: Dictionary containing metadata. The keys read are
                'title', 'issue_number', 'publisher', 'date' and
                'confidence'; all are optional.
            backup: Whether to create a timestamped backup copy of the
                original file (in the same directory) before rewriting it.

        Returns:
            bool: True if successful, False otherwise. Failures are logged
            and never raised to the caller.
        """
        # Only set once we start creating the temp file, so cleanup never
        # deletes a pre-existing file that happens to share the name.
        temp_path = None
        try:
            pdf_path = Path(pdf_path)

            # Validate up front so no backup is created for a doomed write.
            if not pdf_path.is_file():
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            if metadata is None:
                raise ValueError("no metadata supplied")
            info = self._build_info(metadata)

            # Create backup if requested
            if backup:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                backup_path = pdf_path.parent / f"{pdf_path.stem}_backup_{timestamp}{pdf_path.suffix}"
                shutil.copy2(pdf_path, backup_path)
                self.logger.info(f"Created backup at: {backup_path}")

            # Read existing PDF and copy all pages into a fresh document
            reader = PdfReader(pdf_path)
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)

            writer.add_metadata(info)

            # Write to a sibling file first so a failed write cannot leave a
            # truncated PDF under the original name.
            temp_path = pdf_path.parent / f"{pdf_path.stem}_temp{pdf_path.suffix}"
            with open(temp_path, 'wb') as output_file:
                writer.write(output_file)

            # Replace original file
            temp_path.replace(pdf_path)
            temp_path = None  # moved into place; nothing left to clean up
            self.logger.info(f"Successfully updated metadata for: {pdf_path}")

            return True

        except Exception as e:
            self.logger.error(f"Error writing metadata to {pdf_path}: {str(e)}")
            return False

        finally:
            # Remove a partially written temp file left behind by a failure.
            if temp_path is not None and temp_path.exists():
                try:
                    temp_path.unlink()
                except OSError as cleanup_error:
                    self.logger.warning(
                        f"Could not remove temporary file {temp_path}: {cleanup_error}"
                    )

    def batch_write_metadata(self, results: list, backup: bool = True) -> Dict:
        """Process a batch of results and write metadata to corresponding PDFs.

        Results whose 'status' is not 'completed' are skipped with a warning
        and counted neither as success nor as failure.

        Args:
            results: List of processing results (dicts with 'status',
                'pdf_path' and 'metadata' keys). None is treated as empty.
            backup: Whether to create backups of original files.

        Returns:
            Dict with the keys 'success' and 'failure' (lists of the
            processed 'pdf_path' values) and 'success_count' and
            'failure_count' (their lengths).
        """
        stats = {
            'success': [],
            'failure': [],
            'success_count': 0,
            'failure_count': 0,
        }

        for result in results or []:
            if result.get('status') != 'completed':
                self.logger.warning(f"Skipping incomplete result for {result.get('pdf_path')}")
                continue

            pdf_path = result.get('pdf_path')
            metadata = result.get('metadata')

            outcome = 'success' if self.write_metadata(pdf_path, metadata, backup) else 'failure'
            stats[outcome].append(pdf_path)
            stats[f'{outcome}_count'] += 1

        return stats