pdf-mass-cleanuptools/metadata_writer.py
sebastian 39e61b7f68 metadata_writer.py hinzugefügt
Writes the metadata to the pdfs ...
2025-02-19 21:53:01 +00:00

103 lines
No EOL
3.7 KiB
Python

from PyPDF2 import PdfReader, PdfWriter
from pathlib import Path
from typing import Dict
import logging
import shutil
from datetime import datetime
class PDFMetadataWriter:
    """Write extracted magazine metadata into the document info of PDF files.

    Each PDF is rewritten page by page into a new file carrying the new
    document-info entries, which then replaces the original. Optionally a
    timestamped backup copy of the original is created first.
    """

    def __init__(self, logger: logging.Logger = None):
        """Store the logger to use.

        Args:
            logger: Logger for progress and error messages. When omitted
                (or falsy), the logger named 'PDFMetadataWriter' is used.
        """
        self.logger = logger or logging.getLogger('PDFMetadataWriter')

    @staticmethod
    def _text(value, default: str = '') -> str:
        """Return *value* as a string, substituting *default* for ``None``."""
        return default if value is None else str(value)

    def _build_info(self, metadata: Dict) -> Dict:
        """Build the document-info dictionary passed to ``add_metadata``.

        Every value is coerced to ``str``: the PDF writer turns the values
        into PDF string objects, so a non-string value (for example a float
        confidence or an explicit ``None`` title) would otherwise make the
        whole write fail.
        """
        title = self._text(metadata.get('title'))
        issue = self._text(metadata.get('issue_number'))
        return {
            '/Title': title,
            '/Subject': f"Issue {issue}",
            '/Publisher': self._text(metadata.get('publisher')),
            '/Keywords': f"Magazine, {title}, Issue {issue}",
            # NOTE(review): the date is written as provided; PDF readers
            # expect the "D:YYYYMMDDHHmmSS" form for /CreationDate -- confirm
            # what format the upstream extractor delivers.
            '/CreationDate': self._text(metadata.get('date')),
            # Custom entries carrying the raw extracted data.
            '/ExtractedMetadata': str(metadata),
            '/ExtractionConfidence': self._text(metadata.get('confidence'), 'unknown'),
            '/ProcessedDate': datetime.now().isoformat(),
        }

    def write_metadata(self, pdf_path: str, metadata: Dict, backup: bool = True) -> bool:
        """
        Write metadata to PDF file.

        The inputs are validated before anything is written, so a missing
        file or unusable metadata does not leave a stray backup behind.

        Args:
            pdf_path: Path to the PDF file
            metadata: Dictionary containing metadata
            backup: Whether to create a backup of the original file
        Returns:
            bool: True if successful, False otherwise (errors are logged,
            never raised)
        """
        try:
            if not pdf_path:
                raise ValueError("no PDF path given")
            if not hasattr(metadata, 'get'):
                raise TypeError(
                    f"metadata must be a dictionary, got {type(metadata).__name__}"
                )

            pdf_path = Path(pdf_path)
            if not pdf_path.is_file():
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            # Build the info entries up front so bad metadata fails before
            # any file is created.
            info = self._build_info(metadata)

            # Create backup if requested
            if backup:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                backup_path = pdf_path.parent / f"{pdf_path.stem}_backup_{timestamp}{pdf_path.suffix}"
                shutil.copy2(pdf_path, backup_path)
                self.logger.info(f"Created backup at: {backup_path}")

            # Read existing PDF and copy all pages into a fresh writer
            reader = PdfReader(pdf_path)
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)
            writer.add_metadata(info)

            # Write to a sibling temp file first, then swap it in, so the
            # original is only replaced by a completely written file.
            temp_path = pdf_path.parent / f"{pdf_path.stem}_temp{pdf_path.suffix}"
            try:
                with open(temp_path, 'wb') as output_file:
                    writer.write(output_file)
                temp_path.replace(pdf_path)
            except Exception:
                # Best-effort cleanup of the partial temp file; the original
                # error is what gets reported.
                try:
                    if temp_path.exists():
                        temp_path.unlink()
                except OSError:
                    pass
                raise

            self.logger.info(f"Successfully updated metadata for: {pdf_path}")
            return True
        except Exception as e:
            self.logger.error(f"Error writing metadata to {pdf_path}: {str(e)}")
            return False

    def batch_write_metadata(self, results: list, backup: bool = True) -> Dict:
        """
        Process a batch of results and write metadata to corresponding PDFs.

        Results whose 'status' is not 'completed' are skipped with a warning
        and counted neither as success nor as failure.

        Args:
            results: List of processing results from PDFProcessor; each is
                read via its 'status', 'pdf_path' and 'metadata' keys.
                ``None`` is treated like an empty list.
            backup: Whether to create backups of original files
        Returns:
            Dict containing success and failure counts and lists
        """
        stats = {
            'success': [],
            'failure': [],
            'success_count': 0,
            'failure_count': 0,
        }
        for result in results or []:
            if result.get('status') != 'completed':
                self.logger.warning(f"Skipping incomplete result for {result.get('pdf_path')}")
                continue

            pdf_path = result.get('pdf_path')
            metadata = result.get('metadata')
            outcome = 'success' if self.write_metadata(pdf_path, metadata, backup) else 'failure'
            stats[outcome].append(pdf_path)
            stats[f'{outcome}_count'] += 1
        return stats