metadata_writer.py hinzugefügt
Writes the metadata to the PDF files.
This commit is contained in:
parent
368b43c457
commit
39e61b7f68
1 changed files with 103 additions and 0 deletions
103
metadata_writer.py
Normal file
103
metadata_writer.py
Normal file
|
@ -0,0 +1,103 @@
|
|||
from PyPDF2 import PdfReader, PdfWriter
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
import logging
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
|
||||
class PDFMetadataWriter:
    """Write magazine metadata into the document-information entries of PDF files.

    The writer re-creates the PDF page by page with PyPDF2, attaches the
    metadata, and then replaces the original file. Failures are logged and
    reported through the return value instead of being raised.
    """

    def __init__(self, logger: logging.Logger = None):
        """Store the logger to use.

        Args:
            logger: Logger for progress and error messages. When omitted, the
                logger named ``'PDFMetadataWriter'`` is used.
        """
        self.logger = logger or logging.getLogger('PDFMetadataWriter')

    @staticmethod
    def _as_text(value) -> str:
        """Return *value* as a string, mapping ``None`` to an empty string."""
        return '' if value is None else str(value)

    def _build_info(self, metadata: Dict) -> Dict:
        """Build the string-only info dictionary handed to ``add_metadata``."""
        title = self._as_text(metadata.get('title', ''))
        issue = self._as_text(metadata.get('issue_number', ''))
        confidence = metadata.get('confidence')

        # Every value is coerced to ``str``: non-string values (for example a
        # numeric confidence or a ``None`` title) would otherwise make the
        # whole write fail inside PyPDF2.
        return {
            '/Title': title,
            '/Subject': f"Issue {issue}",
            '/Publisher': self._as_text(metadata.get('publisher', '')),
            '/Keywords': f"Magazine, {title}, Issue {issue}",
            '/CreationDate': self._as_text(metadata.get('date', '')),

            # Custom entries carrying the raw extracted data
            '/ExtractedMetadata': str(metadata),
            '/ExtractionConfidence': 'unknown' if confidence is None else str(confidence),
            '/ProcessedDate': datetime.now().isoformat(),
        }

    def write_metadata(self, pdf_path: str, metadata: Dict, backup: bool = True) -> bool:
        """
        Write metadata to PDF file.

        Args:
            pdf_path: Path to the PDF file
            metadata: Dictionary containing metadata; the keys read are
                ``title``, ``issue_number``, ``publisher``, ``date`` and
                ``confidence``
            backup: Whether to create a timestamped backup copy next to the
                original file before it is modified

        Returns:
            bool: True if successful, False otherwise (the error is logged)
        """
        try:
            pdf_path = Path(pdf_path)

            # Validate/convert the metadata first so that unusable input fails
            # before any backup or temporary file is created.
            info = self._build_info(metadata)

            # Create backup if requested
            if backup:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                backup_path = pdf_path.parent / f"{pdf_path.stem}_backup_{timestamp}{pdf_path.suffix}"
                shutil.copy2(pdf_path, backup_path)
                self.logger.info(f"Created backup at: {backup_path}")

            # Read existing PDF and copy all pages into a fresh writer
            reader = PdfReader(pdf_path)
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)

            writer.add_metadata(info)

            # Write to a sibling temp file, then swap it in for the original
            temp_path = pdf_path.parent / f"{pdf_path.stem}_temp{pdf_path.suffix}"
            try:
                with open(temp_path, 'wb') as output_file:
                    writer.write(output_file)
                temp_path.replace(pdf_path)
            finally:
                # After a successful replace the temp file no longer exists;
                # on failure this removes the partially written leftover.
                if temp_path.exists():
                    temp_path.unlink()

            self.logger.info(f"Successfully updated metadata for: {pdf_path}")
            return True

        except Exception as e:
            # Deliberately broad: batch processing relies on a boolean result
            # rather than exceptions.
            self.logger.error(f"Error writing metadata to {pdf_path}: {str(e)}")
            return False

    def batch_write_metadata(self, results: list, backup: bool = True) -> Dict:
        """
        Process a batch of results and write metadata to corresponding PDFs.

        Results whose ``status`` is not ``'completed'`` are skipped with a
        warning and are counted neither as success nor as failure.

        Args:
            results: List of processing results; each is a dictionary with
                the keys ``status``, ``pdf_path`` and ``metadata``
            backup: Whether to create backups of original files

        Returns:
            Dict containing success and failure counts and lists
        """
        stats = {
            'success': [],
            'failure': [],
            'success_count': 0,
            'failure_count': 0
        }

        for result in results:
            if result.get('status') != 'completed':
                self.logger.warning(f"Skipping incomplete result for {result.get('pdf_path')}")
                continue

            pdf_path = result.get('pdf_path')
            metadata = result.get('metadata')

            outcome = 'success' if self.write_metadata(pdf_path, metadata, backup) else 'failure'
            stats[outcome].append(pdf_path)
            stats[f'{outcome}_count'] += 1

        return stats
|
Loading…
Add table
Reference in a new issue