"""Write extracted metadata into PDF files using PyPDF2."""

import logging
import shutil
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional


class PDFMetadataWriter:
    """Embed extraction results in the document information of PDF files.

    ``write_metadata`` handles a single file and reports failure by
    returning ``False`` (the error is logged on ``self.logger``);
    ``batch_write_metadata`` applies it to a list of processing results and
    collects per-file outcomes.
    """

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Store *logger*, falling back to the ``'PDFMetadataWriter'`` logger."""
        self.logger = logger or logging.getLogger('PDFMetadataWriter')

    @staticmethod
    def _text(value: Any) -> str:
        """Return *value* as a string, mapping ``None`` to the empty string."""
        return '' if value is None else str(value)

    @staticmethod
    def _build_info(metadata: Dict) -> Dict[str, str]:
        """Translate an extraction result into PDF document-info entries.

        Every value is converted to ``str`` because the PDF writer only
        accepts text for info entries; a numeric confidence or a ``None``
        title would otherwise make the whole write fail.
        """
        text = PDFMetadataWriter._text
        title = text(metadata.get('title', ''))
        issue = text(metadata.get('issue_number', ''))
        return {
            '/Title': title,
            '/Subject': f"Issue {issue}",
            '/Publisher': text(metadata.get('publisher', '')),
            '/Keywords': f"Magazine, {title}, Issue {issue}",
            # NOTE(review): the date is passed through unchanged; PDF
            # consumers expect the "D:YYYYMMDDHHmmSS" form -- confirm the
            # format produced upstream.
            '/CreationDate': text(metadata.get('date', '')),

            # Custom entries carrying the raw extracted data
            '/ExtractedMetadata': str(metadata),
            '/ExtractionConfidence': text(metadata.get('confidence', 'unknown')),
            '/ProcessedDate': datetime.now().isoformat(),
        }

    def write_metadata(self, pdf_path: str, metadata: Dict, backup: bool = True) -> bool:
        """
        Write metadata to PDF file.

        The PDF is rebuilt page by page into a sibling ``<stem>_temp<suffix>``
        file which then replaces the original.

        NOTE(review): only the pages are copied into the new file; anything
        else stored in the source document (existing info entries, outlines)
        is not carried over -- confirm this is intended.

        Args:
            pdf_path: Path to the PDF file
            metadata: Dictionary containing metadata
            backup: Whether to create a timestamped backup of the original
                file next to it before modifying it

        Returns:
            bool: True if successful, False otherwise
        """
        temp_path = None
        try:
            # Imported here rather than at module level so that a missing
            # PyPDF2 installation is reported like any other write failure,
            # before any backup file has been created.
            from PyPDF2 import PdfReader, PdfWriter

            pdf_path = Path(pdf_path)

            # Build the entries first: invalid metadata must fail before
            # the file system is touched.
            info = self._build_info(metadata)

            # Create backup if requested
            if backup:
                stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                backup_path = pdf_path.parent / f"{pdf_path.stem}_backup_{stamp}{pdf_path.suffix}"
                shutil.copy2(pdf_path, backup_path)
                self.logger.info(f"Created backup at: {backup_path}")

            # Read existing PDF and copy all pages
            reader = PdfReader(pdf_path)
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)

            writer.add_metadata(info)

            # Write the modified PDF next to the original, then swap it in
            temp_path = pdf_path.parent / f"{pdf_path.stem}_temp{pdf_path.suffix}"
            with open(temp_path, 'wb') as output_file:
                writer.write(output_file)

            temp_path.replace(pdf_path)
            self.logger.info(f"Successfully updated metadata for: {pdf_path}")

            return True

        except Exception as e:
            self.logger.error(f"Error writing metadata to {pdf_path}: {str(e)}")
            return False

        finally:
            # After a successful replace() the temp file is gone; it only
            # lingers when writing or replacing failed, so remove it then.
            if temp_path is not None:
                try:
                    if temp_path.exists():
                        temp_path.unlink()
                except OSError as cleanup_error:
                    self.logger.warning(
                        f"Could not remove temporary file {temp_path}: {cleanup_error}"
                    )

    def batch_write_metadata(self, results: list, backup: bool = True) -> Dict:
        """
        Process a batch of results and write metadata to corresponding PDFs.

        Results whose ``'status'`` is not ``'completed'`` are skipped with a
        warning and counted neither as success nor as failure.

        Args:
            results: List of processing results from PDFProcessor; each is
                read through its ``'status'``, ``'pdf_path'`` and
                ``'metadata'`` keys
            backup: Whether to create backups of original files

        Returns:
            Dict containing success and failure counts and lists
        """
        stats = {
            'success': [],
            'failure': [],
            'success_count': 0,
            'failure_count': 0,
        }

        for result in results:
            pdf_path = result.get('pdf_path')
            if result.get('status') != 'completed':
                self.logger.warning(f"Skipping incomplete result for {pdf_path}")
                continue

            written = self.write_metadata(pdf_path, result.get('metadata'), backup)
            outcome = 'success' if written else 'failure'
            stats[outcome].append(pdf_path)
            stats[f'{outcome}_count'] += 1

        return stats