From 39e61b7f688a400940e27807e2815f1c9825a743 Mon Sep 17 00:00:00 2001
From: sebastian <sebastian@noreply.localhost>
Date: Wed, 19 Feb 2025 21:53:01 +0000
Subject: [PATCH] =?UTF-8?q?metadata=5Fwriter.py=20hinzugef=C3=BCgt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add PDFMetadataWriter, which writes the extracted metadata into the PDF files.
---
 metadata_writer.py | 103 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 metadata_writer.py

diff --git a/metadata_writer.py b/metadata_writer.py
new file mode 100644
index 0000000..50bca08
--- /dev/null
+++ b/metadata_writer.py
@@ -0,0 +1,103 @@
+from PyPDF2 import PdfReader, PdfWriter
+from pathlib import Path
+from typing import Dict
+import logging
+import shutil
+from datetime import datetime
+
+class PDFMetadataWriter:
+    """Write extracted magazine metadata into the document info of PDF files."""
+
+    def __init__(self, logger: logging.Logger = None):
+        self.logger = logger or logging.getLogger('PDFMetadataWriter')
+
+    def write_metadata(self, pdf_path: str, metadata: Dict, backup: bool = True) -> bool:
+        """
+        Write metadata to PDF file, replacing the file in place.
+
+        Args:
+            pdf_path: Path to the PDF file
+            metadata: Dictionary containing metadata; the keys read are
+                'title', 'issue_number', 'publisher', 'date' and 'confidence'
+            backup: Whether to create a timestamped backup next to the original
+
+        Returns:
+            bool: True if successful, False otherwise (the error is logged)
+        """
+        temp_path = None
+        try:
+            pdf_path = Path(pdf_path)
+            # Create backup if requested
+            if backup:
+                backup_path = pdf_path.parent / f"{pdf_path.stem}_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}{pdf_path.suffix}"
+                shutil.copy2(pdf_path, backup_path)
+                self.logger.info(f"Created backup at: {backup_path}")
+
+            # NOTE(review): only pages are copied; existing document info is lost.
+            reader = PdfReader(pdf_path)
+            writer = PdfWriter()
+            for page in reader.pages:
+                writer.add_page(page)
+
+            def text(key: str, default: str = '') -> str:
+                # add_metadata() expects str values, so coerce None and numbers.
+                value = metadata.get(key)
+                return default if value is None else str(value)
+
+            info = {
+                '/Title': text('title'),
+                '/Subject': f"Issue {text('issue_number')}",
+                '/Publisher': text('publisher'),
+                '/Keywords': f"Magazine, {text('title')}, Issue {text('issue_number')}",
+                # Custom entries carrying the raw extracted data
+                '/ExtractedMetadata': str(metadata),
+                '/ExtractionConfidence': text('confidence', 'unknown'),
+                '/ProcessedDate': datetime.now().isoformat(),
+            }
+            # Omit /CreationDate instead of writing an empty date string.
+            # NOTE(review): PDF dates use D:YYYYMMDDHHmmSS - confirm 'date' format.
+            if text('date'):
+                info['/CreationDate'] = text('date')
+            writer.add_metadata(info)
+
+            # Write to a sibling temp file first, then swap it over the original
+            temp_path = pdf_path.parent / f"{pdf_path.stem}_temp{pdf_path.suffix}"
+            with open(temp_path, 'wb') as output_file:
+                writer.write(output_file)
+            temp_path.replace(pdf_path)
+            self.logger.info(f"Successfully updated metadata for: {pdf_path}")
+            return True
+        except Exception as e:
+            self.logger.error(f"Error writing metadata to {pdf_path}: {str(e)}")
+            if temp_path is not None:
+                try:
+                    temp_path.unlink()  # do not leave a half-written temp file
+                except OSError:
+                    pass  # already missing or not removable; best-effort only
+            return False
+
+    def batch_write_metadata(self, results: list, backup: bool = True) -> Dict:
+        """
+        Process a batch of results and write metadata to corresponding PDFs.
+
+        Args:
+            results: List of processing results from PDFProcessor
+            backup: Whether to create backups of original files
+
+        Returns:
+            Dict with success/failure path lists and counts (skipped results excluded)
+        """
+        succeeded, failed = [], []
+        for result in results:
+            pdf_path = result.get('pdf_path')
+            if result.get('status') != 'completed':
+                self.logger.warning(f"Skipping incomplete result for {pdf_path}")
+                continue
+            ok = self.write_metadata(pdf_path, result.get('metadata'), backup)
+            (succeeded if ok else failed).append(pdf_path)
+        return {
+            'success': succeeded,
+            'failure': failed,
+            'success_count': len(succeeded),
+            'failure_count': len(failed),
+        }
\ No newline at end of file