From 6d8e11772e6abadf9b872323fb41601e3d036197 Mon Sep 17 00:00:00 2001
From: sebastian <sebastian.mondial@me.com>
Date: Wed, 19 Feb 2025 21:36:38 +0000
Subject: [PATCH] =?UTF-8?q?pdf=5Fprocessor.py=20hinzugef=C3=BCgt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Version 1 des PDF-Tools
---
 pdf_processor.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 pdf_processor.py

diff --git a/pdf_processor.py b/pdf_processor.py
new file mode 100644
index 0000000..2871bbf
--- /dev/null
+++ b/pdf_processor.py
@@ -0,0 +1,73 @@
+from pdf2image import convert_from_path
+import os
+import json
+from pathlib import Path
+import tempfile
+
+class PDFProcessor:
+    """Render page 1 of each PDF in ``input_dir`` to a JPEG in a temp dir."""
+
+    def __init__(self, input_dir, output_dir):
+        """Store both paths; create ``output_dir`` and a fresh temp dir."""
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.temp_dir = Path(tempfile.mkdtemp())  # never removed by this class
+
+    def process_pdfs(self):
+        """Process every PDF in ``input_dir`` and return the result dicts.
+
+        Files match ``.pdf`` case-insensitively and run in sorted order; a
+        failure becomes a ``status: 'error'`` entry instead of aborting.
+        Results are saved to ``processing_results.json`` in ``output_dir``.
+        """
+        pdf_files = sorted(
+            p for p in self.input_dir.glob('*')
+            if p.name.lower().endswith('.pdf')
+        )
+        results = []
+        for pdf_path in pdf_files:
+            try:
+                results.append(self.process_single_pdf(pdf_path))
+            except Exception as e:  # batch boundary: record and continue
+                print(f"Error processing {pdf_path}: {e}")
+                results.append({
+                    'pdf_path': str(pdf_path),
+                    'status': 'error',
+                    'error': str(e)
+                })
+        report_path = self.output_dir / 'processing_results.json'
+        with open(report_path, 'w', encoding='utf-8') as f:
+            json.dump(results, f, indent=4)
+        return results
+
+    def process_single_pdf(self, pdf_path):
+        """Save page 1 of ``pdf_path`` as a JPEG and return a result dict."""
+        pdf_path = Path(pdf_path)  # accept plain string paths too
+        print(f"Processing: {pdf_path}")
+        images = convert_from_path(pdf_path, first_page=1, last_page=1)
+        if not images:
+            # RuntimeError is an Exception subclass, so existing handlers work.
+            raise RuntimeError("Could not extract first page")
+        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
+        images[0].save(str(image_path), 'JPEG')
+        # TODO: This is where we'll integrate with the vision analysis
+        return {
+            'pdf_path': str(pdf_path),
+            'image_path': str(image_path),
+            'status': 'image_extracted'
+        }
+
+def main():
+    """Run the processor on the placeholder example paths; print a summary."""
+    # Example usage: both directories below are placeholders.
+    processor = PDFProcessor(input_dir="path/to/pdfs",
+                             output_dir="path/to/output")
+    results = processor.process_pdfs()
+
+    # Report how many files were handled and where the JSON report lives.
+    print(f"\nProcessed {len(results)} PDF files")
+    print(f"Results saved to: {processor.output_dir}/processing_results.json")
+
+if __name__ == "__main__":  # run main() only when executed as a script
+    main()
\ No newline at end of file