"""Render the first page of every PDF in a directory to a JPEG image.

Version 1 of the PDF tool (originally added as ``pdf_processor.py``): the
first-page images are only extracted and reported for now; the vision
analysis step is not integrated yet.
"""

import argparse
import json
import os  # kept from the original file; currently unused
import shutil
import sys
import tempfile
from pathlib import Path

try:
    from pdf2image import convert_from_path
except ImportError:
    # Keep the module importable without the renderer installed; the missing
    # dependency is reported by _require_converter() when it is needed.
    convert_from_path = None


def _require_converter():
    """Return ``pdf2image.convert_from_path`` or raise ``ImportError``."""
    if convert_from_path is None:
        raise ImportError(
            "the 'pdf2image' package is required to render PDF pages"
        )
    return convert_from_path


class PDFProcessor:
    """Extract first-page images from the PDFs found in one directory.

    Images are written to a private temporary directory (``temp_dir``) that
    deliberately outlives ``process_pdfs`` because the returned results and
    the JSON report point at those files. Call ``cleanup()`` (or use the
    instance as a context manager) once the images are no longer needed.
    """

    def __init__(self, input_dir, output_dir):
        """Prepare the output directory and a temporary image directory.

        Args:
            input_dir: Directory that is scanned for ``*.pdf`` files.
            output_dir: Directory for ``processing_results.json``; created
                (including parents) if it does not exist.

        Raises:
            ImportError: If ``pdf2image`` is not installed.
        """
        # Fail before touching the filesystem when the renderer is missing.
        _require_converter()

        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Create temp directory for images; the prefix makes leftovers easy
        # to identify if cleanup() is never called.
        self.temp_dir = Path(tempfile.mkdtemp(prefix="pdf_processor_"))

    def __enter__(self):
        """Return the processor itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Remove the temporary image directory when the block ends."""
        self.cleanup()

    def cleanup(self):
        """Delete the temporary image directory and everything in it.

        Safe to call more than once. Image paths returned earlier become
        invalid afterwards.
        """
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def process_pdfs(self):
        """Process all PDFs in the input directory.

        Every PDF yields one result dict; a failure for one file is recorded
        as ``{'pdf_path', 'status': 'error', 'error'}`` and does not stop the
        remaining files. The full list is also written to
        ``processing_results.json`` in the output directory.

        Returns:
            The list of result dicts, ordered by PDF path.
        """
        # Sorted so the report order does not depend on the filesystem.
        pdf_files = sorted(self.input_dir.glob('*.pdf'))
        results = []

        for pdf_path in pdf_files:
            try:
                results.append(self.process_single_pdf(pdf_path))
            except Exception as e:
                # Best effort by design: log, record, and continue.
                print(f"Error processing {pdf_path}: {e}")
                results.append({
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e),
                })

        # Save results to JSON
        report_path = self.output_dir / 'processing_results.json'
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)

        return results

    def process_single_pdf(self, pdf_path):
        """Render the first page of one PDF and save it as a JPEG.

        Args:
            pdf_path: Path (``str`` or ``Path``) of the PDF file.

        Returns:
            A dict with ``pdf_path``, ``image_path`` and the status
            ``'image_extracted'``.

        Raises:
            ImportError: If ``pdf2image`` is not installed.
            RuntimeError: If the converter returned no page.
        """
        pdf_path = Path(pdf_path)
        print(f"Processing: {pdf_path}")

        # Convert first page to image
        convert = _require_converter()
        images = convert(pdf_path, first_page=1, last_page=1)
        if not images:
            raise RuntimeError("Could not extract first page")

        # Save first page image (recreate the directory in case cleanup()
        # ran before this call).
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        images[0].save(str(image_path), 'JPEG')

        # TODO: This is where we'll integrate with the vision analysis
        # For now, we'll just return the paths
        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path),
            'status': 'image_extracted',
        }


def main(argv=()):
    """Run the processor from the command line.

    Args:
        argv: Command-line arguments without the program name. Up to two
            positionals are accepted: input directory and output directory.
            With no arguments the original example paths are used.
    """
    parser = argparse.ArgumentParser(
        description="Extract the first page of every PDF in a directory "
                    "as a JPEG image."
    )
    parser.add_argument("input_dir", nargs="?", default="path/to/pdfs",
                        help="directory containing the PDF files")
    parser.add_argument("output_dir", nargs="?", default="path/to/output",
                        help="directory for processing_results.json")
    args = parser.parse_args(list(argv))

    # The temporary images are left in place on purpose: the report written
    # by process_pdfs() references them.
    processor = PDFProcessor(args.input_dir, args.output_dir)
    results = processor.process_pdfs()

    print(f"\nProcessed {len(results)} PDF files")
    print(f"Results saved to: {processor.output_dir}/processing_results.json")


if __name__ == "__main__":
    main(sys.argv[1:])