pdf_processor.py aktualisiert

Weiterentwicklung zur Nutzung der Vision-Capability
This commit is contained in:
Sebastian Mondial 2025-02-19 21:38:51 +00:00
parent 512852f8ef
commit df4e34ce8f

View file

@ -3,25 +3,37 @@ import os
import json import json
from pathlib import Path from pathlib import Path
import tempfile import tempfile
import base64
class PDFProcessor: class PDFProcessor:
def __init__(self, input_dir, output_dir): def __init__(self, input_dir, output_dir):
self.input_dir = Path(input_dir) self.input_dir = Path(input_dir)
self.output_dir = Path(output_dir) self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True) self.output_dir.mkdir(parents=True, exist_ok=True)
# Create temp directory for images
self.temp_dir = Path(tempfile.mkdtemp()) self.temp_dir = Path(tempfile.mkdtemp())
self.batch_size = 5 # Number of images to process at once
def encode_image(self, image_path):
    """Return the contents of an image file as base64 text.

    Args:
        image_path: Path of the file to read; anything ``open()`` accepts.

    Returns:
        The file's bytes, base64-encoded and decoded to ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the bytes must reach the encoder untouched.
    with open(image_path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
def process_pdfs(self): def process_pdfs(self):
"""Process all PDFs in the input directory""" """Process all PDFs in the input directory"""
pdf_files = list(self.input_dir.glob('*.pdf')) pdf_files = list(self.input_dir.glob('*.pdf'))
results = [] results = []
current_batch = []
for pdf_path in pdf_files: for pdf_path in pdf_files:
try: try:
result = self.process_single_pdf(pdf_path) batch_item = self.prepare_single_pdf(pdf_path)
results.append(result) current_batch.append(batch_item)
# Process batch when it reaches batch_size
if len(current_batch) >= self.batch_size:
self.process_batch(current_batch, results)
current_batch = []
except Exception as e: except Exception as e:
print(f"Error processing {pdf_path}: {str(e)}") print(f"Error processing {pdf_path}: {str(e)}")
results.append({ results.append({
@ -30,15 +42,19 @@ class PDFProcessor:
'error': str(e) 'error': str(e)
}) })
# Save results to JSON # Process remaining files in the last batch
with open(self.output_dir / 'processing_results.json', 'w') as f: if current_batch:
json.dump(results, f, indent=4) self.process_batch(current_batch, results)
# Save final results to JSON
with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f:
json.dump(results, f, indent=4, ensure_ascii=False)
return results return results
def process_single_pdf(self, pdf_path): def prepare_single_pdf(self, pdf_path):
"""Process a single PDF file""" """Prepare a single PDF file for analysis"""
print(f"Processing: {pdf_path}") print(f"Preparing: {pdf_path}")
# Convert first page to image # Convert first page to image
images = convert_from_path(pdf_path, first_page=1, last_page=1) images = convert_from_path(pdf_path, first_page=1, last_page=1)
@ -50,14 +66,46 @@ class PDFProcessor:
image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg" image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
first_page.save(str(image_path), 'JPEG') first_page.save(str(image_path), 'JPEG')
# TODO: This is where we'll integrate with the vision analysis
# For now, we'll just return the paths
return { return {
'pdf_path': str(pdf_path), 'pdf_path': str(pdf_path),
'image_path': str(image_path), 'image_path': str(image_path)
'status': 'image_extracted'
} }
def process_batch(self, batch_items, results):
    """Turn a batch of prepared PDFs into result records.

    Each item is a dict with the keys ``'pdf_path'`` and ``'image_path'``.
    The page image is base64-encoded through ``self.encode_image`` and one
    record per item is appended to ``results`` in place.

    The vision analysis is not wired in yet, so every readable item gets
    placeholder metadata and the status ``'pending_analysis'``. An item whose
    image cannot be read gets a record with status ``'error'`` instead of
    aborting the remaining items of the batch.

    Args:
        batch_items: Sequence of dicts with 'pdf_path' and 'image_path'.
        results: List that receives one record per item (mutated in place).

    Returns:
        None.
    """
    print(f"\nProcessing batch of {len(batch_items)} files...")

    for item in batch_items:
        # Normalise both paths to str so every record stays JSON-serialisable
        # even when a caller hands in pathlib.Path objects.
        pdf_path = str(item['pdf_path'])
        image_path = str(item['image_path'])

        try:
            # The encoded image is the payload for the future vision request.
            # Until that call exists the value is discarded; encoding here
            # still surfaces unreadable images one item at a time.
            self.encode_image(image_path)
        except OSError as e:
            print(f"Error processing {pdf_path}: {str(e)}")
            results.append({
                'pdf_path': pdf_path,
                'image_path': image_path,
                'status': 'error',
                'error': str(e),
            })
            continue

        # TODO: replace the placeholder below with the fields returned by the
        # vision analysis of the encoded image.
        metadata = {
            'title': None,
            'date': None,
            'publisher': None,
            'issue_number': None,
            'confidence': 'pending_analysis',
        }

        results.append({
            'pdf_path': pdf_path,
            'image_path': image_path,
            'metadata': metadata,
            'status': 'pending_analysis',
        })
def save_metadata(self, results):
    """Placeholder for persisting extracted metadata.

    Intended to write the metadata back to the PDFs or to a database. It is
    not implemented yet: the call does nothing and returns None.

    Args:
        results: The result records to persist; currently ignored.
    """
    # TODO: Implement metadata saving functionality
    return None
def main(): def main():
# Example usage # Example usage
input_dir = "path/to/pdfs" input_dir = "path/to/pdfs"