pdf_processor.py aktualisiert
Entwicklung zur vision capability Nutzung
This commit is contained in:
parent
512852f8ef
commit
df4e34ce8f
1 changed file with 62 additions and 14 deletions
|
@ -3,25 +3,37 @@ import os
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import base64
|
||||||
|
|
||||||
class PDFProcessor:
|
class PDFProcessor:
|
||||||
def __init__(self, input_dir, output_dir):
|
def __init__(self, input_dir, output_dir):
|
||||||
self.input_dir = Path(input_dir)
|
self.input_dir = Path(input_dir)
|
||||||
self.output_dir = Path(output_dir)
|
self.output_dir = Path(output_dir)
|
||||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Create temp directory for images
|
|
||||||
self.temp_dir = Path(tempfile.mkdtemp())
|
self.temp_dir = Path(tempfile.mkdtemp())
|
||||||
|
self.batch_size = 5 # Number of images to process at once
|
||||||
|
|
||||||
|
def encode_image(self, image_path):
    """Return the contents of an image file as base64 text.

    Args:
        image_path: Path (str or path-like) of the file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the bytes are encoded as-is, with no text decoding.
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
|
||||||
|
|
||||||
def process_pdfs(self):
|
def process_pdfs(self):
|
||||||
"""Process all PDFs in the input directory"""
|
"""Process all PDFs in the input directory"""
|
||||||
pdf_files = list(self.input_dir.glob('*.pdf'))
|
pdf_files = list(self.input_dir.glob('*.pdf'))
|
||||||
results = []
|
results = []
|
||||||
|
current_batch = []
|
||||||
|
|
||||||
for pdf_path in pdf_files:
|
for pdf_path in pdf_files:
|
||||||
try:
|
try:
|
||||||
result = self.process_single_pdf(pdf_path)
|
batch_item = self.prepare_single_pdf(pdf_path)
|
||||||
results.append(result)
|
current_batch.append(batch_item)
|
||||||
|
|
||||||
|
# Process batch when it reaches batch_size
|
||||||
|
if len(current_batch) >= self.batch_size:
|
||||||
|
self.process_batch(current_batch, results)
|
||||||
|
current_batch = []
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing {pdf_path}: {str(e)}")
|
print(f"Error processing {pdf_path}: {str(e)}")
|
||||||
results.append({
|
results.append({
|
||||||
|
@ -30,15 +42,19 @@ class PDFProcessor:
|
||||||
'error': str(e)
|
'error': str(e)
|
||||||
})
|
})
|
||||||
|
|
||||||
# Save results to JSON
|
# Process remaining files in the last batch
|
||||||
with open(self.output_dir / 'processing_results.json', 'w') as f:
|
if current_batch:
|
||||||
json.dump(results, f, indent=4)
|
self.process_batch(current_batch, results)
|
||||||
|
|
||||||
|
# Save final results to JSON
|
||||||
|
with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(results, f, indent=4, ensure_ascii=False)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def process_single_pdf(self, pdf_path):
|
def prepare_single_pdf(self, pdf_path):
|
||||||
"""Process a single PDF file"""
|
"""Prepare a single PDF file for analysis"""
|
||||||
print(f"Processing: {pdf_path}")
|
print(f"Preparing: {pdf_path}")
|
||||||
|
|
||||||
# Convert first page to image
|
# Convert first page to image
|
||||||
images = convert_from_path(pdf_path, first_page=1, last_page=1)
|
images = convert_from_path(pdf_path, first_page=1, last_page=1)
|
||||||
|
@ -50,14 +66,46 @@ class PDFProcessor:
|
||||||
image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
|
image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
|
||||||
first_page.save(str(image_path), 'JPEG')
|
first_page.save(str(image_path), 'JPEG')
|
||||||
|
|
||||||
# TODO: This is where we'll integrate with the vision analysis
|
|
||||||
# For now, we'll just return the paths
|
|
||||||
return {
|
return {
|
||||||
'pdf_path': str(pdf_path),
|
'pdf_path': str(pdf_path),
|
||||||
'image_path': str(image_path),
|
'image_path': str(image_path)
|
||||||
'status': 'image_extracted'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def process_batch(self, batch_items, results):
    """Process a batch of prepared PDFs.

    For every item the page image is base64-encoded through
    ``self.encode_image`` and a placeholder record is appended to
    ``results``. The actual image analysis is not wired in yet, so all
    metadata fields are ``None`` and the status is ``'pending_analysis'``.

    An image that cannot be read is recorded as an error entry for that
    item only; the remaining items of the batch are still processed.

    Args:
        batch_items: Sized collection of dicts, each with the keys
            ``'pdf_path'`` and ``'image_path'``.
        results: List that receives one dict per item; mutated in place.

    Returns:
        None.
    """
    print(f"\nProcessing batch of {len(batch_items)} files...")

    for item in batch_items:
        image_path = item['image_path']
        pdf_path = item['pdf_path']

        try:
            # The encoded payload is what the analysis step will consume.
            # Until that step exists the value is discarded, but encoding
            # here still verifies that the image is readable.
            self.encode_image(image_path)
        except OSError as e:
            # Isolate the failure to this item so that one bad image does
            # not drop the rest of the batch.
            # NOTE(review): key set mirrors the success record plus
            # 'error' -- confirm it matches the error records written by
            # the caller.
            print(f"Error processing {pdf_path}: {str(e)}")
            results.append({
                'pdf_path': pdf_path,
                'image_path': str(image_path),
                'status': 'error',
                'error': str(e),
            })
            continue

        # Placeholder metadata until the image analysis is integrated.
        metadata = {
            'title': None,
            'date': None,
            'publisher': None,
            'issue_number': None,
            'confidence': 'pending_analysis',
        }

        results.append({
            'pdf_path': pdf_path,
            'image_path': str(image_path),
            'metadata': metadata,
            'status': 'pending_analysis',
        })
|
||||||
|
|
||||||
|
def save_metadata(self, results):
    """Save the extracted metadata back to PDFs or to a database.

    Not implemented yet: the method currently does nothing and returns
    ``None`` regardless of ``results``.

    Args:
        results: Result records to persist (currently unused).
    """
    # TODO: Implement metadata saving functionality
    return None
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Example usage
|
# Example usage
|
||||||
input_dir = "path/to/pdfs"
|
input_dir = "path/to/pdfs"
|
||||||
|
|
Loading…
Add table
Reference in a new issue