pdf_processor.py aktualisiert
Including Anthropic API
This commit is contained in:
parent
2a3e445c23
commit
d611992eed
1 changed files with 85 additions and 61 deletions
144
pdf_processor.py
144
pdf_processor.py
|
@ -4,36 +4,90 @@ import json
|
|||
from pathlib import Path
|
||||
import tempfile
|
||||
import base64
|
||||
import anthropic
|
||||
from typing import List, Dict
|
||||
import time
|
||||
|
||||
class PDFProcessor:
|
||||
def __init__(self, input_dir, output_dir):
|
||||
def __init__(self, input_dir: str, output_dir: str, api_key: str):
|
||||
self.input_dir = Path(input_dir)
|
||||
self.output_dir = Path(output_dir)
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.temp_dir = Path(tempfile.mkdtemp())
|
||||
self.batch_size = 5 # Number of images to process at once
|
||||
self.client = anthropic.Client(api_key=api_key)
|
||||
|
||||
def encode_image(self, image_path):
|
||||
"""Convert image to base64 for analysis"""
|
||||
def encode_image(self, image_path: str) -> str:
|
||||
"""Convert image to base64 for API"""
|
||||
with open(image_path, 'rb') as image_file:
|
||||
return base64.b64encode(image_file.read()).decode('utf-8')
|
||||
|
||||
def process_pdfs(self):
|
||||
def analyze_image(self, image_path: str) -> Dict:
    """Extract magazine-cover metadata from one image via the Claude Vision API.

    The image is base64-encoded with ``self.encode_image`` and sent together
    with a text prompt through ``self.client.messages.create``. The first
    content block of the reply is parsed as JSON.

    Args:
        image_path: Path of the image file to analyze. It is sent with the
            media type ``image/jpeg``.

    Returns:
        The parsed JSON value of the reply; the prompt asks for an object
        with the keys ``title``, ``date``, ``publisher``, ``issue_number``
        and ``confidence``. If anything fails (unreadable file, API error,
        empty or unparsable reply) the error is printed and a record with
        every field set to ``None`` and ``confidence`` set to ``"error"``
        is returned instead; no exception escapes this method.
    """
    prompt = """Analyze this magazine cover and extract the following metadata:
1. Magazine Title
2. Issue Date/Publication Date
3. Publisher
4. Issue Number

Format your response as JSON with these exact keys:
{
"title": string,
"date": string,
"publisher": string,
"issue_number": string,
"confidence": "high|medium|low"
}

If any field cannot be determined, use null. Set confidence based on how clear the information is."""

    try:
        # encode_image opens and reads the file itself, so no separate file
        # handle is held open while the API request is in flight.
        message = self.client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=1000,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": self.encode_image(image_path)
                        }
                    }
                ]
            }]
        )

        # Parse the JSON response
        response_text = message.content[0].text
        try:
            metadata = json.loads(response_text)
        except json.JSONDecodeError:
            # The reply may wrap the object in prose or a Markdown code
            # fence; retry on the outermost {...} span before giving up.
            start = response_text.find("{")
            end = response_text.rfind("}")
            if start == -1 or end <= start:
                raise
            metadata = json.loads(response_text[start:end + 1])
        return metadata

    except Exception as e:
        # Deliberate best-effort boundary: one bad image must not abort the
        # whole run, so report the failure and return an error record.
        print(f"Error analyzing image {image_path}: {str(e)}")
        return {
            "title": None,
            "date": None,
            "publisher": None,
            "issue_number": None,
            "confidence": "error"
        }
|
||||
|
||||
def process_pdfs(self) -> List[Dict]:
|
||||
"""Process all PDFs in the input directory"""
|
||||
pdf_files = list(self.input_dir.glob('*.pdf'))
|
||||
results = []
|
||||
current_batch = []
|
||||
|
||||
for pdf_path in pdf_files:
|
||||
try:
|
||||
batch_item = self.prepare_single_pdf(pdf_path)
|
||||
current_batch.append(batch_item)
|
||||
|
||||
# Process batch when it reaches batch_size
|
||||
if len(current_batch) >= self.batch_size:
|
||||
self.process_batch(current_batch, results)
|
||||
current_batch = []
|
||||
|
||||
result = self.process_single_pdf(pdf_path)
|
||||
results.append(result)
|
||||
# Small delay to respect API rate limits
|
||||
time.sleep(1)
|
||||
except Exception as e:
|
||||
print(f"Error processing {pdf_path}: {str(e)}")
|
||||
results.append({
|
||||
|
@ -42,19 +96,15 @@ class PDFProcessor:
|
|||
'error': str(e)
|
||||
})
|
||||
|
||||
# Process remaining files in the last batch
|
||||
if current_batch:
|
||||
self.process_batch(current_batch, results)
|
||||
|
||||
# Save final results to JSON
|
||||
# Save results to JSON
|
||||
with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f:
|
||||
json.dump(results, f, indent=4, ensure_ascii=False)
|
||||
|
||||
return results
|
||||
|
||||
def prepare_single_pdf(self, pdf_path):
|
||||
"""Prepare a single PDF file for analysis"""
|
||||
print(f"Preparing: {pdf_path}")
|
||||
def process_single_pdf(self, pdf_path: Path) -> Dict:
|
||||
"""Process a single PDF file"""
|
||||
print(f"Processing: {pdf_path}")
|
||||
|
||||
# Convert first page to image
|
||||
images = convert_from_path(pdf_path, first_page=1, last_page=1)
|
||||
|
@ -66,52 +116,26 @@ class PDFProcessor:
|
|||
image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
|
||||
first_page.save(str(image_path), 'JPEG')
|
||||
|
||||
# Analyze the image
|
||||
metadata = self.analyze_image(str(image_path))
|
||||
|
||||
return {
|
||||
'pdf_path': str(pdf_path),
|
||||
'image_path': str(image_path)
|
||||
'image_path': str(image_path),
|
||||
'metadata': metadata,
|
||||
'status': 'completed'
|
||||
}
|
||||
|
||||
def process_batch(self, batch_items, results):
|
||||
"""Process a batch of prepared PDFs"""
|
||||
print(f"\nProcessing batch of {len(batch_items)} files...")
|
||||
|
||||
# Here you would interact with me (Claude) to analyze the images
|
||||
# For each image in the batch:
|
||||
for item in batch_items:
|
||||
image_path = item['image_path']
|
||||
pdf_path = item['pdf_path']
|
||||
|
||||
# Convert image to base64
|
||||
image_data = self.encode_image(image_path)
|
||||
|
||||
# You would need to ask me to analyze this image
|
||||
# For now, we'll save placeholder metadata
|
||||
metadata = {
|
||||
'title': None,
|
||||
'date': None,
|
||||
'publisher': None,
|
||||
'issue_number': None,
|
||||
'confidence': 'pending_analysis'
|
||||
}
|
||||
|
||||
results.append({
|
||||
'pdf_path': pdf_path,
|
||||
'image_path': str(image_path),
|
||||
'metadata': metadata,
|
||||
'status': 'pending_analysis'
|
||||
})
|
||||
|
||||
def save_metadata(self, results):
|
||||
"""Save the extracted metadata back to PDFs or to a database"""
|
||||
# TODO: Implement metadata saving functionality
|
||||
pass
|
||||
|
||||
def main():
|
||||
# Example usage
|
||||
# Get API key from environment variable
|
||||
api_key = os.getenv('ANTHROPIC_API_KEY')
|
||||
if not api_key:
|
||||
raise ValueError("ANTHROPIC_API_KEY environment variable not set")
|
||||
|
||||
input_dir = "path/to/pdfs"
|
||||
output_dir = "path/to/output"
|
||||
|
||||
processor = PDFProcessor(input_dir, output_dir)
|
||||
processor = PDFProcessor(input_dir, output_dir, api_key)
|
||||
results = processor.process_pdfs()
|
||||
|
||||
print(f"\nProcessed {len(results)} PDF files")
|
||||
|
|
Loading…
Add table
Reference in a new issue