# pdf-mass-cleanuptools/pdf_processor.py

from pdf2image import convert_from_path
import os
import json
from pathlib import Path
import tempfile
import base64
import anthropic
from typing import List, Dict
import time

class PDFProcessor:
    """Extract magazine-cover metadata from the first page of each PDF.

    For every ``*.pdf`` in ``input_dir`` the first page is rendered to a JPEG
    in a private temporary directory, sent to the Claude Vision API, and the
    JSON metadata the model returns is collected.  The combined results are
    written to ``processing_results.json`` inside ``output_dir``.

    The instance can be used as a context manager; leaving the ``with`` block
    calls :meth:`cleanup`, which removes the temporary directory.
    """

    # Model id used when the caller does not choose one.
    DEFAULT_MODEL = "claude-3-opus-20240229"

    # Media type by lower-case file suffix.  Unknown suffixes fall back to
    # JPEG, which is the format process_single_pdf writes.
    _MEDIA_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    def __init__(self, input_dir: str, output_dir: str, api_key: str,
                 model: str = DEFAULT_MODEL):
        """Set up directories and the API client.

        Args:
            input_dir: Directory that is scanned for ``*.pdf`` files.
            output_dir: Directory for the results file; created if missing.
            api_key: Anthropic API key passed to the client.
            model: Model id used for image analysis.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Rendered page images live here until cleanup() is called.
        self.temp_dir = Path(tempfile.mkdtemp())
        self.client = anthropic.Client(api_key=api_key)
        self.model = model

    def __enter__(self):
        """Return the processor itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Remove the temporary directory; exceptions are not suppressed."""
        self.cleanup()

    def cleanup(self) -> None:
        """Delete the temporary image directory and everything in it.

        The ``image_path`` values returned by :meth:`process_single_pdf`
        point into this directory, so call this only once those images are
        no longer needed.
        """
        # Local import keeps this method self-contained; shutil is stdlib.
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def encode_image(self, image_path: str) -> str:
        """Return the file at ``image_path`` as a base64-encoded string."""
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _parse_metadata(response_text: str) -> Dict:
        """Parse the model's reply into a metadata dict.

        The reply is parsed as JSON directly when possible.  If that fails
        (for example because the JSON is wrapped in prose or a Markdown code
        fence), the span from the first ``{`` to the last ``}`` is parsed
        instead.

        Raises:
            ValueError: If no JSON object can be extracted
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        text = response_text.strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            start = text.find("{")
            end = text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("Response JSON is not an object")
        return parsed

    def analyze_image(self, image_path: str) -> Dict:
        """Analyze a single image using Claude Vision API.

        Args:
            image_path: Path of the image file to send.

        Returns:
            The metadata dict parsed from the model's reply.  On any failure
            (unreadable file, API error, unparsable reply) the error is
            printed and a dict with every field ``None`` and ``confidence``
            set to ``"error"`` is returned instead of raising.
        """
        try:
            media_type = self._MEDIA_TYPES.get(
                Path(image_path).suffix.lower(), "image/jpeg"
            )
            message = self.client.messages.create(
                model=self.model,
                max_tokens=1000,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Analyze this magazine cover and extract the following metadata:
                                1. Magazine Title
                                2. Issue Date/Publication Date
                                3. Publisher
                                4. Issue Number

                                Format your response as JSON with these exact keys:
                                {
                                    "title": string,
                                    "date": string,
                                    "publisher": string,
                                    "issue_number": string,
                                    "confidence": "high|medium|low"
                                }

                                If any field cannot be determined, use null. Set confidence based on how clear the information is."""
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": self.encode_image(image_path)
                            }
                        }
                    ]
                }]
            )

            # Parse the JSON response
            return self._parse_metadata(message.content[0].text)

        except Exception as e:
            # Deliberate best-effort: one bad image must not abort a batch.
            print(f"Error analyzing image {image_path}: {str(e)}")
            return {
                "title": None,
                "date": None,
                "publisher": None,
                "issue_number": None,
                "confidence": "error"
            }

    def process_pdfs(self) -> List[Dict]:
        """Process all PDFs in the input directory.

        Files are handled in sorted order so the results list is
        deterministic.  A failure on one PDF is recorded as an entry with
        ``status`` ``'error'`` and processing continues with the next file.

        Returns:
            One result dict per PDF; the same list is also written to
            ``processing_results.json`` in the output directory.
        """
        results = []

        for pdf_path in sorted(self.input_dir.glob('*.pdf')):
            try:
                result = self.process_single_pdf(pdf_path)
                results.append(result)
                # Small delay to respect API rate limits
                time.sleep(1)
            except Exception as e:
                print(f"Error processing {pdf_path}: {str(e)}")
                results.append({
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e)
                })

        # Save results to JSON
        with open(self.output_dir / 'processing_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

        return results

    def process_single_pdf(self, pdf_path: Path) -> Dict:
        """Process a single PDF file.

        Renders page 1 of ``pdf_path`` to a JPEG in the temporary directory
        and analyzes that image.

        Returns:
            A dict with ``pdf_path``, ``image_path``, ``metadata`` and
            ``status`` (``'completed'``).

        Raises:
            RuntimeError: If no page image could be produced from the PDF.
        """
        print(f"Processing: {pdf_path}")

        # Convert first page to image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            raise RuntimeError("Could not extract first page")

        # Save first page image
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        images[0].save(str(image_path), 'JPEG')

        # Analyze the image
        metadata = self.analyze_image(str(image_path))

        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path),
            'metadata': metadata,
            'status': 'completed'
        }

def main(argv=None):
    """Run the PDF metadata extraction from the command line.

    Usage: ``pdf_processor.py [input_dir] [output_dir]``.  Both directories
    are optional and default to the placeholder paths ``path/to/pdfs`` and
    ``path/to/output``.

    Args:
        argv: Optional list of argument strings; when ``None`` the arguments
            are taken from ``sys.argv`` by ``argparse``.

    Raises:
        ValueError: If the ``ANTHROPIC_API_KEY`` environment variable is
            unset or empty.
    """
    # Local import: only the command-line entry point needs argparse.
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract magazine-cover metadata from the first page of PDFs."
    )
    parser.add_argument(
        "input_dir", nargs="?", default="path/to/pdfs",
        help="directory containing the PDF files (default: %(default)s)",
    )
    parser.add_argument(
        "output_dir", nargs="?", default="path/to/output",
        help="directory for processing_results.json (default: %(default)s)",
    )
    # Parse first so that --help works even without an API key.
    args = parser.parse_args(argv)

    # Get API key from environment variable
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable not set")

    processor = PDFProcessor(args.input_dir, args.output_dir, api_key)
    results = processor.process_pdfs()

    print(f"\nProcessed {len(results)} PDF files")
    print(f"Results saved to: {processor.output_dir}/processing_results.json")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()