def main(argv=None):
    """Command-line entry point for the PDF metadata extractor.

    Parses the command line, validates the input directory and API key,
    looks up the PDFs matching ``--pattern``, runs ``PDFProcessor`` and,
    unless ``--no-cleanup`` is given, deletes the temporary images that the
    results reference via an ``image_path`` key.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, so a bare ``main()`` call
            behaves as before.

    Raises:
        SystemExit: with status 1 on a missing or invalid input directory,
            a missing API key, no matching PDF files, a user interrupt, or
            any processing error; with status 2 on argparse usage errors.
    """
    # Function-scope imports keep this entry point self-contained.
    import argparse
    import os
    import sys
    from pathlib import Path

    parser = argparse.ArgumentParser(
        description='Process PDFs to extract magazine metadata using Claude Vision API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  # Process all PDFs in current directory
  python pdf_processor.py -i . -o results

  # Process only PDFs matching a glob pattern
  python pdf_processor.py -i /path/to/pdfs -o /path/to/output --pattern "magazine_*.pdf"

  # Test mode with single file
  python pdf_processor.py -i /path/to/pdfs -o /path/to/output --test
        '''
    )

    parser.add_argument('-i', '--input-dir',
                        required=True,
                        help='Directory containing PDF files')

    parser.add_argument('-o', '--output-dir',
                        required=True,
                        help='Directory for output files')

    parser.add_argument('--pattern',
                        default='*.pdf',
                        help='Glob pattern for PDF files (default: *.pdf)')

    parser.add_argument('--api-key',
                        help='Anthropic API key (alternative to env variable)')

    parser.add_argument('--test',
                        action='store_true',
                        help='Test mode: process only first PDF file')

    # NOTE(review): --skip-existing is parsed but nothing in this function
    # consumes it or forwards it to PDFProcessor -- confirm whether the
    # processor is expected to honour it and wire it through.
    parser.add_argument('--skip-existing',
                        action='store_true',
                        help='Skip PDFs that already have results in output directory')

    parser.add_argument('--no-cleanup',
                        action='store_true',
                        help='Keep temporary image files after processing')

    args = parser.parse_args(argv)

    # Validate directories
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)

    if not input_dir.exists():
        print(f"Error: Input directory '{input_dir}' does not exist",
              file=sys.stderr)
        sys.exit(1)
    if not input_dir.is_dir():
        # A regular file would otherwise slip through and surface later as
        # a confusing "No PDF files found" message.
        print(f"Error: Input path '{input_dir}' is not a directory",
              file=sys.stderr)
        sys.exit(1)

    # Get API key: the command-line option wins over the environment.
    api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: No API key provided. Either set ANTHROPIC_API_KEY environment variable "
              "or use --api-key option", file=sys.stderr)
        sys.exit(1)

    try:
        # Path.glob yields entries in arbitrary order; sorting makes the
        # "first PDF" picked by --test reproducible across runs.
        pdf_files = sorted(input_dir.glob(args.pattern))
        if args.test:
            print("Running in test mode - will process only first PDF file")
            pdf_files = pdf_files[:1]

        # Check for matches before building the processor so a run that
        # cannot do any work exits without constructing it.
        if not pdf_files:
            print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'",
                  file=sys.stderr)
            sys.exit(1)

        print(f"Found {len(pdf_files)} PDF files to process")

        # Initialize processor
        processor = PDFProcessor(str(input_dir), str(output_dir), api_key)

        # NOTE(review): pdf_files is not passed to process_pdfs(), so the
        # --pattern / --test selection above may not restrict what actually
        # gets processed. process_pdfs()'s signature is not visible here --
        # confirm and forward the list if it accepts one.
        results = processor.process_pdfs()

        # Cleanup temporary files unless --no-cleanup was specified
        if not args.no_cleanup:
            print("Cleaning up temporary files...")
            for result in results:
                if 'image_path' in result:
                    try:
                        Path(result['image_path']).unlink()
                    except (OSError, TypeError) as e:
                        # Best effort: a leftover temp file is not fatal.
                        print(f"Warning: Could not delete temporary file "
                              f"{result['image_path']}: {e}", file=sys.stderr)

        print(f"\nProcessed {len(results)} PDF files")
        print(f"Results saved to: {processor.output_dir}/processing_results.json")

    except KeyboardInterrupt:
        print("\nOperation cancelled by user", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        # SystemExit is not an Exception subclass, so the sys.exit() calls
        # above pass through untouched.
        print(f"Error: {str(e)}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()