pdf_processor.py aktualisiert

Version 4
2025-02-19 21:43:49 +00:00 · 2025-02-19 21:43:49 +00:00 · 8b27d17f44 · 2025-02-19 21:44:32 +00:00
commit 8b27d17f44
parent d611992eed
1 changed file with 99 additions and 11 deletions
--- a/pdf_processor.py
+++ b/pdf_processor.py
@@ -7,6 +7,8 @@ import base64
 import anthropic
 from typing import List, Dict
 import time
 import argparse
 import sys
 class PDFProcessor:
    def __init__(self, input_dir: str, output_dir: str, api_key: str):
@@ -127,19 +129,105 @@ class PDFProcessor:
        }
 def main():
-    # Get API key from environment variable
+    parser = argparse.ArgumentParser(
-    api_key = os.getenv('ANTHROPIC_API_KEY')
+        description='Process PDFs to extract magazine metadata using Claude Vision API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
 Examples:
    # Process all PDFs in current directory
    python pdf_processor.py -i . -o results
    # Process specific PDFs with custom batch size
    python pdf_processor.py -i /path/to/pdfs -o /path/to/output --pattern "magazine_*.pdf"
    # Test mode with single file
    python pdf_processor.py -i /path/to/pdfs -o /path/to/output --test
        '''
    )
    parser.add_argument('-i', '--input-dir', 
                        required=True,
                        help='Directory containing PDF files')
    parser.add_argument('-o', '--output-dir', 
                        required=True,
                        help='Directory for output files')
    parser.add_argument('--pattern', 
                        default='*.pdf',
                        help='Glob pattern for PDF files (default: *.pdf)')
    parser.add_argument('--api-key',
                        help='Anthropic API key (alternative to env variable)')
    parser.add_argument('--test',
                        action='store_true',
                        help='Test mode: process only first PDF file')
    parser.add_argument('--skip-existing',
                        action='store_true',
                        help='Skip PDFs that already have results in output directory')
    parser.add_argument('--no-cleanup',
                        action='store_true',
                        help='Keep temporary image files after processing')
    args = parser.parse_args()
    # Validate directories
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    if not input_dir.exists():
        print(f"Error: Input directory '{input_dir}' does not exist")
        sys.exit(1)
    # Get API key
    api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
-        raise ValueError("ANTHROPIC_API_KEY environment variable not set")
+        print("Error: No API key provided. Either set ANTHROPIC_API_KEY environment variable "
              "or use --api-key option")
        sys.exit(1)
-    input_dir = "path/to/pdfs"
+    try:
-    output_dir = "path/to/output"
+        # Initialize processor
        processor = PDFProcessor(str(input_dir), str(output_dir), api_key)
-    processor = PDFProcessor(input_dir, output_dir, api_key)
+        # Modify glob pattern if in test mode
-    results = processor.process_pdfs()
+        if args.test:
            print("Running in test mode - will process only first PDF file")
            pdf_files = list(input_dir.glob(args.pattern))[:1]
        else:
            pdf_files = list(input_dir.glob(args.pattern))
-    print(f"\nProcessed {len(results)} PDF files")
+        if not pdf_files:
-    print(f"Results saved to: {processor.output_dir}/processing_results.json")
+            print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
            sys.exit(1)
        print(f"Found {len(pdf_files)} PDF files to process")
        # Process files
        results = processor.process_pdfs()
        # Cleanup temporary files unless --no-cleanup was specified
        if not args.no_cleanup:
            print("Cleaning up temporary files...")
            for result in results:
                if 'image_path' in result:
                    try:
                        Path(result['image_path']).unlink()
                    except Exception as e:
                        print(f"Warning: Could not delete temporary file {result['image_path']}: {e}")
        print(f"\nProcessed {len(results)} PDF files")
        print(f"Results saved to: {processor.output_dir}/processing_results.json")
    except KeyboardInterrupt:
        print("\nOperation cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)
 if __name__ == "__main__":
    main()