pdf_processor.py aktualisiert

This commit is contained in:
Sebastian Mondial 2025-02-19 21:43:49 +00:00
parent d611992eed
commit 8b27d17f44
Notes: Sebastian Mondial 2025-02-19 21:44:32 +00:00
Version 4

View file

@@ -7,6 +7,8 @@ import base64
import anthropic
from typing import List, Dict
import time
import argparse
import sys
class PDFProcessor:
def __init__(self, input_dir: str, output_dir: str, api_key: str):
@@ -127,19 +129,105 @@ class PDFProcessor:
}
def main():
    """Command-line entry point: parse arguments, run the processor, clean up.

    Reads the input/output directories, glob pattern and flags from the
    command line, resolves the Anthropic API key (``--api-key`` takes
    precedence over the ``ANTHROPIC_API_KEY`` environment variable), runs
    ``PDFProcessor.process_pdfs()`` and, unless ``--no-cleanup`` is given,
    deletes every ``image_path`` reported in the results.

    Exits with status 1 when the input directory does not exist, no API key
    is available, no file matches the pattern, the user interrupts the run,
    or processing raises an exception.
    """
    parser = argparse.ArgumentParser(
        description='Process PDFs to extract magazine metadata using Claude Vision API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The epilog text is reproduced exactly as it appears in the source
        # view (flush-left), because RawDescriptionHelpFormatter prints it
        # verbatim.
        epilog='''
Examples:
# Process all PDFs in current directory
python pdf_processor.py -i . -o results
# Process specific PDFs with custom batch size
python pdf_processor.py -i /path/to/pdfs -o /path/to/output --pattern "magazine_*.pdf"
# Test mode with single file
python pdf_processor.py -i /path/to/pdfs -o /path/to/output --test
'''
    )
    parser.add_argument('-i', '--input-dir',
                        required=True,
                        help='Directory containing PDF files')
    parser.add_argument('-o', '--output-dir',
                        required=True,
                        help='Directory for output files')
    parser.add_argument('--pattern',
                        default='*.pdf',
                        help='Glob pattern for PDF files (default: *.pdf)')
    parser.add_argument('--api-key',
                        help='Anthropic API key (alternative to env variable)')
    parser.add_argument('--test',
                        action='store_true',
                        help='Test mode: process only first PDF file')
    # NOTE(review): args.skip_existing is parsed but never read in this
    # function -- confirm whether the processor is meant to receive it.
    parser.add_argument('--skip-existing',
                        action='store_true',
                        help='Skip PDFs that already have results in output directory')
    parser.add_argument('--no-cleanup',
                        action='store_true',
                        help='Keep temporary image files after processing')
    args = parser.parse_args()

    # Validate directories
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    if not input_dir.exists():
        print(f"Error: Input directory '{input_dir}' does not exist")
        sys.exit(1)

    # Get API key: the command-line option wins over the environment variable.
    api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: No API key provided. Either set ANTHROPIC_API_KEY environment variable "
              "or use --api-key option")
        sys.exit(1)

    try:
        # Initialize processor (it is handed plain string paths).
        processor = PDFProcessor(str(input_dir), str(output_dir), api_key)

        if args.test:
            print("Running in test mode - will process only first PDF file")
        pdf_files = list(input_dir.glob(args.pattern))
        if args.test:
            # Test mode keeps only the first match.
            pdf_files = pdf_files[:1]

        if not pdf_files:
            print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
            # SystemExit is not an Exception subclass, so the handlers below
            # let it propagate unchanged.
            sys.exit(1)
        print(f"Found {len(pdf_files)} PDF files to process")

        # Process files
        # NOTE(review): pdf_files is only used for the emptiness check and the
        # count above; process_pdfs() is called without it, so --pattern and
        # --test may not restrict what is actually processed. process_pdfs()
        # is defined outside this view -- confirm its signature before wiring
        # the list through.
        results = processor.process_pdfs()

        # Cleanup temporary files unless --no-cleanup was specified
        if not args.no_cleanup:
            print("Cleaning up temporary files...")
            for result in results:
                if 'image_path' in result:
                    try:
                        Path(result['image_path']).unlink()
                    except Exception as e:
                        # Best effort: a file that cannot be removed must not
                        # fail an otherwise successful run.
                        print(f"Warning: Could not delete temporary file {result['image_path']}: {e}")

        print(f"\nProcessed {len(results)} PDF files")
        print(f"Results saved to: {processor.output_dir}/processing_results.json")
    except KeyboardInterrupt:
        print("\nOperation cancelled by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"Error: {str(e)}")
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()