pdf_processor.py aktualisiert
This commit is contained in:
parent
d611992eed
commit
8b27d17f44
Notes:
Sebastian Mondial
2025-02-19 21:44:32 +00:00
Version 4
1 changed file with 99 additions and 11 deletions
100
pdf_processor.py
100
pdf_processor.py
|
@@ -7,6 +7,8 @@ import base64
|
|||
import anthropic
|
||||
from typing import List, Dict
|
||||
import time
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
class PDFProcessor:
|
||||
def __init__(self, input_dir: str, output_dir: str, api_key: str):
|
||||
|
@@ -127,19 +129,105 @@ class PDFProcessor:
|
|||
}
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, validate them, run the processor.

    The API key is taken from ``--api-key`` when given, otherwise from the
    ``ANTHROPIC_API_KEY`` environment variable.

    Exits with status 1 when the input directory does not exist, when no API
    key is available, when no file in the input directory matches
    ``--pattern``, when the run is interrupted with Ctrl-C, or when
    processing raises an exception.
    """
    parser = argparse.ArgumentParser(
        description='Process PDFs to extract magazine metadata using Claude Vision API',
        # Raw formatter keeps the line breaks of the examples below intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  # Process all PDFs in current directory
  python pdf_processor.py -i . -o results

  # Process only the PDFs matching a glob pattern
  python pdf_processor.py -i /path/to/pdfs -o /path/to/output --pattern "magazine_*.pdf"

  # Test mode with single file
  python pdf_processor.py -i /path/to/pdfs -o /path/to/output --test
'''
    )

    parser.add_argument('-i', '--input-dir',
                        required=True,
                        help='Directory containing PDF files')

    parser.add_argument('-o', '--output-dir',
                        required=True,
                        help='Directory for output files')

    parser.add_argument('--pattern',
                        default='*.pdf',
                        help='Glob pattern for PDF files (default: *.pdf)')

    parser.add_argument('--api-key',
                        help='Anthropic API key (alternative to env variable)')

    parser.add_argument('--test',
                        action='store_true',
                        help='Test mode: process only first PDF file')

    # NOTE(review): this flag is parsed but never read in this function;
    # confirm whether PDFProcessor is expected to receive it.
    parser.add_argument('--skip-existing',
                        action='store_true',
                        help='Skip PDFs that already have results in output directory')

    parser.add_argument('--no-cleanup',
                        action='store_true',
                        help='Keep temporary image files after processing')

    args = parser.parse_args()

    # Validate directories
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)

    if not input_dir.exists():
        print(f"Error: Input directory '{input_dir}' does not exist")
        sys.exit(1)

    # Get API key: the command-line option wins over the environment variable.
    api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        print("Error: No API key provided. Either set ANTHROPIC_API_KEY environment variable "
              "or use --api-key option")
        sys.exit(1)

    try:
        # Initialize processor with the validated CLI paths.
        processor = PDFProcessor(str(input_dir), str(output_dir), api_key)

        # In test mode only the first match is kept.
        if args.test:
            print("Running in test mode - will process only first PDF file")
            pdf_files = list(input_dir.glob(args.pattern))[:1]
        else:
            pdf_files = list(input_dir.glob(args.pattern))

        if not pdf_files:
            print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
            # SystemExit is not an Exception subclass, so the handlers below
            # do not intercept this exit.
            sys.exit(1)

        print(f"Found {len(pdf_files)} PDF files to process")

        # Process files
        # NOTE(review): pdf_files is not handed to process_pdfs(), so here
        # --pattern and --test only influence the check and count above;
        # confirm how process_pdfs() selects its inputs.
        results = processor.process_pdfs()

        # Cleanup temporary files unless --no-cleanup was specified
        if not args.no_cleanup:
            print("Cleaning up temporary files...")
            for result in results:
                if 'image_path' in result:
                    # Best effort: a file that cannot be removed is reported
                    # but does not abort the run.
                    try:
                        Path(result['image_path']).unlink()
                    except Exception as e:
                        print(f"Warning: Could not delete temporary file {result['image_path']}: {e}")

        print(f"\nProcessed {len(results)} PDF files")
        print(f"Results saved to: {processor.output_dir}/processing_results.json")

    except KeyboardInterrupt:
        print("\nOperation cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
Add table
Reference in a new issue