pdf_processor.py aktualisiert
This commit is contained in:
parent
d611992eed
commit
8b27d17f44
Notes:
Sebastian Mondial
2025-02-19 21:44:32 +00:00
Version 4
1 changed files with 99 additions and 11 deletions
106
pdf_processor.py
106
pdf_processor.py
|
@ -7,6 +7,8 @@ import base64
|
||||||
import anthropic
|
import anthropic
|
||||||
from typing import List, Dict
|
from typing import List, Dict
|
||||||
import time
|
import time
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
class PDFProcessor:
|
class PDFProcessor:
|
||||||
def __init__(self, input_dir: str, output_dir: str, api_key: str):
|
def __init__(self, input_dir: str, output_dir: str, api_key: str):
|
||||||
|
@ -127,19 +129,105 @@ class PDFProcessor:
|
||||||
}
|
}
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Get API key from environment variable
|
parser = argparse.ArgumentParser(
|
||||||
api_key = os.getenv('ANTHROPIC_API_KEY')
|
description='Process PDFs to extract magazine metadata using Claude Vision API',
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
epilog='''
|
||||||
|
Examples:
|
||||||
|
# Process all PDFs in current directory
|
||||||
|
python pdf_processor.py -i . -o results
|
||||||
|
|
||||||
|
# Process specific PDFs with custom batch size
|
||||||
|
python pdf_processor.py -i /path/to/pdfs -o /path/to/output --pattern "magazine_*.pdf"
|
||||||
|
|
||||||
|
# Test mode with single file
|
||||||
|
python pdf_processor.py -i /path/to/pdfs -o /path/to/output --test
|
||||||
|
'''
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument('-i', '--input-dir',
|
||||||
|
required=True,
|
||||||
|
help='Directory containing PDF files')
|
||||||
|
|
||||||
|
parser.add_argument('-o', '--output-dir',
|
||||||
|
required=True,
|
||||||
|
help='Directory for output files')
|
||||||
|
|
||||||
|
parser.add_argument('--pattern',
|
||||||
|
default='*.pdf',
|
||||||
|
help='Glob pattern for PDF files (default: *.pdf)')
|
||||||
|
|
||||||
|
parser.add_argument('--api-key',
|
||||||
|
help='Anthropic API key (alternative to env variable)')
|
||||||
|
|
||||||
|
parser.add_argument('--test',
|
||||||
|
action='store_true',
|
||||||
|
help='Test mode: process only first PDF file')
|
||||||
|
|
||||||
|
parser.add_argument('--skip-existing',
|
||||||
|
action='store_true',
|
||||||
|
help='Skip PDFs that already have results in output directory')
|
||||||
|
|
||||||
|
parser.add_argument('--no-cleanup',
|
||||||
|
action='store_true',
|
||||||
|
help='Keep temporary image files after processing')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Validate directories
|
||||||
|
input_dir = Path(args.input_dir)
|
||||||
|
output_dir = Path(args.output_dir)
|
||||||
|
|
||||||
|
if not input_dir.exists():
|
||||||
|
print(f"Error: Input directory '{input_dir}' does not exist")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Get API key
|
||||||
|
api_key = args.api_key or os.getenv('ANTHROPIC_API_KEY')
|
||||||
if not api_key:
|
if not api_key:
|
||||||
raise ValueError("ANTHROPIC_API_KEY environment variable not set")
|
print("Error: No API key provided. Either set ANTHROPIC_API_KEY environment variable "
|
||||||
|
"or use --api-key option")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
input_dir = "path/to/pdfs"
|
try:
|
||||||
output_dir = "path/to/output"
|
# Initialize processor
|
||||||
|
processor = PDFProcessor(str(input_dir), str(output_dir), api_key)
|
||||||
|
|
||||||
processor = PDFProcessor(input_dir, output_dir, api_key)
|
# Modify glob pattern if in test mode
|
||||||
results = processor.process_pdfs()
|
if args.test:
|
||||||
|
print("Running in test mode - will process only first PDF file")
|
||||||
|
pdf_files = list(input_dir.glob(args.pattern))[:1]
|
||||||
|
else:
|
||||||
|
pdf_files = list(input_dir.glob(args.pattern))
|
||||||
|
|
||||||
print(f"\nProcessed {len(results)} PDF files")
|
if not pdf_files:
|
||||||
print(f"Results saved to: {processor.output_dir}/processing_results.json")
|
print(f"No PDF files found in '{input_dir}' matching pattern '{args.pattern}'")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
print(f"Found {len(pdf_files)} PDF files to process")
|
||||||
|
|
||||||
|
# Process files
|
||||||
|
results = processor.process_pdfs()
|
||||||
|
|
||||||
|
# Cleanup temporary files unless --no-cleanup was specified
|
||||||
|
if not args.no_cleanup:
|
||||||
|
print("Cleaning up temporary files...")
|
||||||
|
for result in results:
|
||||||
|
if 'image_path' in result:
|
||||||
|
try:
|
||||||
|
Path(result['image_path']).unlink()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Could not delete temporary file {result['image_path']}: {e}")
|
||||||
|
|
||||||
|
print(f"\nProcessed {len(results)} PDF files")
|
||||||
|
print(f"Results saved to: {processor.output_dir}/processing_results.json")
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\nOperation cancelled by user")
|
||||||
|
sys.exit(1)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error: {str(e)}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
Loading…
Add table
Reference in a new issue