From 4c18bd2cf97abc9daa89ff1d59fab36094b54e34 Mon Sep 17 00:00:00 2001 From: sebastian Date: Wed, 19 Feb 2025 21:55:20 +0000 Subject: [PATCH] V6 - with metadata connector Write that metadata :-D --- pdf_processor.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pdf_processor.py b/pdf_processor.py index 41138ef..28a22ba 100644 --- a/pdf_processor.py +++ b/pdf_processor.py @@ -12,6 +12,8 @@ import sys from tqdm import tqdm import logging from datetime import datetime +from metadata_writer import PDFMetadataWriter + class PDFProcessor: def __init__(self, input_dir: str, output_dir: str, api_key: str, logger: logging.Logger = None): @@ -350,6 +352,15 @@ Examples: parser.add_argument('--debug', action='store_true', help='Enable debug logging') + + parser.add_argument('--write-metadata', + action='store_true', + help='Write extracted metadata back to PDF files') + + parser.add_argument('--no-backup', + action='store_true', + help='Skip creating backups when writing metadata') + args = parser.parse_args() @@ -390,6 +401,20 @@ Examples: # Process files results = processor.process_pdfs() + + if args.write_metadata: + logger.info("Writing metadata back to PDF files...") + writer = PDFMetadataWriter(logger) + stats = writer.batch_write_metadata(results, backup=not args.no_backup) + + logger.info("\nMetadata Writing Results:") + logger.info(f"Successfully updated: {stats['success_count']} files") + logger.info(f"Failed to update: {stats['failure_count']} files") + + if stats['failure_count'] > 0: + logger.info("\nFailed files:") + for failed_file in stats['failure']: + logger.info(f" - {failed_file}") # Cleanup temporary files unless --no-cleanup was specified if not args.no_cleanup: