V6 - with metadata connector

Write that metadata :-D
This commit is contained in:
Sebastian Mondial 2025-02-19 21:55:20 +00:00
parent 39e61b7f68
commit 4c18bd2cf9

View file

@@ -12,6 +12,8 @@ import sys
from tqdm import tqdm from tqdm import tqdm
import logging import logging
from datetime import datetime from datetime import datetime
from metadata_writer import PDFMetadataWriter
class PDFProcessor: class PDFProcessor:
def __init__(self, input_dir: str, output_dir: str, api_key: str, logger: logging.Logger = None): def __init__(self, input_dir: str, output_dir: str, api_key: str, logger: logging.Logger = None):
@@ -350,6 +352,15 @@ Examples:
parser.add_argument('--debug', parser.add_argument('--debug',
action='store_true', action='store_true',
help='Enable debug logging') help='Enable debug logging')
parser.add_argument('--write-metadata',
action='store_true',
help='Write extracted metadata back to PDF files')
parser.add_argument('--no-backup',
action='store_true',
help='Skip creating backups when writing metadata')
args = parser.parse_args() args = parser.parse_args()
@@ -390,6 +401,20 @@ Examples:
# Process files # Process files
results = processor.process_pdfs() results = processor.process_pdfs()
if args.write_metadata:
logger.info("Writing metadata back to PDF files...")
writer = PDFMetadataWriter(logger)
stats = writer.batch_write_metadata(results, backup=not args.no_backup)
logger.info("\nMetadata Writing Results:")
logger.info(f"Successfully updated: {stats['success_count']} files")
logger.info(f"Failed to update: {stats['failure_count']} files")
if stats['failure_count'] > 0:
logger.info("\nFailed files:")
for failed_file in stats['failure']:
logger.info(f" - {failed_file}")
# Cleanup temporary files unless --no-cleanup was specified # Cleanup temporary files unless --no-cleanup was specified
if not args.no_cleanup: if not args.no_cleanup: