pdf-mass-cleanuptools/metadata_reviewer.py

189 lines
No EOL
7.2 KiB
Python

import argparse
import json
import logging
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

from pdf2image import convert_from_path
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress
from rich.prompt import Confirm, Prompt
from rich.table import Table

from metadata_writer import PDFMetadataWriter
class MetadataReviewer:
    """Interactive terminal reviewer for extracted PDF metadata.

    Loads extraction results from a JSON file, shows every completed entry
    (with a best-effort first-page preview), lets the user correct the
    fields, and saves the outcome next to the input file as
    ``reviewed_<original name>``.
    """

    # Metadata fields shown in the table and offered for editing, in order.
    METADATA_FIELDS = ('title', 'date', 'publisher', 'issue_number')
    # Values accepted by the confidence prompt.
    CONFIDENCE_LEVELS = ('high', 'medium', 'low')

    def __init__(self, results_file: str, logger: Optional[logging.Logger] = None):
        """Create a reviewer for *results_file*.

        Args:
            results_file: Path to the JSON file holding the extraction results.
            logger: Logger to use; defaults to the 'MetadataReviewer' logger.
        """
        self.results_file = Path(results_file)
        self.logger = logger or logging.getLogger('MetadataReviewer')
        self.console = Console()
        # Preview images are written here.
        # NOTE(review): this directory is never removed by this class.
        self.temp_dir = Path(tempfile.mkdtemp())

    def load_results(self) -> List[Dict]:
        """Load the list of extraction results from the JSON file.

        Returns:
            The decoded JSON list.

        Raises:
            SystemExit: with code 1 when the file cannot be read, is not
                valid JSON, or its top-level value is not a list.
        """
        try:
            with open(self.results_file, 'r', encoding='utf-8') as f:
                results = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            self.logger.error(f"Error loading results file: {str(e)}")
            sys.exit(1)
        if not isinstance(results, list):
            # The review loop iterates over result dicts; anything else
            # would fail later with a confusing AttributeError.
            self.logger.error(
                "Error loading results file: expected a JSON list of results, "
                f"got {type(results).__name__}"
            )
            sys.exit(1)
        return results

    def display_metadata(self, result: Dict):
        """Print the metadata of *result* as a Field/Value/Confidence table.

        Args:
            result: One result entry; ``result['pdf_path']`` is required,
                ``result['metadata']`` may be missing or null.
        """
        metadata = result.get('metadata') or {}
        table = Table(title=f"Metadata for: {Path(result['pdf_path']).name}")
        table.add_column("Field", style="cyan")
        table.add_column("Value", style="yellow")
        table.add_column("Confidence", style="green")
        # rich only accepts strings/renderables as cells, so a numeric
        # confidence must be stringified before it reaches add_row.
        confidence = metadata.get('confidence', 'unknown')
        confidence_text = 'unknown' if confidence is None else str(confidence)
        for field in self.METADATA_FIELDS:
            table.add_row(
                field.replace('_', ' ').title(),
                str(metadata.get(field, 'N/A')),
                confidence_text,
            )
        self.console.print(table)

    def edit_metadata(self, metadata: Dict) -> Dict:
        """Interactively edit *metadata* and return the edited copy.

        An empty answer keeps the current value, so a field cannot be
        cleared through this prompt. The input dict is not modified.

        Args:
            metadata: Current metadata; ``None`` is treated as empty.

        Returns:
            A shallow copy of *metadata* with the edited fields and the
            chosen ``confidence``.
        """
        metadata = metadata or {}
        new_metadata = metadata.copy()
        self.console.print("\n[yellow]Edit metadata (press Enter to keep current value):[/yellow]")
        for field in self.METADATA_FIELDS:
            current_value = metadata.get(field, '')
            new_value = Prompt.ask(
                f"{field.replace('_', ' ').title()}",
                default=str(current_value) if current_value else ''
            )
            new_metadata[field] = new_value if new_value else current_value
        # Allow confidence adjustment; fall back to 'medium' when the stored
        # value is missing or null.
        current_confidence = metadata.get('confidence') or 'medium'
        new_metadata['confidence'] = Prompt.ask(
            "Confidence",
            choices=list(self.CONFIDENCE_LEVELS),
            default=current_confidence
        )
        return new_metadata

    @staticmethod
    def _viewer_command(preview_path, platform: Optional[str] = None) -> Optional[List[str]]:
        """Return the argv list that opens *preview_path*, or None.

        The path is passed as a single argv element, never through a shell,
        so quotes or ``$()`` in a file name cannot alter the command.

        Args:
            preview_path: Image file to open.
            platform: ``sys.platform``-style string; defaults to the
                running platform.

        Returns:
            ``['open', path]`` on macOS, ``['xdg-open', path]`` on Linux,
            ``None`` for every other platform.
        """
        platform = sys.platform if platform is None else platform
        if platform == 'darwin':  # macOS
            return ['open', str(preview_path)]
        if platform.startswith('linux'):
            return ['xdg-open', str(preview_path)]
        return None

    def _open_in_viewer(self, preview_path: Path):
        """Open *preview_path* with the platform's default viewer, if known."""
        if sys.platform == 'win32':
            # 'start' is a cmd.exe builtin; os.startfile is the shell-free
            # equivalent and only exists on Windows.
            os.startfile(str(preview_path))
            return
        command = self._viewer_command(preview_path)
        if command is not None:
            subprocess.run(command, check=False)

    def show_preview(self, pdf_path: str):
        """Render the first page of *pdf_path* to an image and try to open it.

        Best effort: any failure (conversion, saving, launching the viewer)
        is logged at debug level and otherwise ignored.
        """
        try:
            # Convert first page to image
            images = convert_from_path(pdf_path, first_page=1, last_page=1)
            if images:
                preview_path = self.temp_dir / f"preview_{Path(pdf_path).stem}.jpg"
                images[0].save(str(preview_path))
                self.console.print(f"\n[blue]Preview saved to: {preview_path}[/blue]")
                self._open_in_viewer(preview_path)
        except Exception as e:
            self.logger.debug(f"Could not create preview: {str(e)}")

    def review_and_correct(self, write_changes: bool = False) -> List[Dict]:
        """Run the interactive review over all results and save the outcome.

        Entries whose ``status`` is not ``'completed'`` are passed through
        unchanged. If the user declines to continue, the remaining entries
        are not reviewed and are not included in the saved output.

        Args:
            write_changes: When true, hand the reviewed results to
                ``PDFMetadataWriter.batch_write_metadata`` afterwards.

        Returns:
            The list of results processed before the loop ended.
        """
        results = self.load_results()
        modified_results = []
        # NOTE(review): prompts are issued while the Progress live display
        # is active; confirm this renders acceptably in the target terminals.
        with Progress() as progress:
            task = progress.add_task("[cyan]Reviewing...", total=len(results))
            for result in results:
                progress.update(task, advance=1)
                if result.get('status') != 'completed':
                    self.logger.warning(f"Skipping incomplete result: {result.get('pdf_path')}")
                    modified_results.append(result)
                    continue
                self.console.clear()
                self.console.print(Panel(f"Reviewing: {Path(result['pdf_path']).name}"))
                # Show preview if possible
                self.show_preview(result['pdf_path'])
                # Display current metadata
                self.display_metadata(result)
                # Ask if metadata needs correction
                if Confirm.ask("\nDo you want to edit this metadata?"):
                    # .get(): a completed entry may lack 'metadata', which
                    # display_metadata already tolerates.
                    result['metadata'] = self.edit_metadata(result.get('metadata') or {})
                    result['metadata']['manually_reviewed'] = True
                    # Show updated metadata
                    self.console.print("\n[green]Updated metadata:[/green]")
                    self.display_metadata(result)
                modified_results.append(result)
                if not Confirm.ask("\nContinue to next file?", default=True):
                    break
        # Save modified results
        output_file = self.results_file.parent / f"reviewed_{self.results_file.name}"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(modified_results, f, indent=4, ensure_ascii=False)
        self.console.print(f"\n[green]Saved reviewed results to: {output_file}[/green]")
        # Write changes if requested
        if write_changes:
            writer = PDFMetadataWriter(self.logger)
            stats = writer.batch_write_metadata(modified_results)
            self.console.print("\n[yellow]Metadata Writing Results:[/yellow]")
            self.console.print(f"Successfully updated: {stats['success_count']} files")
            self.console.print(f"Failed to update: {stats['failure_count']} files")
        return modified_results
def main():
    """Parse the command line and launch the interactive review session."""
    cli = argparse.ArgumentParser(description='Review and correct extracted PDF metadata')
    cli.add_argument('results_file', help='JSON file containing extraction results')
    cli.add_argument('--write', action='store_true',
                     help='Write corrected metadata back to PDFs')
    cli.add_argument('--debug', action='store_true', help='Enable debug logging')
    options = cli.parse_args()

    # Message-only log format; --debug lowers the threshold to DEBUG.
    log_level = logging.INFO
    if options.debug:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level, format='%(message)s')

    # Hand the named logger to the reviewer and run the review.
    review_logger = logging.getLogger('MetadataReviewer')
    session = MetadataReviewer(options.results_file, review_logger)
    session.review_and_correct(write_changes=options.write)
# Run the review only when executed as a script, not when imported.
if __name__ == "__main__":
    main()