"""Interactive command-line review of extracted PDF metadata.

Loads a JSON file of extraction results, shows each completed result to the
user (with an optional first-page preview), lets them correct the metadata,
saves the reviewed results next to the input file and, on request, writes
the metadata back into the PDFs.
"""

import argparse
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

from pdf2image import convert_from_path
from rich.console import Console
from rich.markup import escape
from rich.panel import Panel
from rich.progress import Progress
from rich.prompt import Confirm, Prompt
from rich.table import Table

from metadata_writer import PDFMetadataWriter


class MetadataReviewer:
    """Walk through extraction results and let the user correct them."""

    # Metadata keys shown in the table and offered for editing, in order.
    FIELDS = ('title', 'date', 'publisher', 'issue_number')
    # Values accepted by the confidence prompt.
    CONFIDENCE_LEVELS = ['high', 'medium', 'low']

    def __init__(self, results_file: str, logger: Optional[logging.Logger] = None):
        """Set up the reviewer.

        Args:
            results_file: Path to the JSON file containing extraction results.
            logger: Logger to use; defaults to the 'MetadataReviewer' logger.
        """
        self.results_file = Path(results_file)
        self.logger = logger or logging.getLogger('MetadataReviewer')
        self.console = Console()
        # Holds the preview images; removed again by cleanup().
        self.temp_dir = Path(tempfile.mkdtemp())

    def cleanup(self) -> None:
        """Delete the temporary preview directory (safe to call repeatedly)."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def load_results(self) -> List[Dict]:
        """Load results from JSON file.

        Returns:
            The list of result dicts stored in ``self.results_file``.

        Exits the process with status 1 (after logging an error) when the
        file cannot be read, is not valid JSON, or does not contain a list.
        """
        try:
            with open(self.results_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            self.logger.error("Error loading results file: %s", e)
            sys.exit(1)

        if not isinstance(data, list):
            # The review loop iterates over result dicts; anything else would
            # only fail later with a confusing AttributeError.
            self.logger.error(
                "Error loading results file: expected a JSON list of results, "
                "got %s", type(data).__name__
            )
            sys.exit(1)
        return data

    def display_metadata(self, result: Dict):
        """Display metadata in a formatted table.

        Args:
            result: A result dict with a 'pdf_path' key and an optional
                'metadata' dict.
        """
        metadata = result.get('metadata') or {}
        pdf_name = Path(result['pdf_path']).name

        # Dynamic text is escaped so square brackets in file names or values
        # are shown literally instead of being parsed as rich markup.
        table = Table(title=f"Metadata for: {escape(pdf_name)}")
        table.add_column("Field", style="cyan")
        table.add_column("Value", style="yellow")
        table.add_column("Confidence", style="green")

        # One confidence value applies to the whole record; stringify it
        # because add_row only accepts renderables.
        confidence = escape(str(metadata.get('confidence', 'unknown')))
        for field in self.FIELDS:
            table.add_row(
                field.replace('_', ' ').title(),
                escape(str(metadata.get(field, 'N/A'))),
                confidence
            )

        self.console.print(table)

    def edit_metadata(self, metadata: Dict) -> Dict:
        """Interactively edit metadata.

        Args:
            metadata: The current metadata; it is not modified.

        Returns:
            A copy of ``metadata`` with the user's changes applied.
        """
        new_metadata = metadata.copy()

        self.console.print("\n[yellow]Edit metadata (press Enter to keep current value):[/yellow]")

        for field in self.FIELDS:
            current_value = metadata.get(field, '')
            shown_default = str(current_value) if current_value else ''
            new_value = Prompt.ask(
                f"{field.replace('_', ' ').title()}",
                default=shown_default
            )
            if not new_value or new_value == shown_default:
                # Nothing changed: keep the original value (and its type)
                # rather than the stringified default the prompt returns.
                new_metadata[field] = current_value
            else:
                new_metadata[field] = new_value

        # Allow confidence adjustment
        current_confidence = metadata.get('confidence', 'medium')
        new_metadata['confidence'] = Prompt.ask(
            "Confidence",
            choices=self.CONFIDENCE_LEVELS,
            default=current_confidence
        )

        return new_metadata

    @staticmethod
    def _open_in_viewer(path: Path) -> None:
        """Ask the desktop to open ``path`` with its default application."""
        # Argument lists (no shell) so that characters in the file name can
        # never be interpreted as shell syntax.
        if sys.platform == 'darwin':  # macOS
            subprocess.run(['open', str(path)], check=False)
        elif sys.platform.startswith('linux'):
            subprocess.run(['xdg-open', str(path)], check=False)
        elif sys.platform == 'win32':
            os.startfile(str(path))

    def show_preview(self, pdf_path: str):
        """Show first page preview if possible.

        Best effort: any failure (missing converter, unreadable PDF, no
        viewer available) is logged at debug level and otherwise ignored.

        Args:
            pdf_path: Path of the PDF whose first page should be rendered.
        """
        try:
            # Convert first page to image
            images = convert_from_path(pdf_path, first_page=1, last_page=1)
            if not images:
                return

            # Recreate the directory in case cleanup() already ran.
            self.temp_dir.mkdir(parents=True, exist_ok=True)
            preview_path = self.temp_dir / f"preview_{Path(pdf_path).stem}.jpg"
            images[0].save(str(preview_path))

            # Always print the path so the user can open it manually.
            self.console.print(
                f"\n[blue]Preview saved to: {escape(str(preview_path))}[/blue]"
            )

            # On supported systems, try to open the image
            self._open_in_viewer(preview_path)
        except Exception as e:
            self.logger.debug("Could not create preview: %s", e)

    def review_and_correct(self, write_changes: bool = False) -> List[Dict]:
        """Main review and correction process.

        Results whose status is not 'completed' are passed through
        untouched. If the user chooses not to continue, the results after
        that point are left out of the returned list and the saved file.

        Args:
            write_changes: When True, write the reviewed metadata back to
                the PDFs via ``PDFMetadataWriter``.

        Returns:
            The list of results that were skipped or reviewed.
        """
        results = self.load_results()
        modified_results = []

        with Progress() as progress:
            task = progress.add_task("[cyan]Reviewing...", total=len(results))

            for result in results:
                progress.update(task, advance=1)

                if result.get('status') != 'completed':
                    self.logger.warning(
                        "Skipping incomplete result: %s", result.get('pdf_path')
                    )
                    modified_results.append(result)
                    continue

                self.console.clear()
                self.console.print(
                    Panel(f"Reviewing: {escape(Path(result['pdf_path']).name)}")
                )

                # Show preview if possible
                self.show_preview(result['pdf_path'])

                # Display current metadata
                self.display_metadata(result)

                # Ask if metadata needs correction
                if Confirm.ask("\nDo you want to edit this metadata?"):
                    # A completed result may lack a metadata dict; start from
                    # an empty one instead of raising KeyError.
                    result['metadata'] = self.edit_metadata(result.get('metadata') or {})
                    result['metadata']['manually_reviewed'] = True

                    # Show updated metadata
                    self.console.print("\n[green]Updated metadata:[/green]")
                    self.display_metadata(result)

                modified_results.append(result)

                if not Confirm.ask("\nContinue to next file?", default=True):
                    break

        # Save modified results
        output_file = self.results_file.parent / f"reviewed_{self.results_file.name}"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(modified_results, f, indent=4, ensure_ascii=False)

        self.console.print(
            f"\n[green]Saved reviewed results to: {escape(str(output_file))}[/green]"
        )

        # Write changes if requested
        if write_changes:
            writer = PDFMetadataWriter(self.logger)
            stats = writer.batch_write_metadata(modified_results)

            self.console.print("\n[yellow]Metadata Writing Results:[/yellow]")
            self.console.print(f"Successfully updated: {stats['success_count']} files")
            self.console.print(f"Failed to update: {stats['failure_count']} files")

        return modified_results


def main():
    """Parse command-line arguments and run the interactive review."""
    parser = argparse.ArgumentParser(
        description='Review and correct extracted PDF metadata'
    )
    parser.add_argument('results_file', help='JSON file containing extraction results')
    parser.add_argument('--write', action='store_true',
                        help='Write corrected metadata back to PDFs')
    parser.add_argument('--debug', action='store_true', help='Enable debug logging')

    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format='%(message)s'
    )
    logger = logging.getLogger('MetadataReviewer')

    # Run review process
    reviewer = MetadataReviewer(args.results_file, logger)
    try:
        reviewer.review_and_correct(write_changes=args.write)
    finally:
        # Remove the preview images even when the review exits early.
        reviewer.cleanup()


if __name__ == "__main__":
    main()