# Recovered from the patch that introduced metadata_reviewer.py
# (commit 234ebcb0b70cd19c8e09b87c3965ed77916953ef).
"""Interactive command-line review of extracted PDF metadata.

Loads a JSON results file, shows each completed result (with an optional
first-page preview image), lets the user correct the metadata, saves the
reviewed results next to the input file and can optionally hand them to
``PDFMetadataWriter`` to be written back.
"""

import argparse
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

from pdf2image import convert_from_path
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress
from rich.prompt import Confirm, Prompt
from rich.table import Table

from metadata_writer import PDFMetadataWriter


class MetadataReviewer:
    """Walk the user through extraction results and collect corrections."""

    # Metadata keys that are displayed and offered for editing.
    _FIELDS = ('title', 'date', 'publisher', 'issue_number')
    # Values accepted by the confidence prompt.
    _CONFIDENCE_CHOICES = ('high', 'medium', 'low')

    def __init__(self, results_file: str, logger: Optional[logging.Logger] = None):
        """Remember the results file and create a scratch directory.

        Args:
            results_file: Path of the JSON file containing extraction results.
            logger: Logger to use; defaults to the 'MetadataReviewer' logger.
        """
        self.results_file = Path(results_file)
        self.logger = logger or logging.getLogger('MetadataReviewer')
        self.console = Console()
        # Preview images are written here; see cleanup().
        self.temp_dir = Path(tempfile.mkdtemp())

    def cleanup(self) -> None:
        """Remove the temporary preview directory (safe to call repeatedly)."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def load_results(self) -> List[Dict]:
        """Load results from JSON file.

        Returns:
            The decoded list of result dictionaries.

        Logs an error and exits the process with status 1 when the file
        cannot be read, is not valid JSON, or does not contain a list.
        """
        try:
            with open(self.results_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            self.logger.error("Error loading results file: %s", e)
            sys.exit(1)

        if not isinstance(data, list):
            # The review loop iterates over result dicts; fail early and clearly.
            self.logger.error(
                "Error loading results file: expected a JSON list, got %s",
                type(data).__name__,
            )
            sys.exit(1)
        return data

    def display_metadata(self, result: Dict):
        """Display metadata in a formatted table.

        Args:
            result: One result dictionary; reads 'pdf_path' and 'metadata'.
        """
        # 'metadata' may be missing or null; treat both as empty.
        metadata = result.get('metadata') or {}

        confidence = metadata.get('confidence')
        # Table cells must be renderable, so never pass a raw number through.
        confidence_text = 'unknown' if confidence is None else str(confidence)

        table = Table(title=f"Metadata for: {Path(result['pdf_path']).name}")
        table.add_column("Field", style="cyan")
        table.add_column("Value", style="yellow")
        table.add_column("Confidence", style="green")

        for field in self._FIELDS:
            table.add_row(
                field.replace('_', ' ').title(),
                str(metadata.get(field, 'N/A')),
                confidence_text,
            )

        self.console.print(table)

    def edit_metadata(self, metadata: Dict) -> Dict:
        """Interactively edit metadata.

        Args:
            metadata: Current metadata; it is not modified.

        Returns:
            A shallow copy of ``metadata`` with the user's changes applied.
        """
        new_metadata = metadata.copy()

        self.console.print("\n[yellow]Edit metadata (press Enter to keep current value):[/yellow]")

        for field in self._FIELDS:
            current_value = metadata.get(field, '')
            shown_default = str(current_value) if current_value else ''
            new_value = Prompt.ask(
                f"{field.replace('_', ' ').title()}",
                default=shown_default
            )
            if new_value and new_value != shown_default:
                new_metadata[field] = new_value
            else:
                # Unchanged input keeps the stored value, including its type.
                new_metadata[field] = current_value

        # Allow confidence adjustment
        current_confidence = metadata.get('confidence') or 'medium'
        new_metadata['confidence'] = Prompt.ask(
            "Confidence",
            choices=list(self._CONFIDENCE_CHOICES),
            default=current_confidence
        )

        return new_metadata

    @staticmethod
    def _open_in_default_viewer(path: Path) -> None:
        """Ask the operating system to open ``path`` with its default app."""
        # Argument lists (no shell) keep odd characters in file names from
        # being interpreted as shell syntax.
        if sys.platform == 'darwin':  # macOS
            subprocess.run(['open', str(path)], check=False)
        elif sys.platform == 'linux':
            subprocess.run(['xdg-open', str(path)], check=False)
        elif sys.platform == 'win32':
            os.startfile(str(path))

    def show_preview(self, pdf_path: str):
        """Show first page preview if possible.

        Best effort: any failure is logged at debug level and ignored.

        Args:
            pdf_path: Path of the PDF whose first page is rendered.
        """
        try:
            # Convert first page to image
            images = convert_from_path(pdf_path, first_page=1, last_page=1)
            if images:
                # The directory may have been removed by an earlier cleanup().
                self.temp_dir.mkdir(parents=True, exist_ok=True)
                preview_path = self.temp_dir / f"preview_{Path(pdf_path).stem}.jpg"
                images[0].save(str(preview_path))

                self.console.print(f"\n[blue]Preview saved to: {preview_path}[/blue]")

                # On supported platforms, try to open the image
                self._open_in_default_viewer(preview_path)

        except Exception as e:
            # Deliberately broad: a preview is optional and must not abort
            # the review.
            self.logger.debug("Could not create preview: %s", e)

    def review_and_correct(self, write_changes: bool = False) -> List[Dict]:
        """Main review and correction process.

        Results whose 'status' is not 'completed' are passed through
        unreviewed. If the user declines to continue, the loop stops and only
        the results handled so far are saved and returned.

        Args:
            write_changes: When true, pass the reviewed results to
                ``PDFMetadataWriter.batch_write_metadata`` after saving.

        Returns:
            The list of results that were processed.
        """
        results = self.load_results()
        modified_results = []

        # NOTE(review): prompts are issued while the Progress live display is
        # active; confirm this renders acceptably in the target terminals.
        with Progress() as progress:
            task = progress.add_task("[cyan]Reviewing...", total=len(results))

            for result in results:
                progress.update(task, advance=1)

                if result.get('status') != 'completed':
                    self.logger.warning(
                        "Skipping incomplete result: %s", result.get('pdf_path')
                    )
                    modified_results.append(result)
                    continue

                self.console.clear()
                self.console.print(Panel(f"Reviewing: {Path(result['pdf_path']).name}"))

                # Show preview if possible
                self.show_preview(result['pdf_path'])

                # Display current metadata
                self.display_metadata(result)

                # Ask if metadata needs correction
                if Confirm.ask("\nDo you want to edit this metadata?"):
                    # Tolerate a missing/null 'metadata' like display_metadata.
                    result['metadata'] = self.edit_metadata(result.get('metadata') or {})
                    result['metadata']['manually_reviewed'] = True

                    # Show updated metadata
                    self.console.print("\n[green]Updated metadata:[/green]")
                    self.display_metadata(result)

                modified_results.append(result)

                if not Confirm.ask("\nContinue to next file?", default=True):
                    break

        # Save modified results
        output_file = self.results_file.parent / f"reviewed_{self.results_file.name}"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(modified_results, f, indent=4, ensure_ascii=False)

        self.console.print(f"\n[green]Saved reviewed results to: {output_file}[/green]")

        # Write changes if requested
        if write_changes:
            writer = PDFMetadataWriter(self.logger)
            stats = writer.batch_write_metadata(modified_results)

            self.console.print("\n[yellow]Metadata Writing Results:[/yellow]")
            self.console.print(f"Successfully updated: {stats['success_count']} files")
            self.console.print(f"Failed to update: {stats['failure_count']} files")

        return modified_results


def main():
    """Parse command-line arguments and run the interactive review."""
    parser = argparse.ArgumentParser(
        description='Review and correct extracted PDF metadata'
    )

    parser.add_argument('results_file',
                        help='JSON file containing extraction results')

    parser.add_argument('--write',
                        action='store_true',
                        help='Write corrected metadata back to PDFs')

    parser.add_argument('--debug',
                        action='store_true',
                        help='Enable debug logging')

    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format='%(message)s'
    )
    logger = logging.getLogger('MetadataReviewer')

    # Run review process
    reviewer = MetadataReviewer(args.results_file, logger)
    try:
        reviewer.review_and_correct(write_changes=args.write)
    finally:
        # Remove the scratch directory even if the review exits early.
        reviewer.cleanup()


if __name__ == "__main__":
    main()