metadata_reviewer.py hinzugefügt
This commit is contained in:
parent
09e0f8e39b
commit
234ebcb0b7
1 changed files with 189 additions and 0 deletions
189
metadata_reviewer.py
Normal file
189
metadata_reviewer.py
Normal file
|
@ -0,0 +1,189 @@
|
|||
import json
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import sys
|
||||
from typing import Dict, List
|
||||
import logging
|
||||
from metadata_writer import PDFMetadataWriter
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.prompt import Prompt, Confirm
|
||||
from rich.panel import Panel
|
||||
import tempfile
|
||||
from pdf2image import convert_from_path
|
||||
import os
|
||||
from rich.progress import Progress
|
||||
|
||||
class MetadataReviewer:
    """Interactive terminal reviewer for extracted PDF metadata.

    Loads extraction results from a JSON file, shows every completed entry
    (plus a first-page preview when one can be rendered), lets the user
    correct the fields, saves the reviewed entries next to the input file
    and, on request, hands them to ``PDFMetadataWriter`` to be written back.
    """

    # Metadata keys shown in the table and offered for editing, in order.
    FIELDS = ('title', 'date', 'publisher', 'issue_number')
    # Values accepted by the confidence prompt.
    CONFIDENCE_LEVELS = ('high', 'medium', 'low')

    def __init__(self, results_file: str, logger: logging.Logger = None):
        """Set up the reviewer.

        Args:
            results_file: Path of the JSON file holding the extraction results.
            logger: Logger to use; defaults to the 'MetadataReviewer' logger.
        """
        self.results_file = Path(results_file)
        self.logger = logger or logging.getLogger('MetadataReviewer')
        self.console = Console()
        # Preview images are stored here; this class never removes the
        # directory, so the previews outlive the review session.
        self.temp_dir = Path(tempfile.mkdtemp())

    def load_results(self) -> List[Dict]:
        """Load results from the JSON file.

        Returns:
            The parsed JSON content of ``self.results_file``.

        On any error while opening or parsing the file, the error is logged
        and the process exits with status 1.
        """
        try:
            with open(self.results_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            self.logger.error(f"Error loading results file: {str(e)}")
            sys.exit(1)

    def display_metadata(self, result: Dict):
        """Print the metadata of one result as a formatted table.

        Args:
            result: Result entry; must contain 'pdf_path', may contain
                'metadata'. Missing fields are shown as 'N/A'.
        """
        metadata = result.get('metadata') or {}

        table = Table(title=f"Metadata for: {Path(result['pdf_path']).name}")
        table.add_column("Field", style="cyan")
        table.add_column("Value", style="yellow")
        table.add_column("Confidence", style="green")

        # Table cells must be renderable: a numeric confidence would be
        # rejected by add_row, so anything that is not a string is
        # stringified (None is left alone and renders as an empty cell).
        confidence = metadata.get('confidence', 'unknown')
        if confidence is not None and not isinstance(confidence, str):
            confidence = str(confidence)

        for field in self.FIELDS:
            table.add_row(
                field.replace('_', ' ').title(),
                str(metadata.get(field, 'N/A')),
                confidence,
            )

        self.console.print(table)

    @staticmethod
    def _resolve_edit(current_value, entered: str):
        """Return the value to store for a field given the prompt's answer.

        An empty answer, or an answer equal to the textual form of the
        current value (what the prompt returns when Enter is pressed on the
        default), keeps the current value with its original type.
        """
        if not entered or entered == str(current_value):
            return current_value
        return entered

    def edit_metadata(self, metadata: Dict) -> Dict:
        """Interactively edit metadata.

        Args:
            metadata: Current metadata; it is not modified.

        Returns:
            A shallow copy of ``metadata`` with the prompted fields and the
            confidence updated.
        """
        new_metadata = metadata.copy()

        self.console.print("\n[yellow]Edit metadata (press Enter to keep current value):[/yellow]")

        for field in self.FIELDS:
            current_value = metadata.get(field, '')
            new_value = Prompt.ask(
                f"{field.replace('_', ' ').title()}",
                default=str(current_value) if current_value else ''
            )
            # Keeping a value must not turn e.g. an int issue number into str.
            new_metadata[field] = self._resolve_edit(current_value, new_value)

        # Allow confidence adjustment
        new_metadata['confidence'] = Prompt.ask(
            "Confidence",
            choices=list(self.CONFIDENCE_LEVELS),
            default=metadata.get('confidence', 'medium')
        )

        return new_metadata

    @staticmethod
    def _open_in_viewer(path: Path) -> None:
        """Open ``path`` with the platform's default viewer, if one is known."""
        # Local import: subprocess is not among the module-level imports.
        import subprocess

        target = str(path)
        # Argument lists (no shell) so characters in the file name, which
        # derives from the results JSON, can never be run as shell syntax.
        if sys.platform == 'darwin':  # macOS
            subprocess.run(['open', target], check=False)
        elif sys.platform == 'linux':
            subprocess.run(['xdg-open', target], check=False)
        elif sys.platform == 'win32':
            os.startfile(target)

    def show_preview(self, pdf_path: str):
        """Render the first page of the PDF to an image and try to open it.

        Best effort: any failure is logged at debug level and otherwise
        ignored.

        Args:
            pdf_path: Path of the PDF to preview.
        """
        try:
            # Convert first page to image
            images = convert_from_path(pdf_path, first_page=1, last_page=1)
            if not images:
                return

            preview_path = self.temp_dir / f"preview_{Path(pdf_path).stem}.jpg"
            images[0].save(str(preview_path))

            # Always tell the user where the preview is, in case no viewer opens
            self.console.print(f"\n[blue]Preview saved to: {preview_path}[/blue]")

            self._open_in_viewer(preview_path)
        except Exception as e:
            self.logger.debug(f"Could not create preview: {str(e)}")

    def review_and_correct(self, write_changes: bool = False) -> List[Dict]:
        """Main review and correction process.

        Walks through the loaded results. Entries whose status is not
        'completed' are passed through unchanged; every other entry is shown
        and may be edited. The processed entries are saved to
        ``reviewed_<results file name>`` in the same directory as the input.

        If the user declines to continue, the loop stops and the entries not
        yet reached are left out of both the saved file and the return value.

        Args:
            write_changes: When true, pass the processed entries to
                ``PDFMetadataWriter.batch_write_metadata`` and print its
                success/failure counts.

        Returns:
            The list of processed entries.
        """
        results = self.load_results()
        modified_results = []

        with Progress() as progress:
            task = progress.add_task("[cyan]Reviewing...", total=len(results))

            for result in results:
                progress.update(task, advance=1)

                if result.get('status') != 'completed':
                    self.logger.warning(f"Skipping incomplete result: {result.get('pdf_path')}")
                    modified_results.append(result)
                    continue

                self.console.clear()
                self.console.print(Panel(f"Reviewing: {Path(result['pdf_path']).name}"))

                # Show preview if possible
                self.show_preview(result['pdf_path'])

                # Display current metadata
                self.display_metadata(result)

                # Ask if metadata needs correction
                if Confirm.ask("\nDo you want to edit this metadata?"):
                    # A completed entry without usable metadata starts from an
                    # empty dict, matching how display_metadata treats it.
                    result['metadata'] = self.edit_metadata(result.get('metadata') or {})
                    result['metadata']['manually_reviewed'] = True

                    # Show updated metadata
                    self.console.print("\n[green]Updated metadata:[/green]")
                    self.display_metadata(result)

                modified_results.append(result)

                if not Confirm.ask("\nContinue to next file?", default=True):
                    break

        # Save modified results
        output_file = self.results_file.parent / f"reviewed_{self.results_file.name}"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(modified_results, f, indent=4, ensure_ascii=False)

        self.console.print(f"\n[green]Saved reviewed results to: {output_file}[/green]")

        # Write changes if requested
        if write_changes:
            writer = PDFMetadataWriter(self.logger)
            stats = writer.batch_write_metadata(modified_results)

            self.console.print("\n[yellow]Metadata Writing Results:[/yellow]")
            self.console.print(f"Successfully updated: {stats['success_count']} files")
            self.console.print(f"Failed to update: {stats['failure_count']} files")

        return modified_results
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, set up logging, run the review."""
    cli = argparse.ArgumentParser(
        description='Review and correct extracted PDF metadata'
    )
    cli.add_argument(
        'results_file',
        help='JSON file containing extraction results',
    )
    # Both optional switches are plain boolean flags.
    for flag, help_text in (
        ('--write', 'Write corrected metadata back to PDFs'),
        ('--debug', 'Enable debug logging'),
    ):
        cli.add_argument(flag, action='store_true', help=help_text)

    options = cli.parse_args()

    # Setup logging: message-only format, verbose only with --debug
    log_level = logging.INFO
    if options.debug:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level, format='%(message)s')

    # Run review process
    MetadataReviewer(
        options.results_file,
        logging.getLogger('MetadataReviewer'),
    ).review_and_correct(write_changes=options.write)
|
||||
|
||||
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script.
    main()
|
Loading…
Add table
Reference in a new issue