metadata_reviewer.py hinzugefügt
This commit is contained in:
parent
09e0f8e39b
commit
234ebcb0b7
1 changed files with 189 additions and 0 deletions
189
metadata_reviewer.py
Normal file
189
metadata_reviewer.py
Normal file
|
@ -0,0 +1,189 @@
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from typing import Dict, List
|
||||||
|
import logging
|
||||||
|
from metadata_writer import PDFMetadataWriter
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.prompt import Prompt, Confirm
|
||||||
|
from rich.panel import Panel
|
||||||
|
import tempfile
|
||||||
|
from pdf2image import convert_from_path
|
||||||
|
import os
|
||||||
|
from rich.progress import Progress
|
||||||
|
|
||||||
|
class MetadataReviewer:
    """Interactive terminal tool for reviewing extracted PDF metadata.

    The reviewer reads a JSON results file, shows each completed entry in a
    table (together with a rendered first-page preview when possible), lets
    the user correct the values, and saves the outcome next to the input
    file as ``reviewed_<original name>``.
    """

    # Metadata keys shown in the table and offered for editing, in order.
    REVIEW_FIELDS = ('title', 'date', 'publisher', 'issue_number')
    # Values accepted by the confidence prompt.
    CONFIDENCE_LEVELS = ('high', 'medium', 'low')

    def __init__(self, results_file: str, logger: logging.Logger = None):
        """Set up the reviewer.

        Args:
            results_file: Path of the JSON file holding the extraction results.
            logger: Logger to report problems to; defaults to the
                ``'MetadataReviewer'`` logger.
        """
        self.results_file = Path(results_file)
        self.logger = logger or logging.getLogger('MetadataReviewer')
        self.console = Console()
        # Scratch directory for preview images; removed again by cleanup().
        self.temp_dir = Path(tempfile.mkdtemp())

    def cleanup(self):
        """Remove the scratch directory holding preview images.

        Safe to call more than once; errors while deleting are ignored
        because the previews are disposable.
        """
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def load_results(self) -> List[Dict]:
        """Load the list of results from the JSON results file.

        Returns:
            The decoded JSON list.

        Raises:
            SystemExit: With exit code 1 when the file cannot be read, is not
                valid JSON, or does not contain a list at the top level.
        """
        try:
            with open(self.results_file, 'r', encoding='utf-8') as f:
                results = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            self.logger.error(f"Error loading results file: {str(e)}")
            sys.exit(1)

        if not isinstance(results, list):
            # The review loop iterates over result dicts; anything else would
            # fail later with a far less helpful error.
            self.logger.error(
                "Error loading results file: expected a JSON list of results"
            )
            sys.exit(1)
        return results

    def display_metadata(self, result: Dict):
        """Print the metadata of one result as a formatted table.

        Args:
            result: Result entry; must contain ``'pdf_path'`` and may contain
                a ``'metadata'`` dict.
        """
        metadata = result.get('metadata') or {}

        table = Table(title=f"Metadata for: {Path(result['pdf_path']).name}")
        table.add_column("Field", style="cyan")
        table.add_column("Value", style="yellow")
        table.add_column("Confidence", style="green")

        # A single confidence value applies to the whole entry; it is
        # stringified because table cells must be renderable text.
        confidence = str(metadata.get('confidence', 'unknown'))
        for field in self.REVIEW_FIELDS:
            table.add_row(
                field.replace('_', ' ').title(),
                str(metadata.get(field, 'N/A')),
                confidence,
            )

        self.console.print(table)

    def edit_metadata(self, metadata: Dict) -> Dict:
        """Interactively edit a metadata dict.

        Args:
            metadata: Current metadata; it is not modified.

        Returns:
            A shallow copy of ``metadata`` with the user's changes applied.
            Fields the user leaves untouched keep their original value (and
            type).
        """
        new_metadata = metadata.copy()

        self.console.print("\n[yellow]Edit metadata (press Enter to keep current value):[/yellow]")

        for field in self.REVIEW_FIELDS:
            current_value = metadata.get(field, '')
            shown_default = str(current_value) if current_value else ''
            answer = Prompt.ask(
                f"{field.replace('_', ' ').title()}",
                default=shown_default,
            )
            if not answer or answer == shown_default:
                # Unchanged: keep the original object so that e.g. an integer
                # issue number is not silently turned into a string.
                new_metadata[field] = current_value
            else:
                new_metadata[field] = answer

        # Allow confidence adjustment
        new_metadata['confidence'] = Prompt.ask(
            "Confidence",
            choices=list(self.CONFIDENCE_LEVELS),
            default=metadata.get('confidence', 'medium'),
        )

        return new_metadata

    @staticmethod
    def _viewer_command(preview_path, platform=None):
        """Return the argv list that opens ``preview_path``, or ``None``.

        Args:
            preview_path: Image file to open.
            platform: Platform identifier in ``sys.platform`` form; defaults
                to the running platform.

        Returns:
            ``[launcher, path]`` for macOS and Linux, ``None`` for platforms
            without a launcher command (including Windows, which is handled
            separately by the caller).
        """
        if platform is None:
            platform = sys.platform
        launcher = {'darwin': 'open', 'linux': 'xdg-open'}.get(platform)
        if launcher is None:
            return None
        return [launcher, str(preview_path)]

    def _open_preview(self, preview_path):
        """Open a preview image in the platform's default viewer, if any."""
        import subprocess

        if sys.platform == 'win32':
            os.startfile(str(preview_path))
            return
        command = self._viewer_command(preview_path)
        if command is not None:
            # An argv list (no shell) keeps quotes or shell metacharacters in
            # the file name from being interpreted as part of a command.
            subprocess.run(command, check=False)

    def show_preview(self, pdf_path: str):
        """Render the first page of a PDF to an image and try to open it.

        This is best effort: any failure (conversion, saving, launching the
        viewer) is logged at debug level and otherwise ignored.

        Args:
            pdf_path: Path of the PDF to preview.
        """
        try:
            # Convert first page to image
            images = convert_from_path(pdf_path, first_page=1, last_page=1)
            if not images:
                return

            # The scratch directory may already have been removed by cleanup().
            self.temp_dir.mkdir(parents=True, exist_ok=True)
            preview_path = self.temp_dir / f"preview_{Path(pdf_path).stem}.jpg"
            images[0].save(str(preview_path))

            # Always print the path so it can be opened manually
            self.console.print(f"\n[blue]Preview saved to: {preview_path}[/blue]")

            # On compatible systems, try to open the image
            self._open_preview(preview_path)
        except Exception as e:
            self.logger.debug(f"Could not create preview: {str(e)}")

    def review_and_correct(self, write_changes: bool = False) -> List[Dict]:
        """Run the interactive review over every result and save the outcome.

        Entries whose ``'status'`` is not ``'completed'`` are passed through
        unchanged. If the user declines to continue, the entries not yet
        visited are left out of the returned list and the saved file.
        Preview images are deleted once the review loop ends.

        Args:
            write_changes: When true, hand the reviewed results to
                ``PDFMetadataWriter.batch_write_metadata`` and print the
                ``success_count`` / ``failure_count`` it reports.

        Returns:
            The reviewed (and possibly edited) result entries.
        """
        results = self.load_results()
        modified_results = []

        try:
            # NOTE(review): prompts are issued while the progress display is
            # live; confirm this renders acceptably in the target terminals.
            with Progress() as progress:
                task = progress.add_task("[cyan]Reviewing...", total=len(results))

                for result in results:
                    progress.update(task, advance=1)

                    if result.get('status') != 'completed':
                        self.logger.warning(f"Skipping incomplete result: {result.get('pdf_path')}")
                        modified_results.append(result)
                        continue

                    self.console.clear()
                    self.console.print(Panel(f"Reviewing: {Path(result['pdf_path']).name}"))

                    # Show preview if possible
                    self.show_preview(result['pdf_path'])

                    # Display current metadata
                    self.display_metadata(result)

                    # Ask if metadata needs correction
                    if Confirm.ask("\nDo you want to edit this metadata?"):
                        # Tolerate a missing/empty 'metadata' entry, exactly
                        # as display_metadata does.
                        result['metadata'] = self.edit_metadata(result.get('metadata') or {})
                        result['metadata']['manually_reviewed'] = True

                        # Show updated metadata
                        self.console.print("\n[green]Updated metadata:[/green]")
                        self.display_metadata(result)

                    modified_results.append(result)

                    if not Confirm.ask("\nContinue to next file?", default=True):
                        break
        finally:
            # Previews are only useful while reviewing; do not leak the
            # scratch directory.
            self.cleanup()

        # Save modified results
        output_file = self.results_file.parent / f"reviewed_{self.results_file.name}"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(modified_results, f, indent=4, ensure_ascii=False)

        self.console.print(f"\n[green]Saved reviewed results to: {output_file}[/green]")

        # Write changes if requested
        if write_changes:
            writer = PDFMetadataWriter(self.logger)
            stats = writer.batch_write_metadata(modified_results)

            self.console.print("\n[yellow]Metadata Writing Results:[/yellow]")
            self.console.print(f"Successfully updated: {stats['success_count']} files")
            self.console.print(f"Failed to update: {stats['failure_count']} files")

        return modified_results
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: parse arguments and run the review.

    Takes the results JSON file as positional argument, plus the optional
    ``--write`` (write corrected metadata back to the PDFs) and ``--debug``
    (debug-level logging) switches.
    """
    cli = argparse.ArgumentParser(
        description='Review and correct extracted PDF metadata'
    )
    cli.add_argument('results_file',
                     help='JSON file containing extraction results')

    # Both optional switches are plain on/off flags.
    for flag, help_text in (
        ('--write', 'Write corrected metadata back to PDFs'),
        ('--debug', 'Enable debug logging'),
    ):
        cli.add_argument(flag, action='store_true', help=help_text)

    options = cli.parse_args()

    # Configure logging before anything can emit a record.
    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(level=log_level, format='%(message)s')

    # Hand over to the interactive reviewer.
    MetadataReviewer(
        options.results_file,
        logging.getLogger('MetadataReviewer'),
    ).review_and_correct(write_changes=options.write)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Loading…
Add table
Reference in a new issue