73 lines
No EOL
2.3 KiB
Python
73 lines
No EOL
2.3 KiB
Python
from pdf2image import convert_from_path
|
|
import os
|
|
import json
|
|
from pathlib import Path
|
|
import tempfile
|
|
|
|
class PDFProcessor:
    """Extract the first page of every PDF in a directory as a JPEG image.

    Page images are written to a per-instance temporary directory
    (``self.temp_dir``) and a JSON summary of the run is written to
    ``processing_results.json`` inside the output directory.
    """

    def __init__(self, input_dir, output_dir):
        """Set up the input, output and scratch locations.

        Args:
            input_dir: Directory that is scanned for ``*.pdf`` files.
            output_dir: Directory that receives ``processing_results.json``;
                created (including parents) if it does not exist.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Create temp directory for images. Nothing in this class deletes
        # it: the result records point at image files stored inside it.
        self.temp_dir = Path(tempfile.mkdtemp())

    def process_pdfs(self):
        """Process all PDFs in the input directory.

        Each file is handled independently: a failure is recorded as an
        ``'error'`` entry and does not stop the remaining files.

        Returns:
            A list with one result dict per PDF, ordered by sorted path.
            The same list is saved to ``processing_results.json`` in the
            output directory.
        """
        # glob() yields entries in arbitrary filesystem order; sort so the
        # returned list and the JSON report are reproducible between runs.
        pdf_files = sorted(self.input_dir.glob('*.pdf'))
        results = []

        for pdf_path in pdf_files:
            try:
                result = self.process_single_pdf(pdf_path)
            except Exception as e:
                # Batch boundary: one bad PDF must not abort the whole run.
                print(f"Error processing {pdf_path}: {str(e)}")
                result = {
                    'pdf_path': str(pdf_path),
                    'status': 'error',
                    'error': str(e),
                }
            results.append(result)

        # Save results to JSON
        report_path = self.output_dir / 'processing_results.json'
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)

        return results

    def process_single_pdf(self, pdf_path):
        """Render the first page of one PDF and save it as a JPEG.

        Args:
            pdf_path: Location of the PDF file (``str`` or ``Path``).

        Returns:
            A dict with ``'pdf_path'``, ``'image_path'`` and ``'status'``
            (``'image_extracted'``) keys.

        Raises:
            RuntimeError: If the converter returns no page image.
        """
        # Accept plain strings as well; ``.stem`` below needs a Path.
        pdf_path = Path(pdf_path)
        print(f"Processing: {pdf_path}")

        # Convert first page to image
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            raise RuntimeError("Could not extract first page")

        # Save first page image
        first_page = images[0]
        image_path = self.temp_dir / f"{pdf_path.stem}_page1.jpg"
        first_page.save(str(image_path), 'JPEG')

        # TODO: This is where we'll integrate with the vision analysis
        # For now, we'll just return the paths
        return {
            'pdf_path': str(pdf_path),
            'image_path': str(image_path),
            'status': 'image_extracted',
        }
|
|
|
|
def main(input_dir="path/to/pdfs", output_dir="path/to/output"):
    """Process every PDF in ``input_dir`` and print a short summary.

    Args:
        input_dir: Directory containing the PDF files. The default is a
            placeholder example path.
        output_dir: Directory that receives ``processing_results.json``.
            The default is a placeholder example path.
    """
    processor = PDFProcessor(input_dir, output_dir)
    results = processor.process_pdfs()

    print(f"\nProcessed {len(results)} PDF files")
    print(f"Results saved to: {processor.output_dir}/processing_results.json")
|
|
|
|
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()