import os
import json
import time
import logging
from typing import List, Dict, Optional
from dataclasses import dataclass, field

from mineru import Mineru, Layout, Table
from sentence_transformers import SentenceTransformer
from llama_cpp import Llama
from fastapi.encoders import jsonable_encoder

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class ProductSpec:
    """Structured product record extracted from a catalog page."""
    name: str
    description: Optional[str] = None
    price: Optional[float] = None
    # Use default_factory so each instance gets its own dict/list
    attributes: Dict[str, str] = field(default_factory=dict)
    tables: List[Dict] = field(default_factory=list)

    def to_dict(self):
        return jsonable_encoder(self)


class PDFProcessor:
    def __init__(self):
        self.mineru = Mineru()
        self.emb_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Initialize quantized LLM (using deepseek-1.3b)
        self.llm = Llama(
            model_path="models/deepseek-1.3b-q5_k_m.gguf",
            n_ctx=2048,
            n_threads=max(1, (os.cpu_count() or 2) - 1),
            n_gpu_layers=35 if os.getenv('USE_GPU') else 0
        )

    def extract_layout(self, pdf_path: str) -> Layout:
        """Extract structured layout using MinerU."""
        return self.mineru.process_pdf(pdf_path)

    def process_tables(self, tables: List[Table]) -> List[Dict]:
        """Convert MinerU tables to a structured format."""
        return [{
            "page": table.page_number,
            "cells": table.cells,
            "headers": table.headers,
            "content": table.content
        } for table in tables]

    def generate_query_prompt(self, text: str) -> str:
        """Create the extraction prompt for a text block."""
        return f"""Extract product specifications from this text:

{text}

Return JSON format:
{{
  "name": "product name",
  "description": "product description",
  "price": numeric_price,
  "attributes": {{"key": "value"}}
}}"""

    def parse_response(self, response: str) -> Optional[ProductSpec]:
        """Robust JSON parsing with fallbacks."""
        try:
            # Trim any leading/trailing chatter around the JSON object
            json_start = response.find('{')
            json_end = response.rfind('}') + 1
            data = json.loads(response[json_start:json_end])
            return ProductSpec(
                name=data.get('name', ''),
                description=data.get('description'),
                price=data.get('price'),
                attributes=data.get('attributes', {})
            )
        except (json.JSONDecodeError, KeyError) as e:
            logger.warning(f"Parse error: {e}")
            return None

    def process_pdf(self, pdf_path: str) -> Dict:
        """Main processing pipeline."""
        start_time = time.time()

        # Extract structured content
        layout = self.extract_layout(pdf_path)
        tables = self.process_tables(layout.tables)

        # Process text blocks
        products = []
        for block in layout.text_blocks:
            prompt = self.generate_query_prompt(block.text)

            # Generate response with hardware optimization
            response = self.llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=512
            )

            if product := self.parse_response(response['choices'][0]['message']['content']):
                product.tables = tables
                products.append(product.to_dict())

        logger.info(f"Processed {len(products)} products in {time.time()-start_time:.2f}s")
        return {"products": products, "tables": tables}


def process_pdf_catalog(pdf_path: str):
    processor = PDFProcessor()
    try:
        result = processor.process_pdf(pdf_path)
        return result, "Processing completed successfully!"
    except Exception as e:
        logger.error(f"Processing failed: {e}")
        return {}, "Error processing PDF"
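

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. Assumptions:
    # a "catalog.pdf" file sits next to this script, and the GGUF model path
    # configured in PDFProcessor.__init__ exists on disk.
    result, status = process_pdf_catalog("catalog.pdf")
    print(status)
    # Preview the first part of the extracted products as pretty-printed JSON
    print(json.dumps(result, indent=2)[:2000])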