import os
import json
import time
import logging
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass, field

from mineru import Mineru, Layout, Table
from sentence_transformers import SentenceTransformer
from llama_cpp import Llama
from fastapi.encoders import jsonable_encoder

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class ProductSpec:
    name: str
    description: Optional[str] = None
    price: Optional[float] = None
    attributes: Dict[str, str] = field(default_factory=dict)
    tables: List[Dict] = field(default_factory=list)

    def to_dict(self):
        return jsonable_encoder(self)

class PDFProcessor:
    def __init__(self):
        self.mineru = Mineru()
        self.emb_model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Initialize quantized LLM (using deepseek-1.3b)
        self.llm = Llama(
            model_path="models/deepseek-1.3b-q5_k_m.gguf",
            n_ctx=2048,
            n_threads=max(1, (os.cpu_count() or 2) - 1),
            n_gpu_layers=35 if os.getenv('USE_GPU') else 0
        )
        
    def extract_layout(self, pdf_path: str) -> Layout:
        """Extract structured layout using MinerU"""
        return self.mineru.process_pdf(pdf_path)

    def process_tables(self, tables: List[Table]) -> List[Dict]:
        """Convert MinerU tables to structured format"""
        return [{
            "page": table.page_number,
            "cells": table.cells,
            "header": table.headers,
            "content": table.content
        } for table in tables]

    def generate_query_prompt(self, text: str) -> str:
        """Create optimized extraction prompt"""
        return f"""Extract product specifications from this text:
{text}

Return JSON format:
{{
    "name": "product name",
    "description": "product description",
    "price": numeric_price,
    "attributes": {{ "key": "value" }}
}}"""

    def parse_response(self, response: str) -> Optional[ProductSpec]:
        """Robust JSON parsing with fallbacks"""
        try:
            json_start = response.find('{')
            json_end = response.rfind('}') + 1
            if json_start == -1 or json_end == 0:
                raise ValueError("no JSON object found in LLM response")
            data = json.loads(response[json_start:json_end])
            return ProductSpec(
                name=data.get('name', ''),
                description=data.get('description'),
                price=data.get('price'),
                attributes=data.get('attributes', {})
            )
        except (json.JSONDecodeError, KeyError, ValueError) as e:
            logger.warning(f"Parse error: {e}")
            return None

    def process_pdf(self, pdf_path: str) -> Dict:
        """Main processing pipeline"""
        start_time = time.time()
        
        # Extract structured content
        layout = self.extract_layout(pdf_path)
        tables = self.process_tables(layout.tables)
        
        # Process text blocks
        products = []
        for block in layout.text_blocks:
            prompt = self.generate_query_prompt(block.text)
            
            # Generate response with hardware optimization
            response = self.llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=512
            )
            
            if product := self.parse_response(response['choices'][0]['message']['content']):
                product.tables = tables
                products.append(product.to_dict())
        
        logger.info(f"Processed {len(products)} products in {time.time()-start_time:.2f}s")
        return {"products": products, "tables": tables}

def process_pdf_catalog(pdf_path: str):
    processor = PDFProcessor()
    try:
        result = processor.process_pdf(pdf_path)
        return result, "Processing completed successfully!"
    except Exception as e:
        logger.error(f"Processing failed: {e}")
        return {}, "Error processing PDF"