import os
import json
import time
import logging
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass, field

from mineru import Mineru, Layout, Table
from sentence_transformers import SentenceTransformer
from llama_cpp import Llama
from fastapi.encoders import jsonable_encoder

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class ProductSpec:
    name: str
    description: Optional[str] = None
    price: Optional[float] = None
    attributes: Dict[str, str] = field(default_factory=dict)
    tables: List[Dict] = field(default_factory=list)

    def to_dict(self):
        return jsonable_encoder(self)

class PDFProcessor:
    def __init__(self):
        self.mineru = Mineru()
        self.emb_model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Initialize quantized LLM (using deepseek-1.3b)
        self.llm = Llama(
            model_path="models/deepseek-1.3b-q5_k_m.gguf",
            n_ctx=2048,
            n_threads=max(1, (os.cpu_count() or 2) - 1),
            n_gpu_layers=35 if os.getenv('USE_GPU') else 0
        )
        
    def extract_layout(self, pdf_path: str) -> Layout:
        """Extract structured layout using MinerU"""
        return self.mineru.process_pdf(pdf_path)

    def process_tables(self, tables: List[Table]) -> List[Dict]:
        """Convert MinerU tables to structured format"""
        return [{
            "page": table.page_number,
            "cells": table.cells,
            "header": table.headers,
            "content": table.content
        } for table in tables]

    def generate_query_prompt(self, text: str) -> str:
        """Create optimized extraction prompt"""
        return f"""Extract product specifications from this text:
{text}

Return JSON format:
{{
    "name": "product name",
    "description": "product description",
    "price": numeric_price,
    "attributes": {{ "key": "value" }}
}}"""

    def parse_response(self, response: str) -> Optional[ProductSpec]:
        """Robust JSON parsing with fallbacks"""
        try:
            json_start = response.find('{')
            json_end = response.rfind('}') + 1
            if json_start == -1 or json_end == 0:
                raise ValueError("no JSON object found in LLM response")
            data = json.loads(response[json_start:json_end])
            return ProductSpec(
                name=data.get('name', ''),
                description=data.get('description'),
                price=data.get('price'),
                attributes=data.get('attributes', {})
            )
        except (json.JSONDecodeError, KeyError, ValueError) as e:
            logger.warning(f"Parse error: {e}")
            return None

    def process_pdf(self, pdf_path: str) -> Dict:
        """Main processing pipeline"""
        start_time = time.time()
        
        # Extract structured content
        layout = self.extract_layout(pdf_path)
        tables = self.process_tables(layout.tables)
        
        # Process text blocks
        products = []
        for block in layout.text_blocks:
            prompt = self.generate_query_prompt(block.text)
            
            # Generate response with hardware optimization
            response = self.llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=512
            )
            
            if product := self.parse_response(response['choices'][0]['message']['content']):
                product.tables = tables
                products.append(product.to_dict())
        
        logger.info(f"Processed {len(products)} products in {time.time()-start_time:.2f}s")
        return {"products": products, "tables": tables}

def process_pdf_catalog(pdf_path: str):
    processor = PDFProcessor()
    try:
        result = processor.process_pdf(pdf_path)
        return result, "Processing completed successfully!"
    except Exception as e:
        logger.error(f"Processing failed: {e}")
        return {}, "Error processing PDF"