minar09 committed on
Commit
a639170
·
verified ·
1 Parent(s): 43355d2

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +117 -0
main.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import logging
import os
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Dict, List, Optional

from fastapi.encoders import jsonable_encoder
from llama_cpp import Llama
from mineru import Layout, Mineru, Table
from sentence_transformers import SentenceTransformer
13
+
14
# Configure root logging at import time so the pipeline's INFO-level progress
# messages are emitted, then create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
16
+
17
@dataclass
class ProductSpec:
    """Structured product record extracted from one block of catalog text.

    Only ``name`` is required. ``attributes`` and ``tables`` default to fresh
    empty containers, so every instance owns its own dict/list and consumers
    never have to special-case ``None`` for them.
    """

    name: str
    description: Optional[str] = None
    price: Optional[float] = None
    # Free-form key/value specifications for the product.
    attributes: Dict[str, str] = field(default_factory=dict)
    # Table dictionaries attached to the product by the caller.
    tables: List[Dict] = field(default_factory=list)

    def to_dict(self):
        """Return a JSON-compatible representation built by ``jsonable_encoder``."""
        return jsonable_encoder(self)
27
+
28
class PDFProcessor:
    """Extract product specifications from a PDF catalog.

    The pipeline asks MinerU for the document layout, converts its tables to
    plain dictionaries, and prompts a local quantized LLM (via llama.cpp) once
    per text block to produce a JSON product record.
    """

    # Values of the USE_GPU environment variable that mean "GPU disabled".
    _DISABLED_ENV_VALUES = frozenset({"", "0", "false", "no", "off"})

    def __init__(
        self,
        model_path: str = "models/deepseek-1.3b-q5_k_m.gguf",
        n_ctx: int = 2048,
    ):
        """Load the layout extractor, the embedding model and the LLM.

        Args:
            model_path: Path to the GGUF model file handed to ``Llama``.
            n_ctx: Context window size handed to ``Llama``.
        """
        self.mineru = Mineru()
        # NOTE(review): the embedding model is loaded here but no method of
        # this class uses it; kept because external code may read it.
        self.emb_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Initialize quantized LLM (deepseek-1.3b by default).
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_threads=self._thread_count(os.cpu_count()),
            n_gpu_layers=35 if self._gpu_requested(os.getenv('USE_GPU')) else 0,
        )

    @staticmethod
    def _thread_count(cpu_count: Optional[int]) -> int:
        """Return a worker-thread count that leaves one core free, minimum 1.

        ``os.cpu_count()`` may return ``None`` and may be 1; both cases would
        otherwise produce an invalid thread count.
        """
        return max(1, (cpu_count or 1) - 1)

    @classmethod
    def _gpu_requested(cls, value: Optional[str]) -> bool:
        """Interpret the USE_GPU environment value; "0"/"false"/"no"/"off" disable it."""
        if value is None:
            return False
        return value.strip().lower() not in cls._DISABLED_ENV_VALUES

    @staticmethod
    def _coerce_price(value) -> Optional[float]:
        """Return ``value`` as a number, or ``None`` when it is not numeric.

        Numbers pass through unchanged, numeric strings are converted with
        ``float``; booleans and anything else yield ``None``.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            try:
                return float(value.strip())
            except ValueError:
                return None
        return None

    def extract_layout(self, pdf_path: str) -> "Layout":
        """Extract structured layout using MinerU.

        NOTE(review): ``process_pdf`` reads ``.tables`` and ``.text_blocks``
        from the returned object, so it is treated as a single layout rather
        than a list -- confirm against the MinerU API.
        """
        return self.mineru.process_pdf(pdf_path)

    def process_tables(self, tables: List["Table"]) -> List[Dict]:
        """Convert MinerU tables to a list of plain dictionaries.

        Each dictionary carries the table's ``page_number``, ``cells``,
        ``headers`` and ``content`` under the keys ``page``, ``cells``,
        ``header`` and ``content``.
        """
        return [
            {
                "page": table.page_number,
                "cells": table.cells,
                "header": table.headers,
                "content": table.content,
            }
            for table in tables
        ]

    def generate_query_prompt(self, text: str) -> str:
        """Build the extraction prompt asking the LLM for a JSON product record."""
        # Assembled line by line so the whitespace sent to the model is
        # explicit and independent of source indentation.
        return (
            "Extract product specifications from this text:\n"
            f"{text}\n"
            "\n"
            "Return JSON format:\n"
            "{\n"
            '    "name": "product name",\n'
            '    "description": "product description",\n'
            '    "price": numeric_price,\n'
            '    "attributes": { "key": "value" }\n'
            "}"
        )

    def parse_response(self, response: Optional[str]) -> Optional["ProductSpec"]:
        """Parse the first-to-last-brace span of ``response`` into a ProductSpec.

        Returns:
            A ``ProductSpec``, or ``None`` (after logging a warning) when the
            response is empty, contains no JSON object, or the JSON is invalid.
        """
        if not isinstance(response, str) or not response:
            logger.warning("Parse error: empty response")
            return None

        # The model may wrap the JSON in prose; keep only the outermost braces.
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start == -1 or json_end <= json_start:
            logger.warning("Parse error: no JSON object found in response")
            return None

        try:
            data = json.loads(response[json_start:json_end])
        except json.JSONDecodeError as e:
            logger.warning(f"Parse error: {e}")
            return None
        if not isinstance(data, dict):
            logger.warning("Parse error: JSON payload is not an object")
            return None

        raw_price = data.get('price')
        price = self._coerce_price(raw_price)
        if raw_price is not None and price is None:
            logger.warning("Parse error: non-numeric price %r ignored", raw_price)

        # A JSON null or a non-object "attributes" value falls back to {}.
        attributes = data.get('attributes')
        if not isinstance(attributes, dict):
            attributes = {}

        return ProductSpec(
            name=data.get('name') or '',
            description=data.get('description'),
            price=price,
            attributes=attributes,
        )

    def process_pdf(self, pdf_path: str) -> Dict:
        """Run the full pipeline on one PDF.

        Returns:
            ``{"products": [...], "tables": [...]}`` where each product is the
            dictionary form of a ``ProductSpec`` with the document's tables
            attached, and ``tables`` is the output of ``process_tables``.
        """
        start_time = time.perf_counter()

        # Extract structured content
        layout = self.extract_layout(pdf_path)
        tables = self.process_tables(layout.tables)

        # Process text blocks
        products = []
        for block in layout.text_blocks:
            # An empty block cannot describe a product; skip the LLM call.
            if not block.text or not block.text.strip():
                continue

            prompt = self.generate_query_prompt(block.text)
            response = self.llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=512,
            )
            content = response['choices'][0]['message']['content']

            if product := self.parse_response(content):
                product.tables = tables
                products.append(product.to_dict())

        logger.info(
            "Processed %d products in %.2fs",
            len(products),
            time.perf_counter() - start_time,
        )
        return {"products": products, "tables": tables}
109
+
110
def process_pdf_catalog(pdf_path: str, processor: Optional["PDFProcessor"] = None):
    """Process a PDF catalog and return ``(result, status_message)``.

    Args:
        pdf_path: Path of the PDF to process.
        processor: Optional existing ``PDFProcessor`` to reuse. When omitted a
            new one is created, which loads all models again.

    Returns:
        ``(result_dict, "Processing completed successfully!")`` on success, or
        ``({}, "Error processing PDF")`` when construction or processing fails.
    """
    try:
        # Construction happens inside the try so that model-loading failures
        # are reported through the same error tuple as processing failures.
        if processor is None:
            processor = PDFProcessor()
        result = processor.process_pdf(pdf_path)
    except Exception as e:
        # Top-level boundary: log with traceback and report a generic status.
        logger.exception("Processing failed: %s", e)
        return {}, "Error processing PDF"
    return result, "Processing completed successfully!"