minar09 committed on
Commit
0362a74
·
verified ·
1 Parent(s): 17345fb

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. main.py +63 -46
app.py CHANGED
@@ -41,7 +41,7 @@ demo = gr.Interface(
41
  examples=pdf_examples,
42
  title="Open Source PDF Catalog Parser",
43
  description="Efficient PDF catalog processing using fitz and OpenLLM",
44
- article="Uses MinerU for layout analysis and Llama-CPP for structured extraction"
45
  )
46
 
47
  if __name__ == "__main__":
 
41
  examples=pdf_examples,
42
  title="Open Source PDF Catalog Parser",
43
  description="Efficient PDF catalog processing using fitz and OpenLLM",
44
+ article="Uses PyMuPDF for layout analysis and Llama-CPP for structured extraction"
45
  )
46
 
47
  if __name__ == "__main__":
main.py CHANGED
@@ -5,10 +5,10 @@ import logging
5
  from pathlib import Path
6
  from typing import List, Dict, Optional
7
  from dataclasses import dataclass
 
8
  import fitz # PyMuPDF
9
  from sentence_transformers import SentenceTransformer
10
  from llama_cpp import Llama
11
- from fastapi.encoders import jsonable_encoder
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
@@ -29,34 +29,25 @@ class ProductSpec:
29
  class PDFProcessor:
30
  def __init__(self):
31
  self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
32
- # self.llm = self._initialize_llm("llama-2-7b.Q2_K.gguf")
33
  self.llm = self._initialize_llm("deepseek-llm-7b-base.Q2_K.gguf")
34
  self.output_dir = Path("./output")
35
  self.output_dir.mkdir(exist_ok=True)
36
 
37
  def _initialize_emb_model(self, model_name):
38
  try:
39
- from sentence_transformers import SentenceTransformer
40
  return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
41
- except:
42
- # Load model directly
43
  from transformers import AutoTokenizer, AutoModel
44
-
45
  tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/" + model_name)
46
  model = AutoModel.from_pretrained("sentence-transformers/" + model_name)
47
  return model
48
 
49
  def _initialize_llm(self, model_name):
50
  """Initialize LLM with automatic download if needed"""
51
- # model_path = os.path.join("models/", model_name)
52
- # if os.path.exists(model_path):
53
- # return Llama(
54
- # model_path=model_path,
55
- # n_ctx=1024,
56
- # n_gpu_layers=-1,
57
- # n_threads=os.cpu_count() - 1,
58
- # verbose=False
59
- # )
60
  return Llama.from_pretrained(
61
  repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
62
  filename=model_name,
@@ -67,43 +58,63 @@ class PDFProcessor:
67
  start_time = time.time()
68
 
69
  # Open PDF
70
- doc = fitz.open(pdf_path)
 
 
 
 
 
71
  text_blocks = []
72
  tables = []
73
 
74
- # Extract text and tables
75
  for page_num, page in enumerate(doc):
76
- # Extract text blocks
77
- text_blocks.extend(self._extract_text_blocks(page))
 
 
 
78
 
79
- # Extract tables
80
  tables.extend(self._extract_tables(page, page_num))
81
 
82
- # Process text blocks with LLM
83
  products = []
84
- for block in text_blocks:
 
 
85
  product = self._process_text_block(block)
86
  if product:
87
  product.tables = tables
88
- products.append(product.to_dict())
 
 
 
 
 
 
 
89
 
90
  logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
91
  return {"products": products, "tables": tables}
92
 
93
  def _extract_text_blocks(self, page) -> List[str]:
94
- """Extract text blocks from a PDF page"""
95
  blocks = []
96
  for block in page.get_text("blocks"):
97
- blocks.append(block[4]) # The text content is at index 4
 
 
 
98
  return blocks
99
 
100
  def _extract_tables(self, page, page_num: int) -> List[Dict]:
101
- """Extract tables from a PDF page"""
102
  tables = []
103
  try:
104
  tab = page.find_tables()
105
- if tab.tables:
106
- for table_idx, table in enumerate(tab.tables):
107
  table_data = table.extract()
108
  if table_data:
109
  tables.append({
@@ -113,51 +124,50 @@ class PDFProcessor:
113
  "content": table_data
114
  })
115
  except Exception as e:
116
- logger.warning(f"Error extracting tables from page {page_num}: {e}")
117
  return tables
118
 
119
  def _process_text_block(self, text: str) -> Optional[ProductSpec]:
120
- """Process text block with LLM"""
121
  prompt = self._generate_query_prompt(text)
122
-
123
  try:
124
  response = self.llm.create_chat_completion(
125
  messages=[{"role": "user", "content": prompt}],
126
  temperature=0.1,
127
  max_tokens=512
128
  )
 
 
129
  return self._parse_response(response['choices'][0]['message']['content'])
130
  except Exception as e:
131
  logger.warning(f"Error processing text block: {e}")
132
  return None
133
 
134
  def _generate_query_prompt(self, text: str) -> str:
135
- """Generate extraction prompt"""
136
- return f"""Extract product specifications from this text:
137
- {text}
138
-
139
- Return JSON format:
140
- {{
141
- "name": "product name",
142
- "description": "product description",
143
- "price": numeric_price,
144
- "attributes": {{ "key": "value" }}
145
- }}"""
146
 
147
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
148
- """Parse LLM response"""
149
  try:
150
  json_start = response.find('{')
151
  json_end = response.rfind('}') + 1
152
- data = json.loads(response[json_start:json_end])
 
 
 
 
 
 
153
  return ProductSpec(
154
  name=data.get('name', ''),
155
  description=data.get('description'),
156
  price=data.get('price'),
157
  attributes=data.get('attributes', {})
158
  )
159
- except (json.JSONDecodeError, KeyError) as e:
160
- logger.warning(f"Parse error: {e}")
161
  return None
162
 
163
 
@@ -169,3 +179,10 @@ def process_pdf_catalog(pdf_path: str):
169
  except Exception as e:
170
  logger.error(f"Processing failed: {e}")
171
  return {}, "Error processing PDF"
 
 
 
 
 
 
 
 
5
  from pathlib import Path
6
  from typing import List, Dict, Optional
7
  from dataclasses import dataclass
8
+ from fastapi.encoders import jsonable_encoder
9
  import fitz # PyMuPDF
10
  from sentence_transformers import SentenceTransformer
11
  from llama_cpp import Llama
 
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
 
29
  class PDFProcessor:
30
  def __init__(self):
31
  self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
32
+ # Choose the appropriate model filename below; adjust if needed.
33
  self.llm = self._initialize_llm("deepseek-llm-7b-base.Q2_K.gguf")
34
  self.output_dir = Path("./output")
35
  self.output_dir.mkdir(exist_ok=True)
36
 
37
  def _initialize_emb_model(self, model_name):
38
  try:
39
+ # Use SentenceTransformer if available
40
  return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
41
+ except Exception as e:
42
+ logger.warning(f"SentenceTransformer failed: {e}. Falling back to transformers model.")
43
  from transformers import AutoTokenizer, AutoModel
 
44
  tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/" + model_name)
45
  model = AutoModel.from_pretrained("sentence-transformers/" + model_name)
46
  return model
47
 
48
  def _initialize_llm(self, model_name):
49
  """Initialize LLM with automatic download if needed"""
50
+ # Here we use from_pretrained so that if the model is missing locally it downloads it.
 
 
 
 
 
 
 
 
51
  return Llama.from_pretrained(
52
  repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
53
  filename=model_name,
 
58
  start_time = time.time()
59
 
60
  # Open PDF
61
+ try:
62
+ doc = fitz.open(pdf_path)
63
+ except Exception as e:
64
+ logger.error(f"Failed to open PDF: {e}")
65
+ raise RuntimeError("Cannot open PDF file.") from e
66
+
67
  text_blocks = []
68
  tables = []
69
 
70
+ # Extract text and tables from each page
71
  for page_num, page in enumerate(doc):
72
+ # Extract text blocks from page and filter out very short blocks (noise)
73
+ blocks = self._extract_text_blocks(page)
74
+ filtered = [block for block in blocks if len(block.strip()) >= 10]
75
+ logger.debug(f"Page {page_num + 1}: Extracted {len(blocks)} blocks, {len(filtered)} kept after filtering.")
76
+ text_blocks.extend(filtered)
77
 
78
+ # Extract tables (if any)
79
  tables.extend(self._extract_tables(page, page_num))
80
 
81
+ # Process text blocks with LLM to extract product information
82
  products = []
83
+ for idx, block in enumerate(text_blocks):
84
+ # Log the text block for debugging
85
+ logger.debug(f"Processing text block {idx}: {block[:100]}...")
86
  product = self._process_text_block(block)
87
  if product:
88
  product.tables = tables
89
+ # Only add if at least one key (like name) is non-empty
90
+ if product.name or product.description or product.price or (
91
+ product.attributes and len(product.attributes) > 0):
92
+ products.append(product.to_dict())
93
+ else:
94
+ logger.debug(f"LLM returned empty product for block {idx}.")
95
+ else:
96
+ logger.debug(f"No product extracted from block {idx}.")
97
 
98
  logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
99
  return {"products": products, "tables": tables}
100
 
101
  def _extract_text_blocks(self, page) -> List[str]:
102
+ """Extract text blocks from a PDF page using PyMuPDF's blocks method."""
103
  blocks = []
104
  for block in page.get_text("blocks"):
105
+ # block[4] contains the text content
106
+ text = block[4].strip()
107
+ if text:
108
+ blocks.append(text)
109
  return blocks
110
 
111
  def _extract_tables(self, page, page_num: int) -> List[Dict]:
112
+ """Extract tables from a PDF page using PyMuPDF's table extraction (if available)."""
113
  tables = []
114
  try:
115
  tab = page.find_tables()
116
+ if tab and hasattr(tab, 'tables') and tab.tables:
117
+ for table in tab.tables:
118
  table_data = table.extract()
119
  if table_data:
120
  tables.append({
 
124
  "content": table_data
125
  })
126
  except Exception as e:
127
+ logger.warning(f"Error extracting tables from page {page_num + 1}: {e}")
128
  return tables
129
 
130
  def _process_text_block(self, text: str) -> Optional[ProductSpec]:
131
+ """Process a text block with LLM to extract product specifications."""
132
  prompt = self._generate_query_prompt(text)
133
+ logger.debug(f"Generated prompt: {prompt[:200]}...")
134
  try:
135
  response = self.llm.create_chat_completion(
136
  messages=[{"role": "user", "content": prompt}],
137
  temperature=0.1,
138
  max_tokens=512
139
  )
140
+ # Debug: log raw response
141
+ logger.debug(f"LLM raw response: {response}")
142
  return self._parse_response(response['choices'][0]['message']['content'])
143
  except Exception as e:
144
  logger.warning(f"Error processing text block: {e}")
145
  return None
146
 
147
  def _generate_query_prompt(self, text: str) -> str:
148
+ """Generate a prompt instructing the LLM to extract product information."""
149
+ return f"""Extract product specifications from the following text. If no product is found, return an empty JSON object with keys.\n\nText:\n{text}\n\nReturn JSON format exactly as:\n{{\n \"name\": \"product name\",\n \"description\": \"product description\",\n \"price\": numeric_price,\n \"attributes\": {{ \"key\": \"value\" }}\n}}"""
 
 
 
 
 
 
 
 
 
150
 
151
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
152
+ """Parse the LLM's response to extract a product specification."""
153
  try:
154
  json_start = response.find('{')
155
  json_end = response.rfind('}') + 1
156
+ json_str = response[json_start:json_end].strip()
157
+ if not json_str:
158
+ raise ValueError("No JSON content found in response.")
159
+ data = json.loads(json_str)
160
+ # If the returned JSON is essentially empty, return None
161
+ if all(not data.get(key) for key in ['name', 'description', 'price', 'attributes']):
162
+ return None
163
  return ProductSpec(
164
  name=data.get('name', ''),
165
  description=data.get('description'),
166
  price=data.get('price'),
167
  attributes=data.get('attributes', {})
168
  )
169
+ except (json.JSONDecodeError, KeyError, ValueError) as e:
170
+ logger.warning(f"Parse error: {e} in response: {response}")
171
  return None
172
 
173
 
 
179
  except Exception as e:
180
  logger.error(f"Processing failed: {e}")
181
  return {}, "Error processing PDF"
182
+
183
+
184
+ if __name__ == "__main__":
185
+ # Example usage: change this if you call process_pdf_catalog elsewhere
186
+ pdf_path = "path/to/your/pdf_file.pdf"
187
+ result, message = process_pdf_catalog(pdf_path)
188
+ print(result, message)