abdo-Mansour committed on
Commit
7924dcb
·
2 Parent(s): 98d5f67 b4fe9b6
app.py CHANGED
@@ -3,15 +3,32 @@ import pandas as pd
3
  import gradio as gr
4
  from typing import Dict, Any, Type
5
  from web2json.preprocessor import BasicPreprocessor
6
- from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
7
  from web2json.postprocessor import PostProcessor
8
  from web2json.pipeline import Pipeline
9
  from pydantic import BaseModel, Field, create_model
10
  import os
11
  import dotenv
 
 
 
12
 
13
  dotenv.load_dotenv()
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def parse_schema_input(schema_input: str) -> Type[BaseModel]:
16
  """
17
  Convert user schema input to a Pydantic BaseModel.
@@ -170,66 +187,19 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
170
  - Preserve the original formatting and context where relevant
171
  - Return the extracted data in the format specified by the schema"""
172
 
173
- classification_prompt_template = """
174
- # HTML Chunk Relevance Classification Prompt
175
-
176
- You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
177
-
178
- ## Instructions:
179
- 1. Carefully examine the provided HTML chunk
180
- 2. Compare it against the given schema/criteria
181
- 3. Determine if the HTML chunk contains content that matches or is relevant to the schema
182
- 4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
183
-
184
- ## Input Format:
185
- **Schema/Criteria:**
186
- {schema}
187
-
188
- **HTML Chunk:**
189
- ```html
190
- {content}
191
- ```
192
-
193
- ## Output Format:
194
- Your response must be ONLY a valid JSON object with no additional text:
195
-
196
- ```json
197
- {{
198
- "relevant": 1
199
- }}
200
- ```
201
-
202
- OR
203
-
204
- ```json
205
- {{
206
- "relevant": 0
207
- }}
208
- ```
209
-
210
- ## Classification Rules:
211
- - Output 1 if the HTML chunk contains content that matches the schema criteria
212
- - Output 0 if the HTML chunk does not contain relevant content
213
- - Consider semantic meaning, not just exact keyword matches
214
- - Look at text content, attributes, structure, and context
215
- - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
216
- - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
217
- - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
218
- - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
219
-
220
- CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
221
- """
222
  # Initialize pipeline components
223
  # TODO: improve the RAG system and optimize (don't instantiate every time)
224
  preprocessor = BasicPreprocessor(config={'keep_tags': True})
225
  try:
226
  # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
227
  llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
 
228
  except Exception as e:
229
  return {"error": f"Failed to initialize LLM client: {str(e)}"}
230
 
231
  # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
232
- ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
233
  postprocessor = PostProcessor()
234
  pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
235
 
 
3
  import gradio as gr
4
  from typing import Dict, Any, Type
5
  from web2json.preprocessor import BasicPreprocessor
6
+ from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient
7
  from web2json.postprocessor import PostProcessor
8
  from web2json.pipeline import Pipeline
9
  from pydantic import BaseModel, Field, create_model
10
  import os
11
  import dotenv
12
+ import random
13
+ import numpy as np
14
+ import torch
15
 
16
  dotenv.load_dotenv()
17
 
18
+ def seed_everything(seed=42):
19
+ random.seed(seed)
20
+ np.random.seed(seed)
21
+ torch.manual_seed(seed)
22
+
23
+ if torch.cuda.is_available():
24
+ torch.cuda.manual_seed(seed)
25
+ torch.cuda.manual_seed_all(seed) # if using multi-GPU
26
+
27
+ torch.backends.cudnn.deterministic = True
28
+ torch.backends.cudnn.benchmark = False
29
+
30
+ seed_everything(22)
31
+
32
  def parse_schema_input(schema_input: str) -> Type[BaseModel]:
33
  """
34
  Convert user schema input to a Pydantic BaseModel.
 
187
  - Preserve the original formatting and context where relevant
188
  - Return the extracted data in the format specified by the schema"""
189
 
190
+ classification_prompt_template = schema.model_json_schema()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  # Initialize pipeline components
192
  # TODO: improve the RAG system and optimize (don't instantiate every time)
193
  preprocessor = BasicPreprocessor(config={'keep_tags': True})
194
  try:
195
  # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
196
  llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
197
+ reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})
198
  except Exception as e:
199
  return {"error": f"Failed to initialize LLM client: {str(e)}"}
200
 
201
  # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
202
+ ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
203
  postprocessor = PostProcessor()
204
  pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
205
 
test.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ
 
web2json/__pycache__/pipeline.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ
 
web2json/__pycache__/postprocessor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/postprocessor.cpython-311.pyc and b/web2json/__pycache__/postprocessor.cpython-311.pyc differ
 
web2json/__pycache__/preprocessor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
 
web2json/ai_extractor.py CHANGED
@@ -232,6 +232,7 @@ class NvidiaLLMClient(LLMClient):
232
  Returns:
233
  str: The generated text from the NVIDIA API.
234
  """
 
235
  response = self.client.chat.completions.create(
236
  model=self.model_name,
237
  messages=[{"role": "user", "content": prompt}],
@@ -286,50 +287,38 @@ class NvidiaRerankerClient(RerankerClient):
286
  self.model_name = model_name
287
 
288
  @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
289
- def rerank(self, query: str, passages: List[str], top_k: int = 3 , threshold: float = 0.5) -> List[Document]:
290
  # 1. Prepare and send documents for scoring
291
  docs = [Document(page_content=p) for p in passages]
292
- # print("Bonjour")
293
- # print(type(docs),docs)
294
- # print(type(query),query)
295
  scored_docs = self.client.compress_documents(
296
  query=str(query),
297
  documents=docs
298
  )
299
- # print(f"Scored Docs {scored_docs}")
300
- # 2. Extract raw scores
301
  raw_scores = np.array([doc.metadata['relevance_score'] for doc in scored_docs], dtype=float)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
- # 3. Softmax normalization
304
- exp_scores = np.exp(raw_scores - np.max(raw_scores))
305
- softmax_scores = exp_scores / exp_scores.sum()
306
 
307
- # 4. (Optional) Min–Max rescale of the softmax outputs
308
- min_val, max_val = raw_scores.min(), raw_scores.max()
309
- if max_val > min_val:
310
- minmax_scores = (raw_scores - min_val) / (max_val - min_val)
311
- else:
312
- # all scores equal → set them all to 1
313
- minmax_scores = np.ones_like(raw_scores)
314
-
315
- # 5. Attach new scores back to metadata
316
- for doc, s, mm in zip(scored_docs, softmax_scores, minmax_scores):
317
- doc.metadata['softmax_score'] = float(s)
318
- doc.metadata['minmax_score'] = float(mm)
319
-
320
- # 6. Sort and return top_k by softmax_score
321
- # Sort by softmax_score descending
322
- sorted_docs = sorted(
323
- scored_docs,
324
- key=lambda d: d.metadata['softmax_score'],
325
- reverse=True
326
- )
327
- # print("Ayeeeee")
328
- # print("Docs Value:",sorted_docs)
329
- # Filter by threshold
330
- filtered_docs = [doc for doc in sorted_docs if doc.metadata['minmax_score'] >= threshold]
331
- # print("Final", filtered_docs)
332
- return filtered_docs
333
 
334
 
335
  # TODO: will I need it ?
@@ -353,32 +342,56 @@ class HFRerankerClient(LLMClient):
353
  self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
354
  self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
355
 
356
- def rerank(self, query: str, passages: List[str], top_k: int = 3) -> List[str]:
357
  """
358
- Rerank passages based on relevance to query.
359
 
360
  Args:
361
  query (str): Query string.
362
  passages (List[str]): List of passages.
363
  top_k (int): Number of top passages to return.
 
364
 
365
  Returns:
366
- List[str]: Top-k most relevant passages.
367
  """
368
- inputs = [self.tokenizer(f"{query} [SEP] {p}", return_tensors="pt", truncation=True, padding=True).to(self.device) for p in passages]
 
 
 
369
  scores = []
370
 
371
  with torch.no_grad():
372
  for inp in inputs:
373
  logits = self.model(**inp).logits
 
374
  score = torch.softmax(logits, dim=1)[0, 1].item() # probability of relevance
375
- scores.append(score)
 
 
 
 
 
 
 
 
 
 
 
376
 
377
- # print(f"Scores for passages: {scores}")
 
 
 
 
 
 
 
 
 
 
 
378
 
379
- top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
380
- print(f"top indices: {top_indices}")
381
- return [passages[i] for i in top_indices]
382
 
383
  @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
384
  def call_api(self, prompt: str) -> str:
@@ -457,6 +470,7 @@ class LLMClassifierExtractor(AIExtractor):
457
  # print("Using Hugging Face reranker for classification.")
458
  return self.reranker.rerank(query, passages, top_k=top_k)
459
  response = self.reranker.rerank(query,passages)
 
460
  # print("DONNNNE")
461
  # NVIDIA reranker path
462
  return response
@@ -476,7 +490,8 @@ class LLMClassifierExtractor(AIExtractor):
476
  # print(f"Content successfully chunked: {chunks}")
477
  classified_chunks = self.classify_chunks(chunks, hf=hf) # conditional reranker
478
  # extracting the content
479
- classified_chunks = [chunk.page_content for chunk in classified_chunks]
 
480
  # print(f"Classified Chunks {len(classified_chunks)}")
481
  # print(classified_chunks)
482
  # print('='*80)
 
232
  Returns:
233
  str: The generated text from the NVIDIA API.
234
  """
235
+ print("prompt: ", prompt)
236
  response = self.client.chat.completions.create(
237
  model=self.model_name,
238
  messages=[{"role": "user", "content": prompt}],
 
287
  self.model_name = model_name
288
 
289
  @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
290
+ def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[Document]:
291
  # 1. Prepare and send documents for scoring
292
  docs = [Document(page_content=p) for p in passages]
 
 
 
293
  scored_docs = self.client.compress_documents(
294
  query=str(query),
295
  documents=docs
296
  )
297
+
298
+ # 2. Extract raw scores and compute sigmoid probabilities
299
  raw_scores = np.array([doc.metadata['relevance_score'] for doc in scored_docs], dtype=float)
300
+ print(f"raw scores {raw_scores}")
301
+ p_scores = 1 / (1 + np.exp(-raw_scores))
302
+ print(f"Sigmoid scores: {p_scores}")
303
+
304
+ # 3. Min-max normalization
305
+ min_score = np.min(p_scores)
306
+ max_score = np.max(p_scores)
307
+ if max_score == min_score:
308
+ norm_scores = np.ones_like(p_scores) # All values same — normalize to 1
309
+ else:
310
+ norm_scores = (p_scores - min_score) / (max_score - min_score)
311
+ print(f"Normalized scores: {norm_scores}")
312
+
313
+ # 4. Filter by threshold using normalized scores
314
+ scored_pairs = [(doc, norm) for doc, norm in zip(scored_docs, norm_scores) if norm > threshold]
315
+ print(f"Filtered pairs:\n{scored_pairs}")
316
+
317
+ # 5. Return top_k documents (already sorted by model, no need to re-sort)
318
+ top_docs = [doc.page_content for doc, _ in scored_pairs]
319
+ return top_docs
320
 
 
 
 
321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
 
324
  # TODO: will I need it ?
 
342
  self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
343
  self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
344
 
345
+ def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[str]:
346
  """
347
+ Rerank passages based on relevance to query using min-max normalized scores.
348
 
349
  Args:
350
  query (str): Query string.
351
  passages (List[str]): List of passages.
352
  top_k (int): Number of top passages to return.
353
+ threshold (float): Minimum normalized score to include passage.
354
 
355
  Returns:
356
+ List[str]: Top-k most relevant passages above threshold.
357
  """
358
+ inputs = [
359
+ self.tokenizer(f"{query} [SEP] {p}", return_tensors="pt", truncation=True, padding=True).to(self.device)
360
+ for p in passages
361
+ ]
362
  scores = []
363
 
364
  with torch.no_grad():
365
  for inp in inputs:
366
  logits = self.model(**inp).logits
367
+ # print("logits:", logits)
368
  score = torch.softmax(logits, dim=1)[0, 1].item() # probability of relevance
369
+ scores.append(score)
370
+
371
+ print(f"Softmax Scores: {scores}")
372
+
373
+ # Min-max normalize the scores
374
+ scores_np = np.array(scores)
375
+ min_score = scores_np.min()
376
+ max_score = scores_np.max()
377
+ if max_score == min_score:
378
+ norm_scores = np.ones_like(scores_np)
379
+ else:
380
+ norm_scores = (scores_np - min_score) / (max_score - min_score)
381
 
382
+ print(f"Normalized Scores: {norm_scores}")
383
+ # Filter based on normalized threshold
384
+ filtered = [(i, s) for i, s in enumerate(norm_scores) if s > threshold]
385
+ print(f"Filtered: {filtered}")
386
+
387
+ # Sort by normalized score descending
388
+ filtered.sort(key=lambda x: x[1], reverse=True)
389
+
390
+ # Select top_k passages
391
+ top_passages = [passages[i] for i, _ in filtered]
392
+
393
+ return top_passages
394
 
 
 
 
395
 
396
  @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
397
  def call_api(self, prompt: str) -> str:
 
470
  # print("Using Hugging Face reranker for classification.")
471
  return self.reranker.rerank(query, passages, top_k=top_k)
472
  response = self.reranker.rerank(query,passages)
473
+ print(f"response: {response}")
474
  # print("DONNNNE")
475
  # NVIDIA reranker path
476
  return response
 
490
  # print(f"Content successfully chunked: {chunks}")
491
  classified_chunks = self.classify_chunks(chunks, hf=hf) # conditional reranker
492
  # extracting the content
493
+
494
+ # classified_chunks = [chunk.page_content for chunk in classified_chunks]
495
  # print(f"Classified Chunks {len(classified_chunks)}")
496
  # print(classified_chunks)
497
  # print('='*80)