eh
app.py
CHANGED
@@ -3,15 +3,32 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
+from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
 import os
 import dotenv
+import random
+import numpy as np
+import torch

 dotenv.load_dotenv()

+def seed_everything(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed) # if using multi-GPU
+
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+seed_everything(22)
+
 def parse_schema_input(schema_input: str) -> Type[BaseModel]:
     """
     Convert user schema input to a Pydantic BaseModel.
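The `seed_everything` helper added above is the usual PyTorch reproducibility recipe: it seeds Python's, NumPy's and torch's RNGs, forces cuDNN into deterministic mode, and is invoked once at import time with `seed_everything(22)`. A tiny sanity check of the idea (illustrative only, not part of the commit):

```python
import torch

# Re-seeding with the same value reproduces the same random draws,
# which is what pinning the seed at import time buys the Space.
torch.manual_seed(22)
a = torch.rand(3)
torch.manual_seed(22)
b = torch.rand(3)
assert torch.equal(a, b)
```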
@@ -170,66 +187,19 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     - Preserve the original formatting and context where relevant
     - Return the extracted data in the format specified by the schema"""

-    classification_prompt_template = """
-    # HTML Chunk Relevance Classification Prompt
-
-    You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
-
-    ## Instructions:
-    1. Carefully examine the provided HTML chunk
-    2. Compare it against the given schema/criteria
-    3. Determine if the HTML chunk contains content that matches or is relevant to the schema
-    4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
-
-    ## Input Format:
-    **Schema/Criteria:**
-    {schema}
-
-    **HTML Chunk:**
-    ```html
-    {content}
-    ```
-
-    ## Output Format:
-    Your response must be ONLY a valid JSON object with no additional text:
-
-    ```json
-    {{
-        "relevant": 1
-    }}
-    ```
-
-    OR
-
-    ```json
-    {{
-        "relevant": 0
-    }}
-    ```
-
-    ## Classification Rules:
-    - Output 1 if the HTML chunk contains content that matches the schema criteria
-    - Output 0 if the HTML chunk does not contain relevant content
-    - Consider semantic meaning, not just exact keyword matches
-    - Look at text content, attributes, structure, and context
-    - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
-    - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
-    - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
-    - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
-
-    CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
-    """
+    classification_prompt_template = schema.model_json_schema()
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
     preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
+        reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}

     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
+    ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

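The hand-written relevance-classification prompt is dropped entirely; the chunk classifier is now driven by the Pydantic JSON schema itself, since `schema.model_json_schema()` becomes the `classifier_prompt` and, downstream, the reranker query. A small sketch of what that object looks like, using a hypothetical `Product` model in place of whatever `parse_schema_input` builds:

```python
import json
from pydantic import BaseModel, Field

# Hypothetical schema; stands in for the model created by parse_schema_input().
class Product(BaseModel):
    name: str = Field(description="Product name")
    price: float = Field(description="Price in USD")

query = Product.model_json_schema()  # plain dict with 'properties', 'required', ...
print(json.dumps(query, indent=2))
```

Note that `model_json_schema()` returns a dict, not a string; the NVIDIA rerank path later stringifies it with `str(query)` before scoring chunks against it.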
test.ipynb
DELETED
The diff for this file is too large to render.

web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/postprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/postprocessor.cpython-311.pyc and b/web2json/__pycache__/postprocessor.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ

web2json/ai_extractor.py
CHANGED
@@ -232,6 +232,7 @@ class NvidiaLLMClient(LLMClient):
         Returns:
             str: The generated text from the NVIDIA API.
         """
+        print("prompt: ", prompt)
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
@@ -286,50 +287,38 @@ class NvidiaRerankerClient(RerankerClient):
         self.model_name = model_name

     @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
-    def rerank(self, query: str, passages: List[str], top_k: int = 3
+    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[Document]:
         # 1. Prepare and send documents for scoring
         docs = [Document(page_content=p) for p in passages]
-        # print("Bonjour")
-        # print(type(docs),docs)
-        # print(type(query),query)
         scored_docs = self.client.compress_documents(
             query=str(query),
             documents=docs
         )
-
-        # 2. Extract raw scores
+
+        # 2. Extract raw scores and compute sigmoid probabilities
         raw_scores = np.array([doc.metadata['relevance_score'] for doc in scored_docs], dtype=float)
+        print(f"raw scores {raw_scores}")
+        p_scores = 1 / (1 + np.exp(-raw_scores))
+        print(f"Sigmoid scores: {p_scores}")
+
+        # 3. Min-max normalization
+        min_score = np.min(p_scores)
+        max_score = np.max(p_scores)
+        if max_score == min_score:
+            norm_scores = np.ones_like(p_scores) # All values same — normalize to 1
+        else:
+            norm_scores = (p_scores - min_score) / (max_score - min_score)
+        print(f"Normalized scores: {norm_scores}")
+
+        # 4. Filter by threshold using normalized scores
+        scored_pairs = [(doc, norm) for doc, norm in zip(scored_docs, norm_scores) if norm > threshold]
+        print(f"Filtered pairs:\n{scored_pairs}")
+
+        # 5. Return top_k documents (already sorted by model, no need to re-sort)
+        top_docs = [doc.page_content for doc, _ in scored_pairs]
+        return top_docs

-        # 3. Softmax normalization
-        exp_scores = np.exp(raw_scores - np.max(raw_scores))
-        softmax_scores = exp_scores / exp_scores.sum()

-        # 4. (Optional) Min–Max rescale of the softmax outputs
-        min_val, max_val = raw_scores.min(), raw_scores.max()
-        if max_val > min_val:
-            minmax_scores = (raw_scores - min_val) / (max_val - min_val)
-        else:
-            # all scores equal → set them all to 1
-            minmax_scores = np.ones_like(raw_scores)
-
-        # 5. Attach new scores back to metadata
-        for doc, s, mm in zip(scored_docs, softmax_scores, minmax_scores):
-            doc.metadata['softmax_score'] = float(s)
-            doc.metadata['minmax_score'] = float(mm)
-
-        # 6. Sort and return top_k by softmax_score
-        # Sort by softmax_score descending
-        sorted_docs = sorted(
-            scored_docs,
-            key=lambda d: d.metadata['softmax_score'],
-            reverse=True
-        )
-        # print("Ayeeeee")
-        # print("Docs Value:",sorted_docs)
-        # Filter by threshold
-        filtered_docs = [doc for doc in sorted_docs if doc.metadata['minmax_score'] >= threshold]
-        # print("Final", filtered_docs)
-        return filtered_docs


 # TODO: will I need it ?
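The rewritten `NvidiaRerankerClient.rerank` replaces the softmax/min-max metadata bookkeeping with a simpler scheme: squash the raw relevance scores through a sigmoid, min-max normalize them, keep only chunks whose normalized score clears `threshold`, and return their text (despite the `List[Document]` annotation it now returns plain strings, and `top_k` is effectively unused on this path). A standalone sketch of that scoring logic with made-up scores:

```python
import numpy as np

def filter_by_normalized_score(texts, raw_scores, threshold=0.5):
    # Sigmoid squashes raw reranker scores into (0, 1).
    p = 1.0 / (1.0 + np.exp(-np.asarray(raw_scores, dtype=float)))
    # Min-max normalization; if every score is identical, keep everything.
    lo, hi = p.min(), p.max()
    norm = np.ones_like(p) if hi == lo else (p - lo) / (hi - lo)
    # Keep texts above the threshold, preserving the reranker's ordering.
    return [t for t, n in zip(texts, norm) if n > threshold]

chunks = ["price block", "nav bar", "product title"]
print(filter_by_normalized_score(chunks, [2.1, -3.0, 1.4]))
# ['price block', 'product title']  (the lowest-scoring chunk is dropped)
```

One side effect of min-max scaling worth noting: whenever the scores are not all equal, the lowest-scoring chunk normalizes to exactly 0 and the highest to exactly 1, so for any threshold strictly between 0 and 1 at least one chunk is always dropped and at least one always kept.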
@@ -353,32 +342,56 @@ class HFRerankerClient(LLMClient):
         self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
         self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")

-    def rerank(self, query: str, passages: List[str], top_k: int = 3) -> List[str]:
+    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[str]:
         """
-        Rerank passages based on relevance to query.
+        Rerank passages based on relevance to query using min-max normalized scores.

         Args:
             query (str): Query string.
             passages (List[str]): List of passages.
             top_k (int): Number of top passages to return.
+            threshold (float): Minimum normalized score to include passage.

         Returns:
-            List[str]: Top-k most relevant passages.
+            List[str]: Top-k most relevant passages above threshold.
         """
-        inputs = [
+        inputs = [
+            self.tokenizer(f"{query} [SEP] {p}", return_tensors="pt", truncation=True, padding=True).to(self.device)
+            for p in passages
+        ]
         scores = []

         with torch.no_grad():
             for inp in inputs:
                 logits = self.model(**inp).logits
+                # print("logits:", logits)
                 score = torch.softmax(logits, dim=1)[0, 1].item() # probability of relevance
-                scores.append(score)
+                scores.append(score)
+
+        print(f"Softmax Scores: {scores}")
+
+        # Min-max normalize the scores
+        scores_np = np.array(scores)
+        min_score = scores_np.min()
+        max_score = scores_np.max()
+        if max_score == min_score:
+            norm_scores = np.ones_like(scores_np)
+        else:
+            norm_scores = (scores_np - min_score) / (max_score - min_score)

-
+        print(f"Normalized Scores: {norm_scores}")
+        # Filter based on normalized threshold
+        filtered = [(i, s) for i, s in enumerate(norm_scores) if s > threshold]
+        print(f"Filtered: {filtered}")
+
+        # Sort by normalized score descending
+        filtered.sort(key=lambda x: x[1], reverse=True)
+
+        # Select top_k passages
+        top_passages = [passages[i] for i, _ in filtered]
+
+        return top_passages

-        top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
-        print(f"top indices: {top_indices}")
-        return [passages[i] for i in top_indices]

     @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
     def call_api(self, prompt: str) -> str:
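In the HF path each chunk is scored by packing `query [SEP] passage` into one sequence-classification input and taking the softmaxed probability of the positive class, i.e. `torch.softmax(logits, dim=1)[0, 1]`. A toy illustration of what that indexing computes (fabricated logits, assuming a two-label relevance head):

```python
import torch

# Toy 2-class logits for three (query, passage) pairs: [not relevant, relevant].
logits = torch.tensor([[ 0.2,  2.3],
                       [ 1.5, -0.7],
                       [-0.1,  0.9]])
probs = torch.softmax(logits, dim=1)[:, 1]  # probability of the "relevant" class
print(probs)  # tensor([0.8909, 0.0998, 0.7311])
```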
@@ -457,6 +470,7 @@ class LLMClassifierExtractor(AIExtractor):
             # print("Using Hugging Face reranker for classification.")
             return self.reranker.rerank(query, passages, top_k=top_k)
         response = self.reranker.rerank(query,passages)
+        print(f"response: {response}")
         # print("DONNNNE")
         # NVIDIA reranker path
         return response
@@ -476,7 +490,8 @@ class LLMClassifierExtractor(AIExtractor):
         # print(f"Content successfully chunked: {chunks}")
         classified_chunks = self.classify_chunks(chunks, hf=hf) # conditional reranker
         # extracting the content
-
+
+        # classified_chunks = [chunk.page_content for chunk in classified_chunks]
         # print(f"Classified Chunks {len(classified_chunks)}")
         # print(classified_chunks)
         # print('='*80)