abdo-Mansour committed on
Commit
7a692a6
·
1 Parent(s): 6fcfebc
requirements.txt CHANGED
@@ -24,3 +24,4 @@ markdownify
24
  beautifulsoup4
25
  readabilipy
26
  docling
 
 
24
  beautifulsoup4
25
  readabilipy
26
  docling
27
+ htmlrag
web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ
 
web2json/__pycache__/preprocessor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
 
web2json/ai_extractor.py CHANGED
@@ -240,7 +240,7 @@ class NvidiaLLMClient(LLMClient):
240
  model=self.model_name,
241
  messages=[{"role": "user", "content": prompt}],
242
  temperature=self.temperature,
243
- # top_p=self.top_p,
244
  max_tokens=self.max_tokens
245
  # stream is omitted (defaults to False)
246
  )
@@ -380,6 +380,8 @@ class ModalRerankerClient(RerankerClient):
380
  Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
381
  for p, norm in filtered
382
  ]
 
 
383
  return docs
384
 
385
  class HFRerankerClient(LLMClient):
@@ -567,7 +569,7 @@ class LLMClassifierExtractor(AIExtractor):
567
  # print(f"Generated prompt for extraction: {prompt[:500]}...")
568
  llm_response = self.llm_client.call_api(prompt)
569
  # print(f"LLM response: {llm_response[:500]}...")
570
-
571
  return llm_response or "{}"
572
 
573
 
 
240
  model=self.model_name,
241
  messages=[{"role": "user", "content": prompt}],
242
  temperature=self.temperature,
243
+ top_p=self.top_p,
244
  max_tokens=self.max_tokens
245
  # stream is omitted (defaults to False)
246
  )
 
380
  Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
381
  for p, norm in filtered
382
  ]
383
+
384
+
385
  return docs
386
 
387
  class HFRerankerClient(LLMClient):
 
569
  # print(f"Generated prompt for extraction: {prompt[:500]}...")
570
  llm_response = self.llm_client.call_api(prompt)
571
  # print(f"LLM response: {llm_response[:500]}...")
572
+
573
  return llm_response or "{}"
574
 
575
 
web2json/preprocessor.py CHANGED
@@ -3,6 +3,7 @@ import requests
3
  from bs4 import BeautifulSoup , Comment
4
  from abc import ABC, abstractmethod
5
  from typing import Any, Dict, Optional
 
6
 
7
  class HTMLCleaner:
8
  DEFAULT_REMOVE_TAGS = [
@@ -212,7 +213,8 @@ class BasicPreprocessor(Preprocessor):
212
  'extra_remove_tags': ['header', 'footer']
213
  })
214
  clean = cleaner._clean_html(html_content=html_content)
215
-
 
216
  return clean.strip() # Return the cleaned text content, stripped of leading/trailing whitespace
217
 
218
 
 
3
  from bs4 import BeautifulSoup , Comment
4
  from abc import ABC, abstractmethod
5
  from typing import Any, Dict, Optional
6
+ from htmlrag import clean_html
7
 
8
  class HTMLCleaner:
9
  DEFAULT_REMOVE_TAGS = [
 
213
  'extra_remove_tags': ['header', 'footer']
214
  })
215
  clean = cleaner._clean_html(html_content=html_content)
216
+ clean = clean_html(clean)
217
+ # clean = clean_html(html_content)
218
  return clean.strip() # Return the cleaned text content, stripped of leading/trailing whitespace
219
 
220