Spaces:

garage-lab
/

MCP_HTML2JSON

Building

abdo-Mansour committed 3 days ago

Commit

7a692a6

1 Parent(s): 6fcfebc

pain

Files changed (5) hide show

requirements.txt CHANGED Viewed

@@ -24,3 +24,4 @@ markdownify
 beautifulsoup4
 readabilipy
 docling

 beautifulsoup4
 readabilipy
 docling
+htmlrag

web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED Viewed

Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc CHANGED Viewed

Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ

web2json/ai_extractor.py CHANGED Viewed

@@ -240,7 +240,7 @@ class NvidiaLLMClient(LLMClient):
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
-            # top_p=self.top_p,
             max_tokens=self.max_tokens
             # stream is omitted (defaults to False)
         )
@@ -380,6 +380,8 @@ class ModalRerankerClient(RerankerClient):
             Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
             for p, norm in filtered
         ]
         return docs
 class HFRerankerClient(LLMClient):
@@ -567,7 +569,7 @@ class LLMClassifierExtractor(AIExtractor):
         # print(f"Generated prompt for extraction: {prompt[:500]}...")
         llm_response = self.llm_client.call_api(prompt)
         # print(f"LLM response: {llm_response[:500]}...")
         return llm_response or "{}"

             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
+            top_p=self.top_p,
             max_tokens=self.max_tokens
             # stream is omitted (defaults to False)
         )
             Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
             for p, norm in filtered
         ]
         return docs
 class HFRerankerClient(LLMClient):
         # print(f"Generated prompt for extraction: {prompt[:500]}...")
         llm_response = self.llm_client.call_api(prompt)
         # print(f"LLM response: {llm_response[:500]}...")
         return llm_response or "{}"

web2json/preprocessor.py CHANGED Viewed

@@ -3,6 +3,7 @@ import requests
 from bs4 import BeautifulSoup , Comment
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
 class HTMLCleaner:
     DEFAULT_REMOVE_TAGS = [
@@ -212,7 +213,8 @@ class BasicPreprocessor(Preprocessor):
             'extra_remove_tags': ['header', 'footer']
         })
         clean = cleaner._clean_html(html_content=html_content)
         return clean.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace

 from bs4 import BeautifulSoup , Comment
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
+from htmlrag import clean_html
 class HTMLCleaner:
     DEFAULT_REMOVE_TAGS = [
             'extra_remove_tags': ['header', 'footer']
         })
         clean = cleaner._clean_html(html_content=html_content)
+        clean = clean_html(clean)
+        # clean = clean_html(html_content)
         return clean.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace