Spaces:
Building
Building
Commit
·
7a692a6
1
Parent(s):
6fcfebc
pain
Browse files
requirements.txt
CHANGED
@@ -24,3 +24,4 @@ markdownify
|
|
24 |
beautifulsoup4
|
25 |
readabilipy
|
26 |
docling
|
|
|
|
24 |
beautifulsoup4
|
25 |
readabilipy
|
26 |
docling
|
27 |
+
htmlrag
|
web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ
|
|
web2json/__pycache__/preprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
|
|
web2json/ai_extractor.py
CHANGED
@@ -240,7 +240,7 @@ class NvidiaLLMClient(LLMClient):
|
|
240 |
model=self.model_name,
|
241 |
messages=[{"role": "user", "content": prompt}],
|
242 |
temperature=self.temperature,
|
243 |
-
|
244 |
max_tokens=self.max_tokens
|
245 |
# stream is omitted (defaults to False)
|
246 |
)
|
@@ -380,6 +380,8 @@ class ModalRerankerClient(RerankerClient):
|
|
380 |
Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
|
381 |
for p, norm in filtered
|
382 |
]
|
|
|
|
|
383 |
return docs
|
384 |
|
385 |
class HFRerankerClient(LLMClient):
|
@@ -567,7 +569,7 @@ class LLMClassifierExtractor(AIExtractor):
|
|
567 |
# print(f"Generated prompt for extraction: {prompt[:500]}...")
|
568 |
llm_response = self.llm_client.call_api(prompt)
|
569 |
# print(f"LLM response: {llm_response[:500]}...")
|
570 |
-
|
571 |
return llm_response or "{}"
|
572 |
|
573 |
|
|
|
240 |
model=self.model_name,
|
241 |
messages=[{"role": "user", "content": prompt}],
|
242 |
temperature=self.temperature,
|
243 |
+
top_p=self.top_p,
|
244 |
max_tokens=self.max_tokens
|
245 |
# stream is omitted (defaults to False)
|
246 |
)
|
|
|
380 |
Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
|
381 |
for p, norm in filtered
|
382 |
]
|
383 |
+
|
384 |
+
|
385 |
return docs
|
386 |
|
387 |
class HFRerankerClient(LLMClient):
|
|
|
569 |
# print(f"Generated prompt for extraction: {prompt[:500]}...")
|
570 |
llm_response = self.llm_client.call_api(prompt)
|
571 |
# print(f"LLM response: {llm_response[:500]}...")
|
572 |
+
|
573 |
return llm_response or "{}"
|
574 |
|
575 |
|
web2json/preprocessor.py
CHANGED
@@ -3,6 +3,7 @@ import requests
|
|
3 |
from bs4 import BeautifulSoup , Comment
|
4 |
from abc import ABC, abstractmethod
|
5 |
from typing import Any, Dict, Optional
|
|
|
6 |
|
7 |
class HTMLCleaner:
|
8 |
DEFAULT_REMOVE_TAGS = [
|
@@ -212,7 +213,8 @@ class BasicPreprocessor(Preprocessor):
|
|
212 |
'extra_remove_tags': ['header', 'footer']
|
213 |
})
|
214 |
clean = cleaner._clean_html(html_content=html_content)
|
215 |
-
|
|
|
216 |
return clean.strip() # Return the cleaned text content, stripped of leading/trailing whitespace
|
217 |
|
218 |
|
|
|
3 |
from bs4 import BeautifulSoup , Comment
|
4 |
from abc import ABC, abstractmethod
|
5 |
from typing import Any, Dict, Optional
|
6 |
+
from htmlrag import clean_html
|
7 |
|
8 |
class HTMLCleaner:
|
9 |
DEFAULT_REMOVE_TAGS = [
|
|
|
213 |
'extra_remove_tags': ['header', 'footer']
|
214 |
})
|
215 |
clean = cleaner._clean_html(html_content=html_content)
|
216 |
+
clean = clean_html(clean)
|
217 |
+
# clean = clean_html(html_content)
|
218 |
return clean.strip() # Return the cleaned text content, stripped of leading/trailing whitespace
|
219 |
|
220 |
|