abdo-Mansour committed
Commit 251790a · 1 Parent(s): 7924dcb

a little suprise
app.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor, LLMClassifierExtractor, NvidiaLLMClient, NvidiaRerankerClient
+from web2json.ai_extractor import AIExtractor, LLMClassifierExtractor, NvidiaLLMClient, NvidiaRerankerClient, ModalRerankerClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
@@ -185,7 +185,10 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     - Follow the exact structure and data types specified in the schema
     - If a required field cannot be found, indicate this clearly
     - Preserve the original formatting and context where relevant
-    - Return the extracted data in the format specified by the schema"""
+    - Return the extracted data in the format specified by the schema
+    - Stick strictly to the schema; do not return anything outside of it
+    - If the schema asks for an array, extract one only when the content actually contains one
+    - Do not deviate from the schema under any circumstances"""
 
     classification_prompt_template = schema.model_json_schema()
     # Initialize pipeline components
@@ -194,7 +197,8 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     try:
         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'qwen/qwen2.5-7b-instruct'})
-        reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'nv-rerank-qa-mistral-4b:1'})
+        # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'nv-rerank-qa-mistral-4b:1'})
+        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-rerank.modal.run")
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}
 
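For context, a minimal sketch of how the newly wired reranker behaves on its own (the query and passages are made up for illustration, and it assumes the Modal endpoint above is reachable):

from web2json.ai_extractor import ModalRerankerClient

reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-rerank.modal.run")

# rerank() POSTs {"query", "passages"} to the endpoint, max-normalizes the returned
# scores, and keeps passages whose normalized score clears the threshold.
docs = reranker.rerank(
    query='{"title": "string", "price": "number"}',   # schema-style query, illustrative only
    passages=["<p>Price: $10</p>", "<footer>© 2024 Example Corp</footer>"],
    threshold=0.5,
)
print([doc.page_content for doc in docs])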
requirements.txt CHANGED
@@ -15,4 +15,12 @@ openai
 html_chunking
 langchain_nvidia_ai_endpoints
 langchain_core
-lxml
+lxml
+pdfkit
+html2text
+inscriptis
+trafilatura
+markdownify
+beautifulsoup4
+readabilipy
+docling
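If useful, a quick sanity check that the newly listed converter packages are importable once the updated requirements are installed (names are taken from the diff; beautifulsoup4 is imported as bs4):

# Smoke test for the extraction dependencies added in this commit.
import pdfkit, html2text, inscriptis, trafilatura, markdownify, readabilipy, docling
import bs4  # provided by the beautifulsoup4 package
print("new converter dependencies import cleanly")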
web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ
 
web2json/__pycache__/pipeline.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ
 
web2json/__pycache__/postprocessor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/postprocessor.cpython-311.pyc and b/web2json/__pycache__/postprocessor.cpython-311.pyc differ
 
web2json/__pycache__/preprocessor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
 
web2json/ai_extractor.py CHANGED
@@ -23,6 +23,9 @@ import requests
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 from typing import List, Dict
+from tenacity import retry, wait_exponential, stop_after_attempt
+import trafilatura
+
 
 class LLMClient(ABC):
     """
@@ -208,9 +211,9 @@ class NvidiaLLMClient(LLMClient):
 
         # Store generation settings with sensible defaults
         gen_conf = config.get("generation_config", {})
-        self.temperature = gen_conf.get("temperature", 0.1)
+        self.temperature = gen_conf.get("temperature", 0)
         self.top_p = gen_conf.get("top_p", 0.7)
-        self.max_tokens = gen_conf.get("max_tokens", 512)
+        self.max_tokens = gen_conf.get("max_tokens", 8192)
 
     def set_model(self, model_name: str):
         """
@@ -237,7 +240,7 @@
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
-            top_p=self.top_p,
+            # top_p=self.top_p,
             max_tokens=self.max_tokens
             # stream is omitted (defaults to False)
         )
@@ -301,13 +304,12 @@ class NvidiaRerankerClient(RerankerClient):
         p_scores = 1 / (1 + np.exp(-raw_scores))
         print(f"Sigmoid scores: {p_scores}")
 
-        # 3. Min-max normalization
-        min_score = np.min(p_scores)
+        # 3. Max normalization
         max_score = np.max(p_scores)
-        if max_score == min_score:
-            norm_scores = np.ones_like(p_scores)  # All values same — normalize to 1
+        if max_score == 0:
+            norm_scores = np.zeros_like(p_scores)
         else:
-            norm_scores = (p_scores - min_score) / (max_score - min_score)
+            norm_scores = p_scores / max_score
         print(f"Normalized scores: {norm_scores}")
 
         # 4. Filter by threshold using normalized scores
@@ -325,6 +327,60 @@
     # def call_batch(self, prompts, max_workers=8):
     #     pass
 
+def retry_on_error(fn):
+    """Simple retry decorator (exponential back-off, max 6 tries)."""
+    return retry(
+        wait=wait_exponential(multiplier=0.5, min=0.5, max=5),
+        stop=stop_after_attempt(6),
+        reraise=True,
+    )(fn)
+
+
+class ModalRerankerClient(RerankerClient):
+    """Client for the Modal Qwen3-Reranker endpoint (non-streaming)."""
+
+    def __init__(self, endpoint_url: str):
+        self.endpoint_url = endpoint_url.rstrip("/")  # ensure no trailing slash
+
+    def set_endpoint(self, url: str):
+        self.endpoint_url = url.rstrip("/")
+
+    @retry_on_error
+    def rerank(
+        self,
+        query: str,
+        passages: List[str],
+        threshold: float = 0.5,
+    ) -> List[Document]:
+        """Call the remote endpoint and return filtered passages."""
+        if not isinstance(query, str):
+            query = str(query)
+        payload = {"query": query, "passages": passages}
+        print(payload)
+        res = requests.post(self.endpoint_url, json=payload, timeout=60)
+        res.raise_for_status()
+        data = res.json()
+
+        # The endpoint already returns probabilities (0-1). Extract them.
+        ranked = data.get("ranked_passages", [])
+        # Extract scores
+        scores = np.array([p["score"] for p in ranked], dtype=float)
+        # Max normalization
+        max_score = scores.max() if len(scores) > 0 else 1.0
+        if max_score == 0:
+            norm_scores = np.zeros_like(scores)
+        else:
+            norm_scores = scores / max_score
+        # Filter by threshold using normalized scores
+        filtered = [
+            (p, norm) for p, norm in zip(ranked, norm_scores) if norm >= threshold
+        ]
+        # Convert to LangChain Documents
+        docs = [
+            Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
+            for p, norm in filtered
+        ]
+        return docs
 
 class HFRerankerClient(LLMClient):
     """
@@ -485,16 +541,22 @@ class LLMClassifierExtractor(AIExtractor):
             hf (bool): Whether to use the Hugging Face reranker or NVIDIA (default).
         """
         # print("TIME TO EXTRACT")
-        chunks = self.chunk_content(content, max_tokens=1000)
-        # print(f"Content successfully chunked into {len(chunks)}.")
+        chunks = self.chunk_content(content, max_tokens=500)
+        print(f"Content successfully chunked into {len(chunks)}.")
         # print(f"Content successfully chunked: {chunks}")
+        # chunks = [trafilatura.extract(chunk, favor_recall=True) for chunk in chunks]
+        # chunks = [chunk for chunk in chunks if chunk is not None]
         classified_chunks = self.classify_chunks(chunks, hf=hf)  # conditional reranker
         # extracting the content
 
-        # classified_chunks = [chunk.page_content for chunk in classified_chunks]
-        # print(f"Classified Chunks {len(classified_chunks)}")
+        if isinstance(classified_chunks[0], Document):
+            classified_chunks = [chunk.page_content for chunk in classified_chunks]
+        print(f"Classified Chunks {len(classified_chunks)}")
         # print(classified_chunks)
         # print('='*80)
+        # NOTE: More preprocessing
+        # classified_chunks = [trafilatura.extract(chunk, favor_recall=True) for chunk in classified_chunks]
+        # classified_chunks = [chunk for chunk in classified_chunks if chunk is not None]
        filtered_content = "\n\n".join(classified_chunks)
 
        if not filtered_content:
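The switch from min-max to max normalization also changes which chunks survive the 0.5 threshold: scores are now scaled relative to the top passage only, so nothing is forced down to 0. A standalone sketch of the new logic with made-up scores:

import numpy as np

def max_normalize(p_scores: np.ndarray) -> np.ndarray:
    # Mirrors the new reranker logic: scale by the top score instead of min-max.
    max_score = np.max(p_scores)
    return np.zeros_like(p_scores) if max_score == 0 else p_scores / max_score

scores = np.array([0.9, 0.5, 0.4])   # illustrative sigmoid scores from a reranker
norm = max_normalize(scores)          # -> [1.0, 0.556, 0.444]
kept = scores[norm >= 0.5]            # keeps 0.9 and 0.5; min-max would map 0.5 to 0.2 and drop it
print(norm, kept)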
web2json/contentextractors.py ADDED
@@ -0,0 +1,379 @@

import os
import re
import json
import pdfkit
import requests
import warnings
import tempfile
# import textract
import html2text
import inscriptis
import trafilatura
from pathlib import Path
from markdownify import markdownify
from json_repair import repair_json
from bs4 import BeautifulSoup, Comment
from html_chunking import get_html_chunks
from urllib.error import URLError, HTTPError
from html_to_markdown import convert_to_markdown
from readabilipy import simple_json_from_html_string
from docling.document_converter import DocumentConverter
from dateparser_scripts.update_supported_languages_and_locales import to_string


def clean_html(html_content: str) -> str:
    """
    Cleans up the given HTML content by:
    - Removing <script> and <style> tags and their content.
    - Removing HTML comments.
    - Extracting and returning the visible text with normalized whitespace.

    Args:
        html_content (str): The HTML content to clean.

    Returns:
        str: The cleaned, visible text from the HTML.
    """
    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove scripts, styles, and other unwanted tags
    for tag in soup(["script", "style", "img", "a", "table", "tr", "td", "th", "thead", "tbody",
                     "tfoot", "header", "footer", "link", "rel"]):
        tag.decompose()

    # Remove elements that do not contain any visible text
    for element in soup.find_all():
        # If the element has no text (after stripping whitespace), remove it
        if not element.get_text(strip=True):
            element.decompose()

    # Remove HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Extract text and normalize whitespace
    # text = soup.get_text(separator=" ", strip=True)
    # clean_text = re.sub(r'\s+', ' ', text)
    # return clean_text
    return str(soup)


def print_content_extractors():
    print(
        [
            "Default: the plain text of the HTML page",
            "Inscriptis",
            "Trafilatura",
        ]
    )


class ContentExtractor:
    def get_text(self, html):
        return clean_html(html)

    # TODO: Clean this mess
    def url_to_html(self, url, clean=False):
        # Define custom headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.6",
            "Cache-Control": "max-age=0",
            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1"
        }

        try:
            # Request the page with custom headers
            response = requests.get(url, headers=headers, timeout=10)

            html = None

            if response.status_code == 200:
                html = response.text
            else:
                print(f"Failed to retrieve HTML. Status code: {response.status_code}")
                return None

            if clean:
                return self.get_text(html)

            return html

        except HTTPError as e:
            print(f"HTTP Error: {e.code} - {e.reason}")
            return None
        except URLError as e:
            print(f"URL Error: {e.reason}")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None


class Inscriptis(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Brave/119.0.0.0",
            "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
        }

        warnings.warn("\nBeware, use only clean links with no trackers, or it may produce unexpected results.")

    def get_text(self, html):
        """Extract text from HTML using inscriptis."""
        return inscriptis.get_text(html)

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text


class Docling(ContentExtractor):
    def __init__(self):
        super().__init__()

    # TODO: This is unexpected behaviour, but with the docling docs website being down, it's what works for now
    def get_text(self, text_content):
        result = None
        with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
            tmpfile.write(text_content)
            tmpfile.flush()
            tmpfile_path = tmpfile.name.replace("\\", "/")
            tmpfile_path = Path(tmpfile_path)
        try:
            converter = DocumentConverter()
            document = converter.convert(tmpfile_path).document
            tables = []
            for table_ix, table in enumerate(document.tables):
                table_text = table.export_to_markdown()
                tables.append(table_text)

            result = document.export_to_markdown()
            for table in tables:
                result += "\n\n" + table
        finally:
            os.remove(tmpfile_path)
        return result


class ReadabiliPy(ContentExtractor):
    def __init__(self):
        super().__init__()

    def get_text(self, html):
        content = simple_json_from_html_string(html, use_readability=True)
        json_object = json.dumps(content, indent=4)
        repaired = repair_json(json_object)
        return repaired


class Trafilatura(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        }

        warnings.warn("\nTrafilatura Content Extractor: Beware, use only clean links with no trackers, or it may produce unexpected results.")

        from copy import deepcopy
        from trafilatura.settings import DEFAULT_CONFIG
        config = deepcopy(DEFAULT_CONFIG)
        # config['DEFAULT']['MIN_EXTRACTED_SIZE'] = '5000'  # Configurable, but this value worked well for me
        self.config = config

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    def get_text(self, html, output_format="markdown", min_extracted_size_char=20_000):
        # self.config['DEFAULT']['MIN_EXTRACTED_SIZE'] = f"{min_extracted_size_char}"
        # self.config['DEFAULT']['MIN_OUTPUT_SIZE'] = f"{min_extracted_size_char}"
        return trafilatura.extract(filecontent=html, favor_recall=True, config=self.config, output_format=output_format)


class Markdownify(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = markdownify(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])

        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()

        return md


class HTML2Text(ContentExtractor):
    def get_text(self, html):
        converter = html2text.HTML2Text()
        converter.ignore_tables = True
        converter.ignore_links = True
        converter.ignore_images = True
        converter.ignore_mailto_links = True
        return converter.handle(html)


class HTML_TO_Markdown(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = convert_to_markdown(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])

        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()

        return md


class PDFkitDocling(ContentExtractor):
    def get_text(self, html):
        soup = BeautifulSoup(html, "html.parser")

        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        # PDF path to save
        pdf_path = 'test.pdf'

        # Create PDF
        pdfkit.from_string(content, pdf_path)

        converter = DocumentConverter()

        return converter.convert(pdf_path).document.export_to_markdown()


class TrafilatraCHUNKS(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)

        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]

        combined_text = ""
        for chunk in cleaned:
            if chunk is None:
                continue
            combined_text += chunk + "\n"

        return combined_text


class TrafilaCHUNKSRobust(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()

        for tag in soup.find_all(lambda tag: tag.attrs and any("nav" in str(v) for v in tag.attrs.values())):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)

        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]

        combined_text = ""
        for chunk in cleaned:
            if chunk is None:
                continue
            combined_text += chunk + "\n"

        return combined_text


class TrafilaCHUNKSRobustV2(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)

        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]

        combined_text = ""
        for chunk in cleaned:
            if chunk is None:
                continue
            combined_text += chunk + "\n"

        return combined_text


# Very bad, lol
# class Textract(ContentExtractor):
#     def get_text(self, html):
#         with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
#             tmpfile.write(html)
#             tmpfile.flush()
#             tmpfile_path = tmpfile.name.replace("\\", "/")
#             tmpfile_path = Path(tmpfile_path)
#         try:
#             result = textract.process(tmpfile_path)
#         finally:
#             os.remove(tmpfile_path)
#         return result
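
As a usage sketch for the new module (the URL is a placeholder, and the import path assumes the file lands at web2json/contentextractors.py as shown above):

from web2json.contentextractors import Trafilatura

extractor = Trafilatura()                              # the wrapper class above, not the trafilatura module
html = extractor.url_to_html("https://example.com")   # plain GET with browser-like headers
if html:
    md = extractor.get_text(html, output_format="markdown")
    print(md[:500] if md else "trafilatura returned no main content")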