Commit 698ce3e by mriusero
Parent: 6a99b0e
fix: add binary files via LFS only
Files changed:
- .gitattributes +1 -0
- .gitignore +2 -1
- chroma_db/chroma.sqlite3 +3 -0
- chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/data_level0.bin +3 -0
- chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/header.bin +3 -0
- chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/length.bin +3 -0
- chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/link_lists.bin +0 -0
- requirements.txt +12 -1
- src/agent/mistral_agent.py +7 -1
- src/agent/tools/__init__.py +3 -1
- src/agent/tools/retrieve_knowledge.py +45 -0
- src/agent/tools/visit_webpage.py +58 -0
- src/agent/utils/vector_store.py +236 -0
- src/web2llm/__init__.py +0 -0
- src/web2llm/app/__init__.py +5 -0
- src/web2llm/app/api/__init__.py +7 -0
- src/web2llm/app/api/models.py +52 -0
- src/web2llm/app/api/routes.py +159 -0
- src/web2llm/app/api/server.py +85 -0
- src/web2llm/app/converter/__init__.py +7 -0
- src/web2llm/app/converter/converter.py +407 -0
- src/web2llm/app/main.py +282 -0
- src/web2llm/app/scraper/__init__.py +7 -0
- src/web2llm/app/scraper/scraper.py +475 -0
- src/web2llm/app/utils/__init__.py +3 -0
- tools.json +42 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1,4 +1,5 @@
 .DS_Store
 .idea/
 .env
-__pycache__/
+__pycache__/
+tests.py
chroma_db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93417808fc00afc794b4c90d34635008d3120ef7768cbb920dc97a5d7f026032
+size 876544
chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2679902f7ee9902bd54e85a1e4b822cccb4a163c0d49ae93b57d42d40edf49d0
+size 42360000
chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/header.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f14d42069445548e1fceb9acb767255a21e1e9d11c021b2d5999d5cbf4d2b705
+size 100
chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/length.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d23a36b9568bb6826f624d484e2f255a46e20e70a60792401044c7e37f93ccd
+size 40000
chroma_db/d365f4bc-8099-45f4-bdc0-9c299960820d/link_lists.bin
ADDED
File without changes
requirements.txt
CHANGED
@@ -8,4 +8,15 @@ pandas
 scipy
 plotly
 dotenv
-mistralai
+mistralai
+fastapi
+uvicorn
+pydantic
+aiohttp
+markdown
+lxml[html_clean]
+readability-lxml
+chromadb
+markdownify
+html2markdown
+smolagents
src/agent/mistral_agent.py
CHANGED
@@ -5,6 +5,8 @@ from mistralai import Mistral
 from src.agent.utils.tooling import generate_tools_json
 from src.agent.tools import (
     calculate_sum,
+    retrieve_knowledge,
+    visit_webpage,
 )

 load_dotenv()
@@ -14,10 +16,12 @@ class MistralAgent:
         self.api_key = os.getenv("MISTRAL_API_KEY")
         self.agent_id = os.getenv("AGENT_ID")
         self.client = Mistral(api_key=self.api_key)
-        self.model = "mistral-
+        self.model = "mistral-large"
         self.prompt = None
         self.names_to_functions = {
             "calculate_sum": calculate_sum,
+            "retrieve_knowledge": retrieve_knowledge,
+            "visit_webpage": visit_webpage,
         }
         self.tools = self.get_tools()

@@ -27,5 +31,7 @@
         return generate_tools_json(
             [
                 calculate_sum,
+                retrieve_knowledge,
+                visit_webpage,
             ]
         ).get('tools')
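Not part of the commit: a minimal, hypothetical sketch of how the tools registered above might be dispatched once the Mistral chat API returns a tool call. The tool_call shape (function.name, JSON-encoded function.arguments) is an assumption about the mistralai client, not taken from this repository.

    # Hypothetical dispatch helper; `agent` is a MistralAgent instance.
    import json

    def run_tool_call(agent, tool_call):
        func = agent.names_to_functions[tool_call.function.name]  # e.g. "retrieve_knowledge"
        kwargs = json.loads(tool_call.function.arguments)         # arguments arrive as a JSON string
        return func(**kwargs)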
src/agent/tools/__init__.py
CHANGED
@@ -1 +1,3 @@
-from .calculator import calculate_sum
+from .calculator import calculate_sum
+from .retrieve_knowledge import retrieve_knowledge
+from .visit_webpage import visit_webpage
src/agent/tools/retrieve_knowledge.py
ADDED
@@ -0,0 +1,45 @@
+from src.agent.utils.tooling import tool
+
+def format_the(query, results):
+
+    if results == "No relevant data found in the knowledge database. Have you checked any webpages or use any tools? If so, please try to find more relevant data.":
+        return results
+    else:
+        formatted_text = f"# Knowledge for '{query}' \n\n"
+        formatted_text += f"Fetched {len(results['documents'])} relevant documents.\n\n"
+        try:
+            for i in range(len(results['documents'])):
+                formatted_text += f"## Document {i + 1} ---\n"
+                formatted_text += f"- Content: '''\n{results['documents'][i]}\n'''\n"
+                formatted_text += f"- Metadata: {results['metadatas'][i]}\n"
+                formatted_text += f"---\n\n"
+        except Exception as e:
+            return f"Error: Index out of range. Please check the results structure. {str(e)}"
+        return formatted_text
+
+@tool
+def retrieve_knowledge(query: str, n_results: int = 2) -> str:
+    """
+    Retrieves knowledge from a database with a provided query.
+    Args:
+        query (str): The query to search for in the vector store.
+        n_results (int, optional): The number of results to return. Default is 1.
+    """
+    try:
+        from src.agent.utils.vector_store import retrieve_from_database
+        distance_threshold = 0.4
+        results = retrieve_from_database(
+            query=query,
+            n_results=n_results,
+            distance_threshold=distance_threshold
+        )
+        results_formatted = format_the(query, results)
+        if results_formatted:
+            return results_formatted
+        else:
+            return "No relevant data found in the knowledge database. Have you checked any webpages or use any tools? If so, please try to find more relevant data."
+
+    except Exception as e:
+        print(f"Error retrieving knowledge: {e}")
+        return f"No relevant data found in the knowledge database. Have you checked any webpages or use any tools? If so, please try to find more relevant data."
+
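Not part of the commit: a hedged usage sketch of the tool above, assuming the repository root is on the import path and ./chroma_db already holds vectorized pages.

    # Hypothetical direct call to the tool function.
    from src.agent.tools import retrieve_knowledge

    print(retrieve_knowledge("chroma persistence", n_results=2))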
src/agent/tools/visit_webpage.py
ADDED
@@ -0,0 +1,58 @@
+from src.agent.utils.tooling import tool
+from src.agent.utils.vector_store import chunk_content, load_in_vector_db
+
+
+
+@tool
+def visit_webpage(url: str) -> str:
+    """
+    Visits a webpage at the given URL and reads its content as a markdown string.
+    This tool is useful for extracting information from web pages in a structured format after a search.
+    Args:
+        url (str): The URL of the webpage to visit.
+    """
+    try:
+        from src.web2llm.app.scraper import scrape_url
+        from src.web2llm.app.converter import html_to_markdown
+        import re
+        import requests
+        from markdownify import markdownify
+        from requests.exceptions import RequestException
+        from smolagents.utils import truncate_content
+        from urllib.parse import urlparse
+
+    except ImportError as e:
+        raise ImportError(
+            f"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests` : {e}"
+        ) from e
+
+    forbidden_domains = ["universetoday.com"]
+
+    parsed_url = urlparse(url)
+    domain = parsed_url.netloc
+
+    if domain in forbidden_domains:
+        return "This domain is forbidden and cannot be accessed, please try another one."
+
+    try:
+        # Web2LLM app
+        result = scrape_url(url, clean=True)
+        markdown_content = html_to_markdown(result["clean_html"])
+
+        load_in_vector_db(
+            markdown_content,
+            metadatas={
+                "title": result["title"],
+                "url": url,
+            }
+        )
+        return "The webpage has been successfully visited: content has been vectorized and stored in the knowledge base."
+
+    except requests.exceptions.Timeout:
+        return "The request timed out. Please try again later or check the URL."
+
+    except RequestException as e:
+        return f"Error fetching the webpage: {str(e)}"
+
+    except Exception as e:
+        return f"An unexpected error occurred: {str(e)}"
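Not part of the commit: a hedged sketch of the visit-then-retrieve flow these two tools implement together, assuming network access and a valid MISTRAL_API_KEY in .env.

    # Hypothetical flow: scrape and vectorize a page, then query the knowledge base.
    from src.agent.tools import visit_webpage, retrieve_knowledge

    print(visit_webpage("https://example.com/article"))      # stores chunks in ./chroma_db
    print(retrieve_knowledge("main topic of the article"))   # queries the stored chunks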
src/agent/utils/vector_store.py
ADDED
@@ -0,0 +1,236 @@
+import os
+from dotenv import load_dotenv
+from mistralai import Mistral
+import numpy as np
+import time
+import chromadb
+from chromadb.config import Settings
+import json
+import hashlib
+
+load_dotenv()
+MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
+COLLECTION_NAME = "webpages_collection"
+PERSIST_DIRECTORY = "./chroma_db"
+
+def vectorize(input_texts, batch_size=5):
+    """
+    Get the text embeddings for the given inputs using Mistral API.
+    """
+    try:
+        client = Mistral(api_key=MISTRAL_API_KEY)
+    except Exception as e:
+        print(f"Error initializing Mistral client: {e}")
+        return []
+
+    embeddings = []
+
+    for i in range(0, len(input_texts), batch_size):
+        batch = input_texts[i:i + batch_size]
+        while True:
+            try:
+                embeddings_batch_response = client.embeddings.create(
+                    model="mistral-embed",
+                    inputs=batch
+                )
+                time.sleep(1)
+                embeddings.extend([data.embedding for data in embeddings_batch_response.data])
+                break
+            except Exception as e:
+                if "rate limit exceeded" in str(e).lower():
+                    print("Rate limit exceeded. Retrying after 10 seconds...")
+                    time.sleep(10)
+                else:
+                    print(f"Error in embedding batch: {e}")
+                    raise
+
+    return embeddings
+
+
+def chunk_content(markdown_content, chunk_size=2048):
+    """
+    Vectorizes the given markdown content into chunks of specified size without cutting sentences.
+    """
+    def find_sentence_end(text, start):
+        """Find the nearest sentence end from the start index."""
+        punctuations = {'.', '!', '?'}
+        end = start
+        while end < len(text) and text[end] not in punctuations:
+            end += 1
+        while end < len(text) and text[end] in punctuations:
+            end += 1
+        while end > start and text[end - 1] not in punctuations:
+            end -= 1
+        return end
+
+    chunks = []
+    start = 0
+
+    while start < len(markdown_content):
+        end = min(start + chunk_size, len(markdown_content))
+        end = find_sentence_end(markdown_content, end)
+        chunks.append(markdown_content[start:end].strip())
+        start = end
+
+    return chunks
+
+
+def generate_chunk_id(chunk):
+    """Generate a unique ID for a chunk using SHA-256 hash."""
+    return hashlib.sha256(chunk.encode('utf-8')).hexdigest()
+
+
+def load_in_vector_db(markdown_content, metadatas=None, collection_name=COLLECTION_NAME):
+    """
+    Load the text embeddings into a ChromaDB collection for efficient similarity search.
+    """
+    try:
+        client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
+    except Exception as e:
+        print(f"Error initializing ChromaDB client: {e}")
+        return
+
+    try:
+        if collection_name not in [col.name for col in client.list_collections()]:
+            collection = client.create_collection(collection_name)
+        else:
+            collection = client.get_collection(collection_name)
+    except Exception as e:
+        print(f"Error accessing collection: {e}")
+        return
+
+    try:
+        existing_items = collection.get()
+    except Exception as e:
+        print(f"Error retrieving existing items: {e}")
+        return
+
+    existing_ids = set()
+
+    if 'ids' in existing_items:
+        existing_ids.update(existing_items['ids'])
+
+    chunks = chunk_content(markdown_content)
+    text_to_vectorize = []
+
+    for chunk in chunks:
+        chunk_id = generate_chunk_id(chunk)
+        if chunk_id not in existing_ids:
+            text_to_vectorize.append(chunk)
+
+    print(f"New chunks to vectorize: {len(text_to_vectorize)}")
+
+    if text_to_vectorize:
+        embeddings = vectorize(text_to_vectorize)
+        for embedding, chunk in zip(embeddings, text_to_vectorize):
+            chunk_id = generate_chunk_id(chunk)
+            if chunk_id not in existing_ids:
+                try:
+                    collection.add(
+                        embeddings=[embedding],
+                        documents=[chunk],
+                        metadatas=[metadatas],
+                        ids=[chunk_id]
+                    )
+                    existing_ids.add(chunk_id)
+                except Exception as e:
+                    print(f"Error adding embedding to collection: {e}")
+
+
+def retrieve_from_database(query, collection_name=COLLECTION_NAME, n_results=5, distance_threshold=None):
+    """
+    Retrieve the most similar documents from the vector store based on the query.
+    """
+    try:
+        client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
+        collection = client.get_collection(collection_name)
+    except Exception as e:
+        print(f"Error accessing collection: {e}")
+        return
+
+    try:
+        query_embeddings = vectorize([query])
+    except Exception as e:
+        print(f"Error vectorizing query: {e}")
+        return
+
+    try:
+        raw_results = collection.query(
+            query_embeddings=query_embeddings,
+            n_results=n_results,
+            include=["documents", "metadatas", "distances"]
+        )
+    except Exception as e:
+        print(f"Error querying collection: {e}")
+        return
+
+    if distance_threshold is not None:
+        filtered_results = {
+            "ids": [],
+            "distances": [],
+            "metadatas": [],
+            "documents": []
+        }
+        for i, distance in enumerate(raw_results['distances'][0]):
+            if distance <= distance_threshold:
+                filtered_results['ids'].append(raw_results['ids'][0][i])
+                filtered_results['distances'].append(distance)
+                filtered_results['metadatas'].append(raw_results['metadatas'][0][i])
+                filtered_results['documents'].append(raw_results['documents'][0][i])
+        results = filtered_results
+
+        if len(results['documents']) == 0:
+            return "No relevant data found in the knowledge database. Have you checked any webpages? If so, please try to find more relevant data."
+        else:
+            return results
+    else:
+        return raw_results
+
+
+def search_documents(collection_name=COLLECTION_NAME, query=None, query_embedding=None, metadata_filter=None, n_results=10):
+    """
+    Search for documents in a ChromaDB collection.
+
+    :param collection_name: The name of the collection to search within.
+    :param query: The text query to search for (optional).
+    :param query_embedding: The embedding query to search for (optional).
+    :param metadata_filter: A filter to apply to the metadata (optional).
+    :param n_results: The number of results to return (default is 10).
+    :return: The search results.
+    """
+    client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
+    collection = client.get_collection(collection_name)
+
+    if query:
+        query_embedding = vectorize([query])[0]
+
+    if query_embedding:
+        results = collection.query(query_embeddings=[query_embedding], n_results=n_results, where=metadata_filter)
+    else:
+        results = collection.get(where=metadata_filter, limit=n_results)
+
+    return results
+
+
+def delete_documents(collection_name=COLLECTION_NAME, ids=None):
+    """
+    Delete documents from a ChromaDB collection based on their IDs.
+
+    :param collection_name: The name of the collection.
+    :param ids: A list of IDs of the documents to delete.
+    """
+    client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
+    collection = client.get_collection(collection_name)
+
+    collection.delete(ids=ids)
+    print(f"Documents with IDs {ids} have been deleted from the collection {collection_name}.")
+
+def delete_collection(collection_name=COLLECTION_NAME):
+    """
+    Delete a ChromaDB collection.
+
+    :param collection_name: The name of the collection to delete.
+    """
+    client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
+    client.delete_collection(collection_name)
+    print(f"Collection {collection_name} has been deleted.")
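Not part of the commit: a hedged round trip through the helpers above (chunk, embed with mistral-embed, store in ChromaDB, query), assuming MISTRAL_API_KEY is set and ./chroma_db is writable.

    # Hypothetical round trip using the module's public helpers.
    from src.agent.utils.vector_store import load_in_vector_db, retrieve_from_database

    load_in_vector_db("Mars has two small moons. Phobos orbits closest to the planet.",
                      metadatas={"title": "Mars notes", "url": "https://example.com/mars"})
    print(retrieve_from_database("moons of Mars", n_results=2, distance_threshold=0.4))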
src/web2llm/__init__.py
ADDED
File without changes
src/web2llm/app/__init__.py
ADDED
@@ -0,0 +1,5 @@
+"""
+Web scraper and Markdown converter.
+"""
+
+__version__ = "0.1.0"
src/web2llm/app/api/__init__.py
ADDED
@@ -0,0 +1,7 @@
+"""
+REST API module for the web scraper and Markdown converter.
+"""
+
+from src.web2llm.app.api.routes import router
+
+__all__ = ['router']
src/web2llm/app/api/models.py
ADDED
@@ -0,0 +1,52 @@
+"""
+Data models for the API.
+"""
+from typing import List, Optional, Dict, Union, Any
+from pydantic import BaseModel, HttpUrl, validator, Field
+
+
+class ScrapeRequest(BaseModel):
+    """Model for a scrape request."""
+    url: str = Field(..., description="URL to scrape")
+    save: bool = Field(False, description="Save the result to a Markdown file")
+    filename: Optional[str] = Field(None, description="Filename to save under")
+    clean: bool = Field(True, description="Clean the HTML before conversion")
+
+    @validator('url')
+    def url_must_be_valid(cls, v):
+        """URL validation."""
+        if not v.startswith(('http://', 'https://')):
+            raise ValueError('URL must start with http:// or https://')
+        return v
+
+
+class MultipleScrapeRequest(BaseModel):
+    """Model for a multiple-scrape request."""
+    urls: List[str] = Field(..., description="List of URLs to scrape")
+    save: bool = Field(True, description="Save the results to Markdown files")
+
+    @validator('urls')
+    def urls_must_be_valid(cls, v):
+        """URL validation."""
+        for url in v:
+            if not url.startswith(('http://', 'https://')):
+                raise ValueError(f'URL {url} must start with http:// or https://')
+        return v
+
+
+class ScrapeResponse(BaseModel):
+    """Model for the scrape response."""
+    url: str = Field(..., description="Scraped URL")
+    title: Optional[str] = Field(None, description="Page title")
+    markdown: Optional[str] = Field(None, description="Markdown content")
+    saved: bool = Field(False, description="Whether the file was saved")
+    saved_path: Optional[str] = Field(None, description="Path of the saved file")
+    success: bool = Field(..., description="Whether the scrape succeeded")
+    error: Optional[str] = Field(None, description="Error message, if any")
+
+
+class MultipleScrapeResponse(BaseModel):
+    """Model for the multiple-scrape response."""
+    total: int = Field(..., description="Total number of URLs processed")
+    success: int = Field(..., description="Number of URLs processed successfully")
+    results: List[ScrapeResponse] = Field(..., description="Results for each URL")
src/web2llm/app/api/routes.py
ADDED
@@ -0,0 +1,159 @@
+"""
+API routes.
+"""
+import os
+from typing import Dict, List, Any
+
+from fastapi import APIRouter, HTTPException, BackgroundTasks
+from fastapi.responses import JSONResponse, FileResponse
+
+from app.main import WebToMarkdown
+from app.api.models import (
+    ScrapeRequest, ScrapeResponse,
+    MultipleScrapeRequest, MultipleScrapeResponse
+)
+
+router = APIRouter()
+processor = WebToMarkdown()
+
+
+@router.post("/scrape", response_model=ScrapeResponse, tags=["Scraping"])
+async def scrape_url(request: ScrapeRequest) -> Dict[str, Any]:
+    """
+    Scrapes a URL and converts the content to Markdown.
+
+    - **url**: The URL to scrape
+    - **save**: If True, save the result to a Markdown file
+    - **filename**: Filename for saving (optional)
+    - **clean**: If True, clean the HTML before conversion
+
+    Returns the Markdown content and other information.
+    """
+    result = processor.process_url(
+        url=request.url,
+        save=request.save,
+        filename=request.filename
+    )
+
+    if not result["success"]:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Scraping error: {result.get('error', 'Unknown error')}"
+        )
+
+    return result
+
+
+@router.post("/scrape/save", tags=["Scraping"])
+async def scrape_and_save(request: ScrapeRequest) -> Dict[str, Any]:
+    """
+    Scrapes a URL, converts it to Markdown and saves it to a file.
+
+    - **url**: The URL to scrape
+    - **filename**: Filename for saving (optional)
+    - **clean**: If True, clean the HTML before conversion
+
+    Returns the path of the saved file and other information.
+    """
+    # Force saving
+    request.save = True
+
+    result = processor.process_url(
+        url=request.url,
+        save=True,
+        filename=request.filename
+    )
+
+    if not result["success"]:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Scraping error: {result.get('error', 'Unknown error')}"
+        )
+
+    if not result["saved"] or not result["saved_path"]:
+        raise HTTPException(
+            status_code=500,
+            detail="Failed to save the file"
+        )
+
+    return {
+        "success": True,
+        "file_path": result["saved_path"],
+        "title": result["title"],
+        "url": result["url"]
+    }
+
+
+@router.post("/scrape/download", tags=["Scraping"])
+async def scrape_and_download(request: ScrapeRequest) -> FileResponse:
+    """
+    Scrapes a URL, converts it to Markdown and returns the file directly.
+
+    - **url**: The URL to scrape
+    - **filename**: Filename for saving (optional)
+    - **clean**: If True, clean the HTML before conversion
+
+    Returns the Markdown file directly for download.
+    """
+    # Force saving
+    request.save = True
+
+    result = processor.process_url(
+        url=request.url,
+        save=True,
+        filename=request.filename
+    )
+
+    if not result["success"]:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Scraping error: {result.get('error', 'Unknown error')}"
+        )
+
+    if not result["saved"] or not result["saved_path"]:
+        raise HTTPException(
+            status_code=500,
+            detail="Failed to save the file"
+        )
+
+    return FileResponse(
+        path=result["saved_path"],
+        media_type="text/markdown",
+        filename=os.path.basename(result["saved_path"])
+    )
+
+
+@router.post("/scrape/multiple", response_model=MultipleScrapeResponse, tags=["Scraping multiple"])
+async def scrape_multiple_urls(
+    request: MultipleScrapeRequest,
+    background_tasks: BackgroundTasks
+) -> Dict[str, Any]:
+    """
+    Scrapes several URLs in parallel.
+
+    - **urls**: List of URLs to scrape
+    - **save**: If True, save the results to Markdown files
+
+    Returns the results for all URLs.
+    """
+    if len(request.urls) > 10:
+        # For many URLs, process in the background
+        background_tasks.add_task(
+            processor.process_multiple_urls,
+            urls=request.urls,
+            save=request.save
+        )
+        return {
+            "total": len(request.urls),
+            "success": None,  # Unknown because processing happens in the background
+            "results": [],
+            "message": f"Processing {len(request.urls)} URLs in the background"
+        }
+
+    # For few URLs, process immediately
+    result = processor.process_multiple_urls(
+        urls=request.urls,
+        save=request.save
+    )
+
+    return result
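Not part of the commit: a hedged example of calling the /api/scrape route once the FastAPI app is running; the host and port are assumptions based on the defaults in server.py below.

    # Hypothetical client call against a locally running instance.
    import requests

    resp = requests.post("http://localhost:8000/api/scrape",
                         json={"url": "https://example.com", "save": False, "clean": True})
    print(resp.json().get("title"))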
src/web2llm/app/api/server.py
ADDED
@@ -0,0 +1,85 @@
+"""
+FastAPI server configuration.
+"""
+import os
+import logging
+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from dotenv import load_dotenv
+
+from app import __version__
+from app.api.routes import router
+
+# Load environment variables
+load_dotenv()
+
+# Configuration
+API_HOST = os.getenv("API_HOST", "0.0.0.0")
+API_PORT = int(os.getenv("API_PORT", 8000))
+
+# Create the FastAPI application
+app = FastAPI(
+    title="Web Scraper and Markdown Converter API",
+    description="""
+    API for scraping web sites, cleaning the content and converting it to Markdown.
+    Ideal for preparing data for AI systems.
+    """,
+    version=__version__,
+    docs_url="/docs",
+    redoc_url="/redoc",
+)
+
+# CORS configuration
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # For production, restrict to the allowed domains
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Register the routes
+app.include_router(router, prefix="/api")
+
+# Exception handler
+@app.exception_handler(Exception)
+async def global_exception_handler(request: Request, exc: Exception):
+    """Global exception handler."""
+    logging.error(f"Unhandled exception: {str(exc)}")
+    return JSONResponse(
+        status_code=500,
+        content={"detail": f"Internal server error: {str(exc)}"}
+    )
+
+# Root route
+@app.get("/", tags=["Informations"])
+async def root():
+    """API home page."""
+    return {
+        "name": "Web Scraper and Markdown Converter API",
+        "version": __version__,
+        "docs": "/docs",
+        "redoc": "/redoc"
+    }
+
+# API health check
+@app.get("/health", tags=["Informations"])
+async def health_check():
+    """API health check."""
+    return {"status": "ok", "version": __version__}
+
+
+def start():
+    """Start the server with uvicorn."""
+    import uvicorn
+    uvicorn.run(
+        "app.api.server:app",
+        host=API_HOST,
+        port=API_PORT,
+        reload=True
+    )
+
+
+if __name__ == "__main__":
+    start()
src/web2llm/app/converter/__init__.py
ADDED
@@ -0,0 +1,7 @@
+"""
+HTML to Markdown conversion module.
+"""
+
+from src.web2llm.app.converter.converter import MarkdownConverter, html_to_markdown, save_markdown
+
+__all__ = ['MarkdownConverter', 'html_to_markdown', 'save_markdown']
src/web2llm/app/converter/converter.py
ADDED
@@ -0,0 +1,407 @@
+"""
+Module converting HTML to Markdown.
+"""
+import os
+import logging
+import re
+from typing import Optional, Dict, Any
+from html2markdown import convert
+from bs4 import BeautifulSoup
+import markdown
+from urllib.parse import urlparse, urljoin
+
+# Logging configuration
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+class MarkdownConverter:
+    """Class for converting HTML to Markdown with advanced cleaning options."""
+
+    def __init__(self, base_url: Optional[str] = None):
+        """
+        Initializes the converter.
+
+        Args:
+            base_url: Base URL used to resolve relative links
+        """
+        self.base_url = base_url
+
+    def fix_relative_urls(self, html_content: str, base_url: Optional[str] = None) -> str:
+        """
+        Replaces relative URLs with absolute URLs.
+
+        Args:
+            html_content: The HTML content
+            base_url: The base URL used to resolve relative links
+
+        Returns:
+            HTML with absolute links
+        """
+        if not base_url and not self.base_url:
+            return html_content
+
+        url_to_use = base_url if base_url else self.base_url
+
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Fix links
+        for a_tag in soup.find_all('a', href=True):
+            if not a_tag['href'].startswith(('http://', 'https://', 'mailto:', 'tel:', '#')):
+                a_tag['href'] = urljoin(url_to_use, a_tag['href'])
+
+        # Fix images
+        for img_tag in soup.find_all('img', src=True):
+            if not img_tag['src'].startswith(('http://', 'https://', 'data:')):
+                img_tag['src'] = urljoin(url_to_use, img_tag['src'])
+
+        return str(soup)
+
+    def pre_process_html(self, html_content: str) -> str:
+        """
+        Pre-processes the HTML to improve the Markdown conversion.
+
+        Args:
+            html_content: The HTML content
+
+        Returns:
+            Pre-processed HTML
+        """
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Remove all scripts and styles - critical first pass
+        for element in soup.find_all(['script', 'style', 'noscript', 'iframe']):
+            element.decompose()
+
+        # Remove inline JavaScript attributes and styles
+        for tag in soup.find_all(True):
+            # List storing the attributes to remove
+            attrs_to_remove = []
+
+            for attr in tag.attrs:
+                # Remove style and JavaScript attributes
+                if attr == 'style' or attr.startswith('on'):
+                    attrs_to_remove.append(attr)
+
+            # Remove the identified attributes
+            for attr in attrs_to_remove:
+                del tag[attr]
+
+        # Convert divs that behave like paragraphs into real paragraphs
+        for div in soup.find_all('div'):
+            if not div.find(['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'ul', 'ol']):
+                div.name = 'p'
+
+        # Make sure lists are properly formatted
+        for ul in soup.find_all(['ul', 'ol']):
+            for child in ul.children:
+                if child.name != 'li' and child.name is not None:
+                    # Convert or wrap in a li
+                    if child.string and child.string.strip():
+                        new_li = soup.new_tag('li')
+                        child.wrap(new_li)
+
+        # Process tables for a better conversion
+        for table in soup.find_all('table'):
+            # Make sure every table has a thead and tbody
+            if not table.find('thead'):
+                thead = soup.new_tag('thead')
+                first_tr = table.find('tr')
+                if first_tr:
+                    first_tr.wrap(thead)
+
+            # Make sure tbody exists
+            if not table.find('tbody'):
+                tbody = soup.new_tag('tbody')
+                for tr in table.find_all('tr')[1:]:
+                    tr.wrap(tbody)
+
+        # Clean useless span tags
+        for span in soup.find_all('span'):
+            if not span.attrs:  # If the span has no attributes
+                span.unwrap()
+
+        # Remove JavaScript/Flash/etc. objects
+        for obj in soup.find_all(['object', 'embed']):
+            obj.decompose()
+
+        # Remove forms (usually useless for content extraction)
+        for form in soup.find_all('form'):
+            form.decompose()
+
+        # Return the pre-processed HTML
+        return str(soup)
+
+    def clean_markdown(self, markdown_content: str) -> str:
+        """
+        Cleans the generated Markdown.
+
+        Args:
+            markdown_content: The Markdown content
+
+        Returns:
+            Cleaned Markdown
+        """
+        # Remove consecutive empty lines
+        markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
+
+        # Clean links that may have been badly converted
+        markdown_content = re.sub(r'\[(.+?)\]\s*\[\]', r'\1', markdown_content)
+
+        # Remove JavaScript script blocks
+        markdown_content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', markdown_content)
+
+        # Remove CSS style blocks
+        markdown_content = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', markdown_content)
+
+        # Remove CDATA blocks that could contain JavaScript or CSS
+        markdown_content = re.sub(r'<!\[CDATA\[[\s\S]*?\]\]>', '', markdown_content)
+
+        # Clean ALL HTML tags, not just some of them
+        markdown_content = re.sub(r'</?[a-zA-Z][^>]*>', '', markdown_content)
+
+        # Clean <br> tags and replace them with line breaks
+        markdown_content = re.sub(r'<br\s*/?>', '\n', markdown_content)
+
+        # Clean excessive spaces
+        markdown_content = re.sub(r' {2,}', ' ', markdown_content)
+
+        # Clean remaining HTML attributes and all tags with their attributes
+        markdown_content = re.sub(r'<([a-z0-9]+)(?:\s+[a-z0-9-]+(?:=(?:"[^"]*"|\'[^\']*\'))?)*\s*>', '', markdown_content)
+        markdown_content = re.sub(r'</[a-z0-9]+>', '', markdown_content)
+
+        # Remove HTML comments
+        markdown_content = re.sub(r'<!--[\s\S]*?-->', '', markdown_content)
+
+        # Remove all HTML escape entities
+        markdown_content = re.sub(r'&[a-zA-Z]+;', ' ', markdown_content)
+
+        # Remove styles and scripts that may be embedded in code blocks
+        markdown_content = re.sub(r'```(?:javascript|js|css|style)[\s\S]*?```', '', markdown_content)
+
+        # Remove lines that look like CSS (property: value;)
+        markdown_content = re.sub(r'^[a-z-]+:\s*[^;]+;\s*$', '', markdown_content, flags=re.MULTILINE)
+
+        # Remove lines that look like JavaScript declarations
+        markdown_content = re.sub(r'^var\s+[a-zA-Z0-9_$]+\s*=', '', markdown_content, flags=re.MULTILINE)
+        markdown_content = re.sub(r'^function\s+[a-zA-Z0-9_$]+\s*\(', '', markdown_content, flags=re.MULTILINE)
+        markdown_content = re.sub(r'^const\s+[a-zA-Z0-9_$]+\s*=', '', markdown_content, flags=re.MULTILINE)
+        markdown_content = re.sub(r'^let\s+[a-zA-Z0-9_$]+\s*=', '', markdown_content, flags=re.MULTILINE)
+
+        # Remove isolated braces that may come from code
+        markdown_content = re.sub(r'^\s*[{}]\s*$', '', markdown_content, flags=re.MULTILINE)
+
+        # Remove double spaces left after removing the tags
+        markdown_content = re.sub(r' {2,}', ' ', markdown_content)
+
+        # Clean the multiple empty lines that can appear after removing the tags
+        markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
+
+        # Remove lines that only contain non-significant characters
+        markdown_content = re.sub(r'^\s*[;:.,_\-*+#]+\s*$', '', markdown_content, flags=re.MULTILINE)
+
+        return markdown_content.strip()
+
+    def html_to_markdown(self, html_content: str, url: Optional[str] = None) -> str:
+        """
+        Converts HTML to Markdown.
+
+        Args:
+            html_content: The HTML content
+            url: The source URL used to resolve relative links
+
+        Returns:
+            Content in Markdown format
+        """
+        try:
+            # Pre-process the HTML
+            html_content = self.pre_process_html(html_content)
+
+            # Fix relative URLs if a URL is provided
+            base_url = url or self.base_url
+            if base_url:
+                html_content = self.fix_relative_urls(html_content, base_url)
+
+            # Approach 1: use html2markdown (the standard library)
+            markdown_content_1 = convert(html_content)
+            markdown_content_1 = self.clean_markdown(markdown_content_1)
+
+            # If the result looks good, return it
+            if not ('<' in markdown_content_1 and '>' in markdown_content_1):
+                return markdown_content_1
+
+            # Approach 2: direct extraction with BeautifulSoup
+            soup = BeautifulSoup(html_content, 'html.parser')
+            content_parts = []
+
+            # Add the title
+            if soup.title:
+                content_parts.append(f"# {soup.title.string.strip()}\n\n")
+
+            # Add headings and subheadings
+            for i in range(1, 7):
+                for header in soup.find_all(f'h{i}'):
+                    content_parts.append(f"{'#' * i} {header.get_text().strip()}\n\n")
+
+            # Add paragraphs
+            for p in soup.find_all('p'):
+                text = p.get_text().strip()
+                if text:
+                    content_parts.append(f"{text}\n\n")
+
+            # Add unordered lists
+            for ul in soup.find_all('ul'):
+                for li in ul.find_all('li'):
+                    content_parts.append(f"* {li.get_text().strip()}\n")
+                content_parts.append("\n")
+
+            # Add ordered lists
+            for ol in soup.find_all('ol'):
+                for i, li in enumerate(ol.find_all('li')):
+                    content_parts.append(f"{i+1}. {li.get_text().strip()}\n")
+                content_parts.append("\n")
+
+            # Add tables (simple version)
+            for table in soup.find_all('table'):
+                for tr in table.find_all('tr'):
+                    row = []
+                    for cell in tr.find_all(['td', 'th']):
+                        row.append(cell.get_text().strip())
+                    if row:
+                        content_parts.append("| " + " | ".join(row) + " |\n")
+                content_parts.append("\n")
+
+            # Add blockquotes
+            for blockquote in soup.find_all('blockquote'):
+                lines = blockquote.get_text().strip().split('\n')
+                for line in lines:
+                    if line.strip():
+                        content_parts.append(f"> {line.strip()}\n")
+                content_parts.append("\n")
+
+            # Add code blocks
+            for pre in soup.find_all('pre'):
+                content_parts.append("```\n")
+                content_parts.append(pre.get_text().strip() + "\n")
+                content_parts.append("```\n\n")
+
+            # Add images
+            for img in soup.find_all('img'):
+                alt = img.get('alt', '')
+                src = img.get('src', '')
+                if src:
+                    content_parts.append(f"![{alt}]({src})\n\n")
+
+            # Add links
+            for a in soup.find_all('a'):
+                text = a.get_text().strip()
+                href = a.get('href', '')
+                if href and text:
+                    content_parts.append(f"[{text}]({href})\n\n")
+
+            # Other significant text blocks
+            for div in soup.find_all(['div', 'article', 'section', 'main']):
+                # Skip divs that already contain processed elements
+                if not div.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):
+                    text = div.get_text().strip()
+                    if len(text) > 100:  # Significant content
+                        content_parts.append(f"{text}\n\n")
+
+            markdown_content_2 = ''.join(content_parts)
+
+            # Approach 3: raw text extraction as a last resort
+            if not markdown_content_2 or len(markdown_content_2) < 200:
+                markdown_content_3 = soup.get_text(separator='\n\n', strip=True)
+                # Clean and structure the raw text
+                paragraphs = [p.strip() for p in markdown_content_3.split('\n\n') if p.strip()]
+                markdown_content_3 = '\n\n'.join(paragraphs)
+
+                # If this approach gives a better result, use it
+                if len(markdown_content_3) > len(markdown_content_2):
+                    markdown_content_2 = markdown_content_3
+
+            # Clean the final result
+            markdown_content_2 = self.clean_markdown(markdown_content_2)
+
+            # Pick the best approach
+            if len(markdown_content_1) > len(markdown_content_2) and '<' not in markdown_content_1:
+                return markdown_content_1
+            else:
+                return markdown_content_2
+
+        except Exception as e:
+            logger.error(f"Error while converting to Markdown: {str(e)}")
+            # Fallback: simple text extraction
+            soup = BeautifulSoup(html_content, 'html.parser')
+            text = soup.get_text(separator='\n\n', strip=True)
+            return self.clean_markdown(text)
+
+    def save_markdown(self, markdown_content: str, filepath: str) -> bool:
+        """
+        Saves the Markdown content to a file.
+
+        Args:
+            markdown_content: The Markdown content
+            filepath: Path where the file should be saved
+
+        Returns:
+            True if saving succeeded, False otherwise
+        """
+        try:
+            # Make sure the directory exists
+            os.makedirs(os.path.dirname(os.path.abspath(filepath)), exist_ok=True)
+
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown_content)
+
+            logger.info(f"Markdown content successfully saved to {filepath}")
+            return True
+        except Exception as e:
+            logger.error(f"Error while saving the Markdown file: {str(e)}")
+            return False
+
+    def markdown_to_html(self, markdown_content: str) -> str:
+        """
+        Converts Markdown to HTML (useful for previewing).
+
+        Args:
+            markdown_content: The Markdown content
+
+        Returns:
+            Content in HTML format
+        """
+        try:
+            return markdown.markdown(markdown_content, extensions=['tables', 'fenced_code'])
+        except Exception as e:
+            logger.error(f"Error while converting Markdown to HTML: {str(e)}")
+            return f"<pre>{markdown_content}</pre>"
+
+
+# Utility functions for quick use
+def html_to_markdown(html_content: str, url: Optional[str] = None) -> str:
+    """
+    Utility function to convert HTML to Markdown.
+
+    Args:
+        html_content: The HTML content
+        url: The source URL used to resolve relative links
+
+    Returns:
+        Content in Markdown format
+    """
+    converter = MarkdownConverter(base_url=url)
+    return converter.html_to_markdown(html_content, url)
+
+def save_markdown(markdown_content: str, filepath: str) -> bool:
+    """
+    Utility function to save Markdown to a file.
+
+    Args:
+        markdown_content: The Markdown content
+        filepath: Path where the file should be saved
+
+    Returns:
+        True if saving succeeded, False otherwise
+    """
+    converter = MarkdownConverter()
+    return converter.save_markdown(markdown_content, filepath)
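Not part of the commit: a hedged sketch of the module-level helpers above.

    # Hypothetical usage of the utility functions exported by this module.
    from src.web2llm.app.converter import html_to_markdown, save_markdown

    md = html_to_markdown("<h1>Title</h1><p>Some <a href='/doc'>text</a>.</p>", url="https://example.com")
    save_markdown(md, "./output/example.md")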
src/web2llm/app/main.py
ADDED
@@ -0,0 +1,282 @@
+"""
+Main module of the Web Scraper and Markdown Converter application.
+"""
+import os
+import logging
+import time
+from typing import Dict, Optional, Union, List
+from urllib.parse import urlparse
+import pathlib
+from dotenv import load_dotenv
+
+from src.web2llm.app.scraper.scraper import WebScraper
+from src.web2llm.app.converter.converter import MarkdownConverter
+
+# Logging configuration
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Load environment variables
+load_dotenv()
+
+# Configuration
+OUTPUT_DIR = os.getenv('OUTPUT_DIR', './output')
+DEFAULT_FILENAME = os.getenv('DEFAULT_FILENAME', 'scraped_content')
+
+class WebToMarkdown:
+    """Main class combining scraping and Markdown conversion."""
+
+    def __init__(self, output_dir: str = OUTPUT_DIR):
+        """
+        Initializes the tool.
+
+        Args:
+            output_dir: Directory where the Markdown files are saved
+        """
+        self.scraper = WebScraper()
+        self.converter = MarkdownConverter()
+        self.output_dir = output_dir
+
+        # Make sure the output directory exists
+        os.makedirs(self.output_dir, exist_ok=True)
+
+    def generate_filename(self, url: str, title: Optional[str] = None, extension: str = '.md') -> str:
+        """
+        Generates a valid filename from the URL or the title.
+
+        Args:
+            url: The page URL
+            title: The page title (optional)
+            extension: The file extension (.md by default)
+
+        Returns:
+            A valid filename
+        """
+        if title:
+            # Clean the title to make it a valid filename
+            safe_title = "".join([c if c.isalnum() or c in [' ', '-', '_'] else "_" for c in title])
+            safe_title = safe_title.strip()
+            filename = safe_title[:100]  # Limit the length while allowing longer names
+        else:
+            # Use the URL
+            parsed_url = urlparse(url)
+            hostname = parsed_url.netloc
+            path = parsed_url.path.strip('/')
+            filename = f"{hostname}_{path}".replace('/', '_')
+
+        # Replace spaces with dashes
+        filename = filename.replace(' ', '-')
+
+        # Make sure the name ends with the given extension
+        if not filename.endswith(extension):
+            filename += extension
+
+        return filename
+
+    def save_raw_html(self, html_content: str, filepath: str) -> bool:
+        """
+        Saves the raw HTML content to a file.
+
+        Args:
+            html_content: The HTML content
+            filepath: Path where the file should be saved
+
+        Returns:
+            True if saving succeeded, False otherwise
+        """
+        try:
+            # Make sure the directory exists
+            os.makedirs(os.path.dirname(os.path.abspath(filepath)), exist_ok=True)
+
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(html_content)
+
+            logger.info(f"HTML content successfully saved to {filepath}")
+            return True
+        except Exception as e:
+            logger.error(f"Error while saving the HTML file: {str(e)}")
+            return False
+
+    def process_url(self, url: str, save: bool = False,
+                    filename: Optional[str] = None) -> Dict[str, Union[str, None, bool]]:
+        """
+        Processes a URL: scraping, cleaning and conversion to Markdown.
+
+        Args:
+            url: The URL to process
+            save: If True, save the result to a file
+            filename: Filename used for saving
+
+        Returns:
+            Dictionary with the results and the status
+        """
+        result = {
+            "url": url,
+            "title": None,
+            "markdown": None,
+            "saved": False,
+            "saved_path": None,
+            "success": False,
+            "error": None,
+            "html_saved": False,
+            "html_saved_path": None
+        }
+
+        try:
+            # Set the base URL used to convert relative links
+            self.converter.base_url = url
+
+            # Scrape the URL
+            logger.info(f"Scraping URL: {url}")
+            scraped_data = self.scraper.scrape(url, clean=True, extract_text=True)
+
+            # Store the title
+            result["title"] = scraped_data["title"]
+
+            if not scraped_data["clean_html"]:
+                result["error"] = "Unable to fetch or clean the HTML content"
+                return result
+
+            # Convert to Markdown
+            logger.info("Converting HTML to Markdown")
+            markdown_content = self.converter.html_to_markdown(
+                scraped_data["clean_html"], url)
+
+            # Check whether the conversion produced a meaningful result
+            if not markdown_content or len(markdown_content) < 100:
+                logger.warning("Markdown conversion insufficient, falling back to raw text")
+
+                # If raw text is available, use it as an alternative
+                if scraped_data["text_content"]:
+                    markdown_content = scraped_data["text_content"]
+                else:
+                    # Last attempt: extract the text from the cleaned HTML
+                    from bs4 import BeautifulSoup
+                    soup = BeautifulSoup(scraped_data["clean_html"], 'html.parser')
+                    markdown_content = soup.get_text(separator='\n\n', strip=True)
+
+            # Update the result
+            result["markdown"] = markdown_content
+            result["success"] = True
+
+            # Save if requested
+            if save:
+                # Generate a filename if none is given
+                if not filename:
+                    filename = self.generate_filename(url, result["title"])
+                # Make sure the extension is .md
+                elif not filename.endswith('.md'):
+                    filename += '.md'
+
+                filepath = os.path.join(self.output_dir, filename)
+
+                # Save the Markdown file
+                saved = self.converter.save_markdown(markdown_content, filepath)
+                result["saved"] = saved
+                result["saved_path"] = filepath if saved else None
+
+                # If the Markdown conversion is not optimal, also save the HTML
+                if len(markdown_content) < 500 or "<" in markdown_content:
+                    html_filename = filename.replace('.md', '.html')
+                    html_filepath = os.path.join(self.output_dir, html_filename)
+                    html_saved = self.save_raw_html(scraped_data["clean_html"], html_filepath)
+                    result["html_saved"] = html_saved
+                    result["html_saved_path"] = html_filepath if html_saved else None
+
+                    if html_saved:
+                        logger.info(f"The HTML was additionally saved to {html_filepath}")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Error while processing URL {url}: {str(e)}")
+            result["error"] = str(e)
+
+            # On error, try to save the raw HTML if available
+            if save and scraped_data and "raw_html" in scraped_data and scraped_data["raw_html"]:
+                if not filename:
+                    filename = self.generate_filename(url, result["title"], '.html')
+                else:
+                    filename = filename.replace('.md', '.html')
+
+                html_filepath = os.path.join(self.output_dir, filename)
+                html_saved = self.save_raw_html(scraped_data["raw_html"], html_filepath)
+
+                result["html_saved"] = html_saved
+                result["html_saved_path"] = html_filepath if html_saved else None
+
+                if html_saved:
+                    logger.info(f"Fallback save of the raw HTML to {html_filepath}")
+
+            return result
+
+    def process_multiple_urls(self, urls: List[str], save: bool = True) -> Dict[str, List[Dict]]:
|
215 |
+
"""
|
216 |
+
Traite plusieurs URLs en parallèle.
|
217 |
+
|
218 |
+
Args:
|
219 |
+
urls: Liste d'URLs à traiter
|
220 |
+
save: Si True, sauvegarde les résultats
|
221 |
+
|
222 |
+
Returns:
|
223 |
+
Dictionnaire contenant les résultats pour chaque URL
|
224 |
+
"""
|
225 |
+
results = []
|
226 |
+
|
227 |
+
for url in urls:
|
228 |
+
result = self.process_url(url, save=save)
|
229 |
+
results.append(result)
|
230 |
+
|
231 |
+
return {
|
232 |
+
"total": len(urls),
|
233 |
+
"success": sum(1 for r in results if r["success"]),
|
234 |
+
"results": results
|
235 |
+
}
|
236 |
+
|
237 |
+
|
238 |
+
# Fonction pour une utilisation rapide en ligne de commande
|
239 |
+
def process_url(url: str, save: bool = False, filename: Optional[str] = None) -> Dict:
|
240 |
+
"""
|
241 |
+
Fonction utilitaire pour traiter rapidement une URL.
|
242 |
+
|
243 |
+
Args:
|
244 |
+
url: L'URL à traiter
|
245 |
+
save: Si True, sauvegarde le résultat
|
246 |
+
filename: Nom du fichier pour la sauvegarde
|
247 |
+
|
248 |
+
Returns:
|
249 |
+
Dictionnaire avec les résultats
|
250 |
+
"""
|
251 |
+
processor = WebToMarkdown()
|
252 |
+
return processor.process_url(url, save, filename)
|
253 |
+
|
254 |
+
|
255 |
+
if __name__ == "__main__":
|
256 |
+
import argparse
|
257 |
+
|
258 |
+
parser = argparse.ArgumentParser(description="Scraper et convertisseur Markdown")
|
259 |
+
parser.add_argument("url", help="URL à scraper")
|
260 |
+
parser.add_argument("--save", action="store_true", help="Sauvegarder en fichier Markdown")
|
261 |
+
parser.add_argument("--output", help="Nom du fichier de sortie")
|
262 |
+
parser.add_argument("--dir", help="Répertoire de sortie", default=OUTPUT_DIR)
|
263 |
+
|
264 |
+
args = parser.parse_args()
|
265 |
+
|
266 |
+
processor = WebToMarkdown(output_dir=args.dir)
|
267 |
+
result = processor.process_url(args.url, save=args.save, filename=args.output)
|
268 |
+
|
269 |
+
if result["success"]:
|
270 |
+
print(f"Titre: {result['title']}")
|
271 |
+
print("\nContenu Markdown:")
|
272 |
+
print("-------------------")
|
273 |
+
print(result["markdown"][:500] + "..." if len(result["markdown"]) > 500 else result["markdown"])
|
274 |
+
|
275 |
+
if result["saved"]:
|
276 |
+
print(f"\nFichier sauvegardé: {result['saved_path']}")
|
277 |
+
if result["html_saved"]:
|
278 |
+
print(f"\nFichier HTML sauvegardé: {result['html_saved_path']}")
|
279 |
+
else:
|
280 |
+
print(f"Erreur: {result['error']}")
|
281 |
+
if result["html_saved"]:
|
282 |
+
print(f"\nFichier HTML de secours sauvegardé: {result['html_saved_path']}")
|
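
For orientation, here is a minimal usage sketch of the WebToMarkdown pipeline added above. It is not part of the commit: the import path, URL and output directory are illustrative assumptions.

# Illustrative only - not part of this commit.
# Assumes src/web2llm/app/main.py is importable as src.web2llm.app.main.
from src.web2llm.app.main import WebToMarkdown

processor = WebToMarkdown(output_dir="output")                     # hypothetical output directory
result = processor.process_url("https://example.com", save=True)   # illustrative URL

if result["success"]:
    print(result["title"])
    print(result["saved_path"])   # set only when save=True and the write succeeded
else:
    print(result["error"])
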
src/web2llm/app/scraper/__init__.py
ADDED
@@ -0,0 +1,7 @@
+"""
+Scraping module.
+"""
+
+from src.web2llm.app.scraper.scraper import WebScraper, scrape_url
+
+__all__ = ['WebScraper', 'scrape_url']
src/web2llm/app/scraper/scraper.py
ADDED
@@ -0,0 +1,475 @@
+"""
+Scraping module for extracting content from web pages.
+"""
+import os
+import logging
+from typing import Dict, Optional, Union, List
+import requests
+from bs4 import BeautifulSoup
+from readability import Document
+from dotenv import load_dotenv
+import re
+
+# Logging configuration
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Load environment variables
+load_dotenv()
+
+# Default configuration
+DEFAULT_USER_AGENT = os.getenv(
+    'USER_AGENT',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+)
+DEFAULT_TIMEOUT = int(os.getenv('REQUEST_TIMEOUT', 30))
+DEFAULT_MAX_RETRIES = int(os.getenv('MAX_RETRIES', 3))
+
+class WebScraper:
+    """Class for scraping web pages and cleaning their content."""
+
+    def __init__(self, user_agent: str = DEFAULT_USER_AGENT,
+                 timeout: int = DEFAULT_TIMEOUT,
+                 max_retries: int = DEFAULT_MAX_RETRIES):
+        """
+        Initializes the scraper.
+
+        Args:
+            user_agent: User-Agent to use for HTTP requests
+            timeout: Request timeout in seconds
+            max_retries: Maximum number of attempts on failure
+        """
+        self.user_agent = user_agent
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": self.user_agent})
+
+    def fetch_url(self, url: str) -> Optional[str]:
+        """
+        Fetches the HTML content of a URL.
+
+        Args:
+            url: The URL to scrape
+
+        Returns:
+            The HTML content, or None on failure
+        """
+        for attempt in range(self.max_retries):
+            try:
+                logger.info(f"Attempt {attempt + 1}/{self.max_retries} to fetch {url}")
+                response = self.session.get(url, timeout=self.timeout)
+                response.raise_for_status()
+
+                # Encoding detection
+                encoding = response.encoding
+
+                # If the site does not specify an encoding, or it is wrong, try to detect it
+                if encoding == 'ISO-8859-1' or not encoding:
+                    detected_encoding = response.apparent_encoding
+                    if detected_encoding:
+                        response.encoding = detected_encoding
+
+                return response.text
+            except requests.RequestException as e:
+                logger.error(f"Error while fetching {url}: {str(e)}")
+                if attempt == self.max_retries - 1:
+                    logger.error(f"Failed after {self.max_retries} attempts.")
+                    return None
+        return None
+
+    def extract_additional_content(self, soup: BeautifulSoup) -> str:
+        """
+        Extracts additional content that might be ignored by Readability.
+
+        Args:
+            soup: BeautifulSoup object containing the HTML page
+
+        Returns:
+            Additional HTML content
+        """
+        additional_html = ""
+
+        # Look for common content sections that might have been missed
+        content_selectors = [
+            'article', '.article', '.post', '.content', '.main-content',
+            'main', '#main', '#content', '.body', '.entry-content',
+            '.page-content', '[role="main"]', '[itemprop="articleBody"]',
+            '.blog-post', '.text', '.publication-content', '.story'
+        ]
+
+        for selector in content_selectors:
+            elements = soup.select(selector)
+            if elements:
+                for element in elements:
+                    additional_html += str(element)
+
+        # If no content was found with the selectors, try other methods
+        if not additional_html:
+            # Collect all paragraphs that have substantial content
+            paragraphs = []
+            for p in soup.find_all('p'):
+                text = p.get_text().strip()
+                # Only consider paragraphs with meaningful content
+                if len(text) > 50:  # paragraphs of at least 50 characters
+                    paragraphs.append(str(p))
+
+            if paragraphs:
+                additional_html = "\n".join(paragraphs)
+
+        return additional_html
+
+    def remove_headers_footers(self, soup: BeautifulSoup) -> BeautifulSoup:
+        """
+        Removes headers, footers, scripts, styles and other unwanted elements from web pages,
+        using a more moderate approach in order to preserve more content.
+
+        Args:
+            soup: The BeautifulSoup object containing the HTML
+
+        Returns:
+            The cleaned BeautifulSoup object
+        """
+        # List of selectors for common headers and footers - trimmed-down version
+        header_selectors = [
+            'header', '#header', '.header', '.site-header',
+            '.masthead', '[role="banner"]'
+        ]
+
+        footer_selectors = [
+            'footer', '#footer', '.footer', '.site-footer',
+            '.copyright', '[role="contentinfo"]'
+        ]
+
+        # Essential selectors for navbars
+        navbar_selectors = [
+            'nav', '.navbar', '.main-nav',
+            '#navbar', '#navigation', '#menu',
+            '[role="navigation"]'
+        ]
+
+        # Essential selectors for sidebars
+        sidebar_selectors = [
+            'aside', '.sidebar', '#sidebar',
+            '[role="complementary"]'
+        ]
+
+        # Most common and most intrusive unwanted elements
+        unwanted_selectors = [
+            '.ads', '.advertisement', '.banner', '.cookie-notice',
+            '.popup', '.modal', '.newsletter-signup',
+            '.cookie-banner', '.adsbygoogle', '.ad-container',
+            '.gdpr'
+        ]
+
+        # Combine all selectors
+        all_selectors = header_selectors + footer_selectors + navbar_selectors + sidebar_selectors + unwanted_selectors
+
+        # Remove all of these elements
+        for selector in all_selectors:
+            for element in soup.select(selector):
+                # Check whether the element contains significant content
+                text_content = element.get_text(strip=True)
+
+                # Skip elements with a lot of textual content
+                # (probably main content that was misclassified)
+                if len(text_content) > 1000 and selector not in ['.ads', '.advertisement', '.cookie-notice', '.popup', '.modal']:
+                    # Do not remove - contains too much content to be just a navigation element
+                    continue
+
+                element.decompose()
+
+        # Remove all scripts
+        for script in soup.find_all('script'):
+            script.decompose()
+
+        # Remove all CSS styles
+        for style in soup.find_all('style'):
+            style.decompose()
+
+        # Remove all noscript tags
+        for noscript in soup.find_all('noscript'):
+            noscript.decompose()
+
+        # Remove all iframes
+        for iframe in soup.find_all('iframe'):
+            iframe.decompose()
+
+        # Remove style, onclick, onload, etc. attributes
+        for tag in soup.find_all(True):
+            # Build a list of attributes to remove
+            attrs_to_remove = []
+            for attr in tag.attrs:
+                # Remove style attributes
+                if attr == 'style':
+                    attrs_to_remove.append(attr)
+                # Remove JavaScript event handlers (onclick, onload, etc.)
+                elif attr.startswith('on'):
+                    attrs_to_remove.append(attr)
+                # Remove classes that could indicate scripts/ads
+                elif attr == 'class':
+                    classes = tag.get('class', [])
+                    if any(cls in ' '.join(classes) for cls in ['js-', 'ad-', 'ads-', 'script-', 'tracking']):
+                        attrs_to_remove.append(attr)
+
+            # Remove the identified attributes
+            for attr in attrs_to_remove:
+                del tag[attr]
+
+        return soup
+
+    def detect_nav_by_content(self, soup: BeautifulSoup) -> None:
+        """
+        Detects and removes navigation elements and sidebars
+        by analysing their content and position, in a less aggressive way.
+
+        Args:
+            soup: The BeautifulSoup object to clean
+        """
+        # 1. Detect elements that contain many links
+        all_divs = soup.find_all(['div', 'section', 'ul', 'ol'])
+        for element in all_divs:
+            links = element.find_all('a')
+
+            # If an element contains many links, it is probably a menu or a sidebar
+            # Threshold raised from 5 to 8 links to be less aggressive
+            if len(links) > 8:
+                # Check whether the links are short (typical of menus)
+                short_links = [link for link in links if len(link.get_text(strip=True)) < 20]
+
+                # Threshold raised from 70% to 85% to be sure it really is a menu
+                if len(short_links) > len(links) * 0.85:
+                    # Check whether it contains substantial informative text
+                    text_content = element.get_text(strip=True)
+                    # If the textual content is substantial relative to the number of links, do not remove it
+                    if len(text_content) > len(links) * 50:  # on average 50 characters of content per link
+                        continue
+                    element.decompose()
+                    continue
+
+            # Check whether it is a list of categories, tags, etc.
+            # Narrower list of terms to be less aggressive
+            list_terms = ['menu', 'navigation', 'liens', 'links']
+
+            # Check the element's text for clues, more strictly
+            element_text = element.get_text().lower()
+            if any(term in element_text for term in list_terms) and len(links) > 4:
+                # Check the proportion of text vs links
+                if len(element_text) < 200:  # only remove small navigation elements
+                    element.decompose()
+                    continue
+
+        # 2. Detect elements by their position (only the first div)
+        main_content = soup.find('body')
+        if main_content:
+            # Only inspect the first direct child of the body (often the navigation)
+            # Reduced from 3 to 1 to be less aggressive
+            children = list(main_content.children)
+            if children and len(children) > 0:
+                child = children[0]
+                if child.name in ['div', 'nav'] and not child.find(['h1', 'h2', 'article', 'p']):
+                    # Check whether this is probably navigation without substantial content
+                    if child.find_all('a', limit=5) and len(child.get_text(strip=True)) < 200:
+                        child.decompose()
+
+            # Only inspect the last direct child of the body (often the footer)
+            # Reduced to just the last child
+            if len(children) > 0:
+                child = children[-1]
+                if child.name in ['div', 'footer'] and not child.find(['h1', 'h2', 'article']):
+                    if 'copyright' in child.get_text().lower() or (
+                            child.find_all('a', limit=3) and len(child.get_text(strip=True)) < 150):
+                        child.decompose()
+
+        # 3. Remove elements that have a very narrow width (sidebars)
+        # Reduced from 40% to 25% to be less aggressive
+        for element in soup.find_all(True):
+            if 'style' in element.attrs:
+                style = element['style'].lower()
+                if 'width' in style:
+                    # Only if the width is very small (less than 25%)
+                    width_match = re.search(r'width\s*:\s*(\d+)%', style)
+                    if width_match and int(width_match.group(1)) < 25:
+                        # Make sure it really is a navigation element
+                        if element.find_all('a', limit=4) and not element.find(['p', 'article']) and len(element.get_text(strip=True)) < 300:
+                            element.decompose()
+
+    def clean_html(self, html_content: str) -> str:
+        """
+        Cleans the HTML using readability-lxml to extract the main content.
+        Less aggressive version that preserves more of the original content.
+
+        Args:
+            html_content: The raw HTML content
+
+        Returns:
+            The cleaned HTML with the main content
+        """
+        try:
+            # Parse the HTML
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Record the length of the original content for analysis
+            original_content_length = len(soup.get_text(strip=True))
+
+            # Remove headers, footers and other unwanted elements
+            soup = self.remove_headers_footers(soup)
+
+            # Record the content length after the first cleaning pass
+            post_header_footer_length = len(soup.get_text(strip=True))
+
+            # If more than 30% of the content has already been lost, skip the advanced detection
+            # which could remove too much content
+            if post_header_footer_length > original_content_length * 0.7:
+                # Advanced detection of navigation elements based on their content
+                self.detect_nav_by_content(soup)
+
+            # Extract the title
+            title = soup.title.string if soup.title else "Untitled"
+
+            # Use Readability to extract the main content
+            doc = Document(html_content)
+            clean_html = doc.summary()
+            readability_title = doc.title()
+
+            # If the Readability title is more informative, use it
+            if readability_title and len(readability_title) > len(title):
+                title = readability_title
+
+            # Parse the HTML cleaned by Readability
+            clean_soup = BeautifulSoup(clean_html, 'html.parser')
+
+            # Record the length of the content extracted by Readability
+            readability_content_length = len(clean_soup.get_text(strip=True))
+
+            # Also clean headers and footers from the content extracted by Readability
+            clean_soup = self.remove_headers_footers(clean_soup)
+
+            # Apply the advanced detection only if the content is substantial
+            # and we do not want to lose too much content
+            if readability_content_length > 1000:
+                self.detect_nav_by_content(clean_soup)
+
+            # Check whether the extracted content is sufficient
+            clean_text = clean_soup.get_text()
+            if len(clean_text) < 500:  # fewer than 500 characters is probably incomplete
+                # Extract additional content
+                additional_content = self.extract_additional_content(soup)
+                if additional_content:
+                    # Add this content to the cleaned HTML
+                    additional_soup = BeautifulSoup(additional_content, 'html.parser')
+
+                    # Clean this additional content as well
+                    additional_soup = self.remove_headers_footers(additional_soup)
+                    self.detect_nav_by_content(additional_soup)
+
+                    # Create a new div element to hold the additional content
+                    div = BeautifulSoup("<div class='additional-content'></div>", 'html.parser')
+                    div_tag = div.div
+
+                    # Append each element of additional content
+                    for element in additional_soup.children:
+                        if element.name:  # ignore text nodes
+                            div_tag.append(element)
+
+                    clean_soup.body.append(div_tag)
+                    clean_html = str(clean_soup)
+
+            # Build clean HTML with the title and the content
+            full_html = f"<html><head><title>{title}</title></head><body><h1>{title}</h1>{clean_html}</body></html>"
+
+            return full_html
+        except Exception as e:
+            logger.error(f"Error while cleaning the HTML: {str(e)}")
+            # On error, return the original HTML
+            return html_content
+
+    def get_text_content(self, html_content: str) -> str:
+        """
+        Extracts the plain text from HTML.
+
+        Args:
+            html_content: The HTML content
+
+        Returns:
+            The extracted text without HTML tags
+        """
+        soup = BeautifulSoup(html_content, 'html.parser')
+
+        # Remove scripts and styles that contain no useful content
+        for script_or_style in soup(['script', 'style', 'meta', 'noscript']):
+            script_or_style.decompose()
+
+        # Get the text with line breaks between elements
+        text = soup.get_text(separator='\n', strip=True)
+
+        # Clean up multiple line breaks
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
+        return text
+
+    def scrape(self, url: str, clean: bool = True, extract_text: bool = False) -> Dict[str, Union[str, None]]:
+        """
+        Scrapes a URL and returns different versions of the content.
+
+        Args:
+            url: The URL to scrape
+            clean: If True, cleans the HTML
+            extract_text: If True, also extracts the plain text
+
+        Returns:
+            Dictionary containing the different forms of the content
+        """
+        result = {
+            "url": url,
+            "raw_html": None,
+            "clean_html": None,
+            "text_content": None,
+            "title": None,
+        }
+
+        # Fetch the HTML
+        html_content = self.fetch_url(url)
+        if not html_content:
+            return result
+
+        result["raw_html"] = html_content
+
+        # Extract the title
+        try:
+            soup = BeautifulSoup(html_content, 'html.parser')
+            result["title"] = soup.title.string.strip() if soup.title else None
+        except Exception as e:
+            logger.error(f"Error while extracting the title: {str(e)}")
+            pass
+
+        # Clean the HTML if requested
+        if clean:
+            result["clean_html"] = self.clean_html(html_content)
+
+        # Extract the text if requested
+        if extract_text:
+            if result["clean_html"]:
+                result["text_content"] = self.get_text_content(result["clean_html"])
+            else:
+                result["text_content"] = self.get_text_content(html_content)
+
+        return result
+
+
+# Convenience function for quick use
+def scrape_url(url: str, clean: bool = True, extract_text: bool = False) -> Dict[str, Union[str, None]]:
+    """
+    Utility function to quickly scrape a URL.
+
+    Args:
+        url: The URL to scrape
+        clean: If True, cleans the HTML
+        extract_text: If True, also extracts the plain text
+
+    Returns:
+        Dictionary containing the different forms of the content
+    """
+    scraper = WebScraper()
+    return scraper.scrape(url, clean, extract_text)
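
For orientation, a minimal sketch of the scrape_url helper defined above. It is not part of the commit; the URL is illustrative, and the import relies on the names exported by the new scraper/__init__.py.

# Illustrative only - not part of this commit.
from src.web2llm.app.scraper import scrape_url

data = scrape_url("https://example.com", clean=True, extract_text=True)   # illustrative URL
print(data["title"])
print((data["text_content"] or "")[:200])   # first 200 characters of the extracted text
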
src/web2llm/app/utils/__init__.py
ADDED
@@ -0,0 +1,3 @@
+"""
+Utilities module for the scraper and the converter.
+"""
tools.json
CHANGED
@@ -17,5 +17,47 @@
         ]
       }
     }
+  },
+  {
+    "type": "function",
+    "function": {
+      "name": "retrieve_knowledge",
+      "description": "Retrieves knowledge from a database with a provided query.",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "query": {
+            "type": "string",
+            "description": "The query to search for in the vector store."
+          },
+          "n_results": {
+            "type": "integer",
+            "description": "The number of results to return. Default is 1."
+          }
+        },
+        "required": [
+          "query"
+        ]
+      }
+    }
+  },
+  {
+    "type": "function",
+    "function": {
+      "name": "visit_webpage",
+      "description": "Visits a webpage at the given URL and reads its content as a markdown string.",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "url": {
+            "type": "string",
+            "description": "The URL of the webpage to visit."
+          }
+        },
+        "required": [
+          "url"
+        ]
+      }
+    }
+  }
 }
 ]
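
For orientation, a minimal sketch of how the tool schemas in tools.json could be passed to the Mistral chat API. This is an assumption about usage, not code from this commit: it presumes the mistralai v1 client, the model name is illustrative, and the agent in this repo may build the schemas via generate_tools_json rather than reading the file directly.

# Illustrative only - not part of this commit.
import json
import os
from mistralai import Mistral

with open("tools.json") as f:
    tools = json.load(f)                      # the list of function schemas shown above

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
response = client.chat.complete(
    model="mistral-large-latest",             # illustrative model name
    messages=[{"role": "user", "content": "Summarise https://example.com"}],
    tools=tools,
    tool_choice="auto",                       # let the model decide whether to call a tool
)
print(response.choices[0].message)
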