Moved to semantic splitting and trafilatura for HTML conversion
search_agent.py  +35 -37
@@ -33,9 +33,10 @@ from bs4 import BeautifulSoup
 from docopt import docopt
 import dotenv
 import pdfplumber
+from trafilatura import extract

 from langchain_core.documents.base import Document
-from …
+from langchain_experimental.text_splitter import SemanticChunker
 from langchain.retrievers.multi_query import MultiQueryRetriever
 from langchain.callbacks import LangChainTracer
 from langchain_groq import ChatGroq
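The two new imports carry the whole commit: trafilatura's extract() takes over HTML-to-text conversion, and SemanticChunker (shipped in the separate langchain_experimental package) takes over text splitting. A minimal sketch of what extract() does, with a made-up HTML string:

from trafilatura import extract

html = "<html><body><nav>Menu</nav><article><p>The actual story text.</p></article></body></html>"
text = extract(html, output_format='txt', include_links=True)
# extract() strips boilerplate such as the <nav> block and returns plain text,
# or None when it finds no extractable content, so callers may want a fallback.
print(text)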
@@ -123,18 +124,19 @@ def get_sources(query, max_pages=10, domain=None):
         return final_results

     except Exception as error:
-        …
+        console.log('Error fetching search results:', error)
         raise



 def fetch_with_timeout(url, timeout=8):
+
     try:
         response = requests.get(url, timeout=timeout)
         response.raise_for_status()
         return response
     except requests.RequestException as error:
-        …
+        console.log(f"Skipping {url}! Error: {error}")
         return None

 def extract_main_content(html):
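Both except blocks now log before acting, so failed searches and skipped URLs show up in the console instead of disappearing silently. Usage is unchanged (the URLs here are illustrative):

page = fetch_with_timeout("https://example.com")         # Response on success
missing = fetch_with_timeout("https://example.com/404")  # logs "Skipping …! Error: …", returns None
# requests.RequestException is the base class for connection errors, timeouts and
# the HTTPError raised by raise_for_status(), so the one handler covers all of them.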
@@ -152,25 +154,29 @@ def process_source(source):
     console.log(f"Processing {source['link']}")
     if response:
         content_type = response.headers.get('Content-Type')
-        if content_type …
-        …
+        if content_type:
+            if content_type.startswith('application/pdf'):
+                # The response is a PDF file
+                pdf_content = response.content
+                # Create a file-like object from the bytes
+                pdf_file = io.BytesIO(pdf_content)
+                # Extract text from PDF using pdfplumber
+                with pdfplumber.open(pdf_file) as pdf:
+                    text = ""
+                    for page in pdf.pages:
+                        text += page.extract_text()
+                return {**source, 'page_content': text}
+            elif content_type.startswith('text/html'):
+                # The response is an HTML file
+                html = response.text
+                main_content = extract(html, output_format='txt', include_links=True)
+                return {**source, 'page_content': main_content}
+            else:
+                console.log(f"Skipping {source['link']}! Unsupported content type: {content_type}")
+                return {**source, 'page_content': source['snippet']}
         else:
-            console.log(f"Skipping {source['link']}! …
-            return
+            console.log(f"Skipping {source['link']}! No content type")
+            return {**source, 'page_content': source['snippet']}
     return None

 def get_links_contents(sources):
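One caveat in the new PDF branch: pdfplumber's page.extract_text() returns None for pages with no text layer (scanned images, for instance), and `text += None` raises a TypeError. A slightly defensive variant of that block, if that case matters here:

with pdfplumber.open(pdf_file) as pdf:
    # `or ""` guards against pages where extract_text() returns None
    text = "\n".join(page.extract_text() or "" for page in pdf.pages)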
@@ -180,26 +186,17 @@ def get_links_contents(sources):
     # Filter out None results
     return [result for result in results if result is not None]

-def vectorize(contents, text_chunk_size=…
+def vectorize(contents, text_chunk_size=400,text_chunk_overlap=40):
     documents = []
     for content in contents:
-        page_content = content['snippet']
-        if 'html' in content:
-            page_content = content['html']
-        if 'pdf_content' in content:
-            page_content = content['pdf_content']
         try:
             metadata = {'title': content['title'], 'source': content['link']}
-            doc = Document(page_content=page_content, metadata=metadata)
+            doc = Document(page_content=content['page_content'], metadata=metadata)
             documents.append(doc)
         except Exception as e:
             console.log(f"[gray]Error processing content for {content['link']}: {e}")
-    text_splitter = …
-        chunk_size=text_chunk_size,
-        chunk_overlap=text_chunk_overlap
-    )
-    docs = text_splitter.split_documents(documents)
+    semantic_chunker = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-large"), breakpoint_threshold_type="percentile")
+    docs = semantic_chunker.split_documents(documents)
     console.log(f"Vectorizing {len(docs)} document chunks")
     embeddings = OpenAIEmbeddings()
     store = FAISS.from_documents(docs, embeddings)
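Unlike the fixed-size splitter it replaces, SemanticChunker embeds the text sentence by sentence and starts a new chunk where the distance between neighbouring sentence embeddings crosses a percentile threshold, so boundaries follow topic shifts rather than a character count; note the text_chunk_size and text_chunk_overlap parameters are now unused. A standalone sketch of the splitter on its own:

from langchain_experimental.text_splitter import SemanticChunker

chunker = SemanticChunker(
    OpenAIEmbeddings(model="text-embedding-3-large"),
    breakpoint_threshold_type="percentile",  # split where distance exceeds the 95th percentile (the default amount)
)
chunks = chunker.split_text("A few sentences on one topic. More of the same. Then an abrupt change of subject.")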
@@ -231,8 +228,9 @@ def multi_query_rag(chat_llm, question, search_query, vectorstore):


 def query_rag(chat_llm, question, search_query, vectorstore):
-    retriver = vectorstore.as_retriever()
-    unique_docs = retriver.get_relevant_documents(search_query, callbacks=callbacks, verbose=True)
+    #retriver = vectorstore.as_retriever()
+    #unique_docs = retriver.get_relevant_documents(search_query, callbacks=callbacks, verbose=True)
+    unique_docs = vectorstore.similarity_search(search_query, k=5)
     context = format_docs(unique_docs)
     prompt = get_rag_prompt_template().format(query=question, context=context)
     response = chat_llm.invoke(prompt, config={"callbacks": callbacks})
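query_rag now queries the vector store directly instead of going through the retriever wrapper, which makes the top-k cut-off explicit. The two styles are close to equivalent:

unique_docs = vectorstore.similarity_search(search_query, k=5)
# roughly the same as the commented-out retriever version with an explicit k:
# retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# unique_docs = retriever.get_relevant_documents(search_query)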
@@ -287,7 +285,7 @@ if __name__ == '__main__':
     vector_store = vectorize(contents)

     with console.status("[bold green]Querying LLM relevant context", spinner='dots8Bit'):
-        respomse = …
+        respomse = query_rag(chat, query, optimize_search_query, vector_store)

     console.rule(f"[bold green]Response from {provider}")
     if output == "text":
|