CyranoB committed
Commit 8d0d362 · 1 Parent(s): 4b97d27

Moved to semantic splitting and trafilatura for HTML conversion

Files changed (1)
  1. search_agent.py +35 -37
search_agent.py CHANGED
@@ -33,9 +33,10 @@ from bs4 import BeautifulSoup
 from docopt import docopt
 import dotenv
 import pdfplumber
+from trafilatura import extract
 
 from langchain_core.documents.base import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_experimental.text_splitter import SemanticChunker
 from langchain.retrievers.multi_query import MultiQueryRetriever
 from langchain.callbacks import LangChainTracer
 from langchain_groq import ChatGroq
@@ -123,18 +124,19 @@ def get_sources(query, max_pages=10, domain=None):
         return final_results
 
     except Exception as error:
-        #console.log('Error fetching search results:', error)
+        console.log('Error fetching search results:', error)
         raise
 
 
 
 def fetch_with_timeout(url, timeout=8):
+
     try:
         response = requests.get(url, timeout=timeout)
         response.raise_for_status()
         return response
     except requests.RequestException as error:
-        #console.log(f"Skipping {url}! Error: {error}")
+        console.log(f"Skipping {url}! Error: {error}")
         return None
 
 def extract_main_content(html):
@@ -152,25 +154,29 @@ def process_source(source):
     console.log(f"Processing {source['link']}")
     if response:
         content_type = response.headers.get('Content-Type')
-        if content_type == 'application/pdf':
-            # The response is a PDF file
-            pdf_content = response.content
-            # Create a file-like object from the bytes
-            pdf_file = io.BytesIO(pdf_content)
-            # Extract text from PDF using pdfplumber
-            with pdfplumber.open(pdf_file) as pdf:
-                text = ""
-                for page in pdf.pages:
-                    text += page.extract_text()
-            return {**source, 'pdf_content': text}
-        elif content_type.startswith('text/html'):
-            # The response is an HTML file
-            html = response.text
-            main_content = extract_main_content(html)
-            return {**source, 'html': main_content}
+        if content_type:
+            if content_type.startswith('application/pdf'):
+                # The response is a PDF file
+                pdf_content = response.content
+                # Create a file-like object from the bytes
+                pdf_file = io.BytesIO(pdf_content)
+                # Extract text from PDF using pdfplumber
+                with pdfplumber.open(pdf_file) as pdf:
+                    text = ""
+                    for page in pdf.pages:
+                        text += page.extract_text()
+                return {**source, 'page_content': text}
+            elif content_type.startswith('text/html'):
+                # The response is an HTML file
+                html = response.text
+                main_content = extract(html, output_format='txt', include_links=True)
+                return {**source, 'page_content': main_content}
+            else:
+                console.log(f"Skipping {source['link']}! Unsupported content type: {content_type}")
+                return {**source, 'page_content': source['snippet']}
         else:
-            console.log(f"Skipping {source['link']}! Unsupported content type: {content_type}")
-            return None
+            console.log(f"Skipping {source['link']}! No content type")
+            return {**source, 'page_content': source['snippet']}
     return None
 
 def get_links_contents(sources):
@@ -180,26 +186,17 @@ def get_links_contents(sources):
     # Filter out None results
    return [result for result in results if result is not None]
 
-def vectorize(contents, text_chunk_size=500,text_chunk_overlap=50):
+def vectorize(contents, text_chunk_size=400,text_chunk_overlap=40):
     documents = []
     for content in contents:
-        page_content = content['snippet']
-        if 'html' in content:
-            page_content = content['html']
-        if 'pdf_content' in content:
-            page_content = content['pdf_content']
         try:
             metadata = {'title': content['title'], 'source': content['link']}
-            doc = Document(page_content=page_content, metadata=metadata)
+            doc = Document(page_content=content['page_content'], metadata=metadata)
             documents.append(doc)
         except Exception as e:
             console.log(f"[gray]Error processing content for {content['link']}: {e}")
-
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=text_chunk_size,
-        chunk_overlap=text_chunk_overlap
-    )
-    docs = text_splitter.split_documents(documents)
+    semantic_chunker = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-large"), breakpoint_threshold_type="percentile")
+    docs = semantic_chunker.split_documents(documents)
     console.log(f"Vectorizing {len(docs)} document chunks")
     embeddings = OpenAIEmbeddings()
     store = FAISS.from_documents(docs, embeddings)
@@ -231,8 +228,9 @@ def multi_query_rag(chat_llm, question, search_query, vectorstore):
 
 
 def query_rag(chat_llm, question, search_query, vectorstore):
-    retriver = vectorstore.as_retriever()
-    unique_docs = retriver.get_relevant_documents(search_query, callbacks=callbacks, verbose=True)
+    #retriver = vectorstore.as_retriever()
+    #unique_docs = retriver.get_relevant_documents(search_query, callbacks=callbacks, verbose=True)
+    unique_docs = vectorstore.similarity_search(search_query, k=5)
     context = format_docs(unique_docs)
     prompt = get_rag_prompt_template().format(query=question, context=context)
     response = chat_llm.invoke(prompt, config={"callbacks": callbacks})
@@ -287,7 +285,7 @@ if __name__ == '__main__':
     vector_store = vectorize(contents)
 
     with console.status("[bold green]Querying LLM relevant context", spinner='dots8Bit'):
-        respomse = multi_query_rag(chat, query, optimize_search_query, vector_store)
+        respomse = query_rag(chat, query, optimize_search_query, vector_store)
 
     console.rule(f"[bold green]Response from {provider}")
     if output == "text":
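For context, a minimal standalone sketch of the pipeline this commit switches to: trafilatura converts fetched HTML to plain text, SemanticChunker splits the text at embedding-distance breakpoints instead of fixed character counts, and FAISS indexes the chunks. The helper name build_page_store and the langchain_openai / langchain_community import paths are assumptions for the sketch (the actual imports sit outside this diff); running it requires OPENAI_API_KEY and a faiss install.

import requests
from trafilatura import extract
from langchain_core.documents.base import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

def build_page_store(url):
    # Fetch the page and reduce the HTML to readable text with trafilatura,
    # as process_source() now does for text/html responses.
    html = requests.get(url, timeout=8).text
    text = extract(html, output_format='txt', include_links=True)

    doc = Document(page_content=text, metadata={'source': url})

    # Semantic splitting: break where the embedding distance between adjacent
    # sentences crosses a percentile threshold, rather than every N characters.
    chunker = SemanticChunker(
        OpenAIEmbeddings(model="text-embedding-3-large"),
        breakpoint_threshold_type="percentile",
    )
    chunks = chunker.split_documents([doc])

    # Index the chunks, mirroring the tail of vectorize().
    return FAISS.from_documents(chunks, OpenAIEmbeddings())

store = build_page_store("https://example.com/article")
print(store.similarity_search("What is the page about?", k=5))

One side effect visible in the diff: the text_chunk_size=400 and text_chunk_overlap=40 parameters remain in the vectorize() signature but are no longer referenced, since SemanticChunker picks its own breakpoints.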