Shreyas094 committed
Commit 5a71f95
1 Parent(s): 1e878de

Update app.py

Files changed (1):
  1. app.py  +66 -10
app.py CHANGED

@@ -25,6 +25,8 @@ import requests
 import random
 import datetime
 from groq import Groq
+import faiss
+import numpy as np
 
 # Automatically get the current year
 current_year = datetime.datetime.now().year
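The two new imports bring in FAISS (Facebook AI Similarity Search) and NumPy. On a CPU-only Space the faiss module is typically provided by the faiss-cpu package (pip install faiss-cpu), so the Space's requirements.txt would need faiss-cpu and numpy entries for this commit to run.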
@@ -56,6 +58,9 @@ groq_client = Groq(api_key=GROQ_API_KEY)
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 
+# Global variable to store the FAISS index
+faiss_index = None
+document_store = []
 
 # Set up a session with retry mechanism
 def requests_retry_session(
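Review note: faiss_index and document_store are module-level globals, so every concurrent chat session of the Gradio app shares one index; each call that rebuilds it (create_or_reset_faiss_index, introduced below) silently discards whatever a parallel request had indexed.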
@@ -418,6 +423,46 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."
 
+def create_or_reset_faiss_index(dimension=384):  # 384 is the dimension for 'all-MiniLM-L6-v2' model
+    global faiss_index
+    faiss_index = faiss.IndexFlatL2(dimension)
+
+def add_documents_to_faiss(documents):
+    global faiss_index, document_store
+
+    # Clear previous documents
+    document_store.clear()
+
+    # Create embeddings for the documents
+    embeddings = []
+    for doc in documents:
+        # Combine title and content for embedding
+        text_to_embed = f"{doc['title']} {doc['content'][:500]}"  # Limit content to first 500 chars for efficiency
+        embedding = embedding_model.encode(text_to_embed)
+        embeddings.append(embedding)
+        document_store.append(doc)
+
+    # Convert to numpy array
+    embeddings_array = np.array(embeddings).astype('float32')
+
+    # Add to FAISS index
+    faiss_index.add(embeddings_array)
+
+def search_similar_documents(query, k=5):
+    global faiss_index, document_store
+
+    # Create query embedding
+    query_embedding = embedding_model.encode(query)
+    query_embedding = np.array([query_embedding]).astype('float32')
+
+    # Search in FAISS index
+    distances, indices = faiss_index.search(query_embedding, k)
+
+    # Retrieve similar documents
+    similar_docs = [document_store[i] for i in indices[0]]
+
+    return similar_docs
+
 def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
     try:
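Taken together, the three new helpers implement a small in-memory retrieval loop: create_or_reset_faiss_index builds an exact (brute-force) L2 index sized for the 384-dimensional MiniLM embeddings, add_documents_to_faiss embeds each document and keeps the originals in document_store, and search_similar_documents embeds the query and returns the k nearest documents. Note that the helpers call embedding_model.encode(...) while the only model initialized above is bound to similarity_model; unless embedding_model is defined elsewhere in app.py (not shown in this diff), those calls would raise a NameError. Below is a minimal self-contained sketch of the same flow, assuming the faiss-cpu and sentence-transformers packages; the sample documents and query are illustrative, not from the app:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Same model the app initializes; it produces 384-dimensional embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Illustrative stand-ins for the scraped search results.
docs = [
    {'title': 'Fed holds rates steady', 'content': 'The central bank left its policy rate unchanged...'},
    {'title': 'Tech earnings roundup', 'content': 'Quarterly results beat analyst estimates...'},
]

# Exact (brute-force) L2 index over 384-dimensional vectors.
index = faiss.IndexFlatL2(384)

# Embed title plus truncated content, mirroring add_documents_to_faiss.
embeddings = model.encode([f"{d['title']} {d['content'][:500]}" for d in docs])
index.add(np.asarray(embeddings, dtype='float32'))

# Embed the query the same way and take the 2 nearest documents.
query_vec = model.encode(['interest rate decision'])
distances, indices = index.search(np.asarray(query_vec, dtype='float32'), 2)
print([docs[i]['title'] for i in indices[0]])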
@@ -566,12 +611,17 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
 
         logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
 
-        # Step 5: Scrape full content for top documents (up to num_results)
-        for doc in reranked_docs[:num_results]:
-            full_content = scrape_full_content(doc['url'], max_chars)
-            doc['full_content'] = full_content
-
-        # Prepare JSON for LLM
+        # After Step 5: Scrape full content for top documents
+        # Create or reset FAISS index
+        create_or_reset_faiss_index()
+
+        # Add documents to FAISS index
+        add_documents_to_faiss(reranked_docs[:num_results])
+
+        # Search for similar documents in the vector DB
+        similar_docs = search_similar_documents(query, k=num_results)
+
+        # Prepare JSON for LLM, now including similar documents from vector DB
         llm_input = {
             "query": query,
             "documents": [
@@ -581,10 +631,17 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                     "summary": doc['summary'],
                     "full_content": doc['full_content']
                 } for doc in reranked_docs[:num_results]
+            ],
+            "similar_documents": [
+                {
+                    "title": doc['title'],
+                    "url": doc['url'],
+                    "content": doc['content'][:500]  # Limit content for brevity
+                } for doc in similar_docs
             ]
         }
 
-        # Step 6: LLM Summarization
+        # Step 6: LLM Summarization (keep as is)
         llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
 
         return llm_summary
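Two review notes on this integration. First, the index is populated with exactly reranked_docs[:num_results] and immediately queried with k=num_results, so similar_documents is the same set of documents re-ordered by embedding distance to the query, not a lookup against any larger corpus. Second, the deleted lines were the only place shown where doc['full_content'] was assigned; unless the scraping loop survives elsewhere in app.py, the "full_content" lookup in the payload would raise a KeyError. For reference, the payload handed to llm_summarize now has this shape (placeholder values only):

llm_input = {
    "query": "example user query",
    "documents": [
        # One entry per reranked document; fields above "summary" are
        # outside the diff context and therefore omitted here.
        {"summary": "...", "full_content": "..."},
    ],
    "similar_documents": [
        # One entry per FAISS hit, content truncated to 500 characters.
        {"title": "...", "url": "...", "content": "..."},
    ],
}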
@@ -593,7 +650,6 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
-
 def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
@@ -611,8 +667,8 @@ def chat_function(message, history, num_results, max_chars, time_range, language
         llm_temperature=llm_temperature,
         model=model
     )
-    
-    yield response
+
+    yield response
 
 iface = gr.ChatInterface(
     chat_function,
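The last hunk re-adds the yield response line (apparently a whitespace-only change), keeping chat_function a generator. gr.ChatInterface accepts generator functions and streams each yielded value to the UI; yielding once simply delivers the complete answer in one chunk. A minimal sketch of the pattern, independent of this app:

import gradio as gr

def chat_fn(message, history):
    # Yielding makes this a generator, which gr.ChatInterface streams;
    # a single final yield returns the whole answer in one piece.
    yield f"Echo: {message}"

gr.ChatInterface(chat_fn).launch()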