SearXNG-WebSearch-Agent

Running

App Files Community

Shreyas094 committed on Oct 8, 2024

Commit

9b298f8

verified ·

1 Parent(s): 0d492ce

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -85

app.py CHANGED Viewed

@@ -25,8 +25,6 @@ import requests
 import random
 import datetime
 from groq import Groq
-import faiss
-import numpy as np
 # Automatically get the current year
 current_year = datetime.datetime.now().year
@@ -58,9 +56,6 @@ groq_client = Groq(api_key=GROQ_API_KEY)
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
-# Global variable to store the FAISS index
-faiss_index = None
-document_store = []
 # Set up a session with retry mechanism
 def requests_retry_session(
@@ -340,26 +335,8 @@ def scrape_full_content(url, max_chars=3000, timeout=5):
         if url.lower().endswith('.pdf'):
             return scrape_pdf_content(url, max_chars, timeout)
-        # Use newspaper for non-PDF content
-        article = Article(url)
-        article.download()
-        article.parse()
-        # Combine title and text
-        content = f"Title: {article.title}\n\n"
-        content += article.text
-        # Add publish date if available
-        if article.publish_date:
-            content += f"\n\nPublish Date: {article.publish_date}"
-        # Add authors if available
-        if article.authors:
-            content += f"\n\nAuthors: {', '.join(article.authors)}"
-        # Add top image URL if available
-        if article.top_image:
-            content += f"\n\nTop Image URL: {article.top_image}"
         # Limit the content to max_chars
         return content[:max_chars] if content else ""
@@ -421,46 +398,6 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."
-def create_or_reset_faiss_index(dimension=384):  # 384 is the dimension for 'all-MiniLM-L6-v2' model
-    global faiss_index
-    faiss_index = faiss.IndexFlatL2(dimension)
-def add_documents_to_faiss(documents):
-    global faiss_index, document_store
-    # Clear previous documents
-    document_store.clear()
-    # Create embeddings for the documents
-    embeddings = []
-    for doc in documents:
-        # Combine title and content for embedding
-        text_to_embed = f"{doc['title']} {doc['content'][:500]}"  # Limit content to first 500 chars for efficiency
-        embedding = embedding_model.encode(text_to_embed)
-        embeddings.append(embedding)
-        document_store.append(doc)
-    # Convert to numpy array
-    embeddings_array = np.array(embeddings).astype('float32')
-    # Add to FAISS index
-    faiss_index.add(embeddings_array)
-def search_similar_documents(query, k=5):
-    global faiss_index, document_store
-    # Create query embedding
-    query_embedding = embedding_model.encode(query)
-    query_embedding = np.array([query_embedding]).astype('float32')
-    # Search in FAISS index
-    distances, indices = faiss_index.search(query_embedding, k)
-    # Retrieve similar documents
-    similar_docs = [document_store[i] for i in indices[0]]
-    return similar_docs
 def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
     try:
@@ -609,17 +546,12 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
-        # After Step 5: Scrape full content for top documents
-        # Create or reset FAISS index
-        create_or_reset_faiss_index()
-        # Add documents to FAISS index
-        add_documents_to_faiss(reranked_docs[:num_results])
-        # Search for similar documents in the vector DB
-        similar_docs = search_similar_documents(query, k=num_results)
-        # Prepare JSON for LLM, now including similar documents from vector DB
         llm_input = {
             "query": query,
             "documents": [
@@ -627,15 +559,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": doc['summary'],
-                    "content": doc.get('content', '')  # Use get() with a default value
                 } for doc in reranked_docs[:num_results]
-            ],
-            "similar_documents": [
-                {
-                    "title": doc['title'],
-                    "url": doc['url'],
-                    "content": doc.get('content', '')[:500]  # Use get() with a default value and limit content for brevity
-                } for doc in similar_docs
             ]
         }
@@ -648,6 +573,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
@@ -665,7 +591,7 @@ def chat_function(message, history, num_results, max_chars, time_range, language
         llm_temperature=llm_temperature,
         model=model
     )
     yield response
 iface = gr.ChatInterface(

 import random
 import datetime
 from groq import Groq
 # Automatically get the current year
 current_year = datetime.datetime.now().year
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 # Set up a session with retry mechanism
 def requests_retry_session(
         if url.lower().endswith('.pdf'):
             return scrape_pdf_content(url, max_chars, timeout)
+        # Use Newspaper3k for non-PDF content
+        content = scrape_with_newspaper(url)
         # Limit the content to max_chars
         return content[:max_chars] if content else ""
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."
 def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
     try:
         logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
+        # Step 5: Scrape full content for top documents (up to num_results)
+        for doc in reranked_docs[:num_results]:
+            full_content = scrape_full_content(doc['url'], max_chars)
+            doc['full_content'] = full_content
+        # Prepare JSON for LLM
         llm_input = {
             "query": query,
             "documents": [
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": doc['summary'],
+                    "full_content": doc['full_content']
                 } for doc in reranked_docs[:num_results]
             ]
         }
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
         llm_temperature=llm_temperature,
         model=model
     )
     yield response
 iface = gr.ChatInterface(