Shreyas094 committed
Commit 23840f9 · verified · 1 Parent(s): 9c1a06a

Update app.py

Files changed (1):
  1. app.py +35 -25

app.py CHANGED
@@ -19,6 +19,10 @@ import inspect
 import logging
 import shutil
 from sentence_transformers import CrossEncoder
+from datetime import datetime
+from dateutil import parser as date_parser
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 
 
 # Set up basic configuration for logging
@@ -274,7 +278,12 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp
 
 def duckduckgo_search(query):
     with DDGS() as ddgs:
-        results = ddgs.text(query, max_results=5)
+        results = list(ddgs.text(query, max_results=10))
+
+    # Add date to results, defaulting to current date if not available
+    for result in results:
+        result['date'] = date_parser.parse(result.get('published', datetime.now().isoformat()))
+
     return results
 
 class CitingSources(BaseModel):
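A note on the new date handling: DDGS text results are not guaranteed to include a `published` field, so `result.get('published', datetime.now().isoformat())` will fall back to the current timestamp for any entry that lacks one, and those results then look maximally recent to the ranker. A minimal standalone sketch of that fallback, using fabricated result dicts rather than real API output:

```python
# Sketch of the date fallback added to duckduckgo_search (fabricated inputs).
from datetime import datetime
from dateutil import parser as date_parser

samples = [
    {"title": "Release notes", "body": "v2.0 shipped", "href": "https://example.com/a",
     "published": "2024-05-01T12:00:00"},                                    # has a parseable date
    {"title": "Old post", "body": "archive", "href": "https://example.com/b"},  # no 'published' key
]

for result in samples:
    # Same expression as in the diff: parse 'published' if present, else "now"
    result["date"] = date_parser.parse(result.get("published", datetime.now().isoformat()))

print([r["date"].date() for r in samples])
# First entry keeps its real date; the second defaults to the current date.
```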
@@ -420,55 +429,56 @@ After writing the document, please provide a list of sources used in your respon
     if not full_response:
         yield "I apologize, but I couldn't generate a response at this time. Please try again later."
 
+def rank_results(query, results):
+    # Sort by date, most recent first
+    results.sort(key=lambda x: x['date'], reverse=True)
+
+    # Calculate relevance scores
+    vectorizer = TfidfVectorizer().fit_transform([query] + [f"{r['title']} {r['body']}" for r in results])
+    relevance_scores = cosine_similarity(vectorizer[0:1], vectorizer[1:])[0]
+
+    # Combine date priority and relevance score
+    for i, result in enumerate(results):
+        days_old = (datetime.now() - result['date']).days
+        date_score = 1 / (days_old + 1)  # Newer articles get higher scores
+        result['combined_score'] = (date_score + relevance_scores[i]) / 2
+
+    # Sort by combined score and return top 3
+    return sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
+
 def create_web_search_vectors(search_results):
     embed = get_embeddings()
 
     documents = []
     for result in search_results:
         if 'body' in result:
-            content = f"{result['title']}\n{result['body']}\nSource: {result['href']}"
-            documents.append(Document(page_content=content, metadata={"source": result['href']}))
+            content = f"{result['title']}\n{result['body']}\nSource: {result['href']}\nDate: {result['date']}"
+            documents.append(Document(page_content=content, metadata={"source": result['href'], "date": result['date']}))
 
     return FAISS.from_documents(documents, embed)
 
-def rerank_web_results(query, documents, top_k=5):
-    # Initialize the cross-encoder model
-    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-
-    # Prepare input pairs for the cross-encoder
-    pairs = [[query, doc.page_content] for doc in documents]
-
-    # Compute relevance scores
-    scores = cross_encoder.predict(pairs)
-
-    # Sort documents by score
-    reranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
-
-    # Return top_k documents
-    return [doc for doc, score in reranked_docs[:top_k]]
-
 def get_response_with_search(query, model, num_calls=3, temperature=0.2):
     search_results = duckduckgo_search(query)
-    web_search_database = create_web_search_vectors(search_results)
+    ranked_results = rank_results(query, search_results)
+    web_search_database = create_web_search_vectors(ranked_results)
 
     if not web_search_database:
         yield "No web search results available. Please try again.", ""
         return
 
-    retriever = web_search_database.as_retriever(search_kwargs={"k": 20})  # Retrieve more documents for reranking
+    retriever = web_search_database.as_retriever(search_kwargs={"k": 3})
     relevant_docs = retriever.get_relevant_documents(query)
 
-    # Rerank the documents
-    reranked_docs = rerank_web_results(query, relevant_docs, top_k=5)
-
     accumulated_response = ""
 
-    for i, doc in enumerate(reranked_docs, 1):
+    for i, doc in enumerate(relevant_docs, 1):
         context = doc.page_content
         source = doc.metadata.get('source', 'Unknown source')
+        date = doc.metadata.get('date', 'Unknown date')
 
         prompt = f"""Using the following context from a web search result:
 {context}
+This information is from {date}.
 You are an expert AI assistant. Write a detailed summary of the information provided in this source that is relevant to the following user request: '{query}'
 Base your summary strictly on the information from this source. Only include information that is directly supported by the given content.
 If any part of the information cannot be verified from this source, clearly state that it could not be confirmed."""
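The retrieval flow changes shape as well: instead of retrieving k=20 candidates and reranking them with cross-encoder/ms-marco-MiniLM-L-6-v2, the commit ranks the raw results before indexing, keeps only the top 3, and retrieves with k=3. A hypothetical driver for the updated generator, assuming app.py's definitions are in scope and that each yield is a (text, sources) pair as in the visible error path; the model string is a placeholder this commit does not define:

```python
# Hypothetical usage sketch, not part of the commit: stream summaries for a query.
# Assumes get_response_with_search from app.py is importable and yields
# (text, sources) pairs; the model identifier below is a placeholder.
query = "latest developments in vector databases"
for text, sources in get_response_with_search(
        query, model="mistralai/Mistral-7B-Instruct-v0.2", num_calls=1, temperature=0.2):
    print(text)
```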