Update app.py
app.py
CHANGED
@@ -19,6 +19,10 @@ import inspect
 import logging
 import shutil
 from sentence_transformers import CrossEncoder
+from datetime import datetime
+from dateutil import parser as date_parser
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity


 # Set up basic configuration for logging
@@ -274,7 +278,12 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp

 def duckduckgo_search(query):
     with DDGS() as ddgs:
-        results = ddgs.text(query, max_results=
+        results = list(ddgs.text(query, max_results=10))
+
+    # Add date to results, defaulting to current date if not available
+    for result in results:
+        result['date'] = date_parser.parse(result.get('published', datetime.now().isoformat()))
+
     return results

 class CitingSources(BaseModel):
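For reference, the date handling added here falls back to the current time whenever a result carries no 'published' field. A minimal sketch of that fallback, using made-up result dicts instead of a live DDGS call (the field values and URLs are invented):

from datetime import datetime
from dateutil import parser as date_parser

# Hypothetical search results; only the first one has a 'published' field.
results = [
    {"title": "Dated article", "body": "body text", "href": "https://example.com/a",
     "published": "2023-05-01T12:00:00"},
    {"title": "No date given", "body": "body text", "href": "https://example.com/b"},
]

for result in results:
    # Same pattern as in duckduckgo_search: a missing date becomes "now".
    result["date"] = date_parser.parse(result.get("published", datetime.now().isoformat()))

print(results[0]["date"])  # 2023-05-01 12:00:00
print(results[1]["date"])  # roughly the current timestamp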
@@ -420,55 +429,56 @@ After writing the document, please provide a list of sources used in your respon
     if not full_response:
         yield "I apologize, but I couldn't generate a response at this time. Please try again later."

+def rank_results(query, results):
+    # Sort by date, most recent first
+    results.sort(key=lambda x: x['date'], reverse=True)
+
+    # Calculate relevance scores
+    vectorizer = TfidfVectorizer().fit_transform([query] + [f"{r['title']} {r['body']}" for r in results])
+    relevance_scores = cosine_similarity(vectorizer[0:1], vectorizer[1:])[0]
+
+    # Combine date priority and relevance score
+    for i, result in enumerate(results):
+        days_old = (datetime.now() - result['date']).days
+        date_score = 1 / (days_old + 1)  # Newer articles get higher scores
+        result['combined_score'] = (date_score + relevance_scores[i]) / 2
+
+    # Sort by combined score and return top 3
+    return sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
+
 def create_web_search_vectors(search_results):
     embed = get_embeddings()

     documents = []
     for result in search_results:
         if 'body' in result:
-            content = f"{result['title']}\n{result['body']}\nSource: {result['href']}"
-            documents.append(Document(page_content=content, metadata={"source": result['href']}))
+            content = f"{result['title']}\n{result['body']}\nSource: {result['href']}\nDate: {result['date']}"
+            documents.append(Document(page_content=content, metadata={"source": result['href'], "date": result['date']}))

     return FAISS.from_documents(documents, embed)

-def rerank_web_results(query, documents, top_k=5):
-    # Initialize the cross-encoder model
-    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-
-    # Prepare input pairs for the cross-encoder
-    pairs = [[query, doc.page_content] for doc in documents]
-
-    # Compute relevance scores
-    scores = cross_encoder.predict(pairs)
-
-    # Sort documents by score
-    reranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
-
-    # Return top_k documents
-    return [doc for doc, score in reranked_docs[:top_k]]
-
 def get_response_with_search(query, model, num_calls=3, temperature=0.2):
     search_results = duckduckgo_search(query)
-
+    ranked_results = rank_results(query, search_results)
+    web_search_database = create_web_search_vectors(ranked_results)

     if not web_search_database:
         yield "No web search results available. Please try again.", ""
         return

-    retriever = web_search_database.as_retriever(search_kwargs={"k":
+    retriever = web_search_database.as_retriever(search_kwargs={"k": 3})
     relevant_docs = retriever.get_relevant_documents(query)

-    # Rerank the documents
-    reranked_docs = rerank_web_results(query, relevant_docs, top_k=5)
-
     accumulated_response = ""

-    for i, doc in enumerate(
+    for i, doc in enumerate(relevant_docs, 1):
         context = doc.page_content
         source = doc.metadata.get('source', 'Unknown source')
+        date = doc.metadata.get('date', 'Unknown date')

         prompt = f"""Using the following context from a web search result:
 {context}
+This information is from {date}.
 You are an expert AI assistant. Write a detailed summary of the information provided in this source that is relevant to the following user request: '{query}'
 Base your summary strictly on the information from this source. Only include information that is directly supported by the given content.
 If any part of the information cannot be verified from this source, clearly state that it could not be confirmed."""
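As a self-contained illustration of the scoring that the new rank_results performs, here is a small sketch run on two invented results (the query text and result fields are made up; only the 'title', 'body', and 'date' keys from the diff are used):

from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

query = "python logging best practices"
results = [
    {"title": "Python logging guide", "body": "How to configure the logging module.",
     "date": datetime.now() - timedelta(days=2)},
    {"title": "Unrelated news", "body": "Stock markets moved today.",
     "date": datetime.now() - timedelta(days=30)},
]

# TF-IDF relevance of each result to the query, as in rank_results.
matrix = TfidfVectorizer().fit_transform([query] + [f"{r['title']} {r['body']}" for r in results])
relevance_scores = cosine_similarity(matrix[0:1], matrix[1:])[0]

# Blend recency and relevance the same way the diff does.
for i, result in enumerate(results):
    days_old = (datetime.now() - result["date"]).days
    date_score = 1 / (days_old + 1)  # newer -> closer to 1
    result["combined_score"] = (date_score + relevance_scores[i]) / 2

for r in sorted(results, key=lambda x: x["combined_score"], reverse=True):
    print(round(r["combined_score"], 3), r["title"])

The recent, on-topic result ranks first; in the actual code the top-ranked results are then passed to create_web_search_vectors before retrieval.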