Shreyas094 committed
Commit 23840f9 · verified · 1 Parent(s): 9c1a06a

Update app.py

Files changed (1):
  1. app.py +35 -25

app.py CHANGED
@@ -19,6 +19,10 @@ import inspect
 import logging
 import shutil
 from sentence_transformers import CrossEncoder
+from datetime import datetime
+from dateutil import parser as date_parser
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 
 
 # Set up basic configuration for logging
@@ -274,7 +278,12 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp
 
 def duckduckgo_search(query):
     with DDGS() as ddgs:
-        results = ddgs.text(query, max_results=5)
+        results = list(ddgs.text(query, max_results=10))
+
+    # Add date to results, defaulting to current date if not available
+    for result in results:
+        result['date'] = date_parser.parse(result.get('published', datetime.now().isoformat()))
+
     return results
 
 class CitingSources(BaseModel):
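A note on the new date handling: DDGS text results are not guaranteed to include a `published` field, so `result.get('published', datetime.now().isoformat())` will fall back to the current timestamp for any entry that lacks one, and those results then look maximally recent to the ranker. A minimal standalone sketch of that fallback, using fabricated result dicts rather than real API output:

```python
# Sketch of the date fallback added to duckduckgo_search (fabricated inputs).
from datetime import datetime
from dateutil import parser as date_parser

samples = [
    {"title": "Release notes", "body": "v2.0 shipped", "href": "https://example.com/a",
     "published": "2024-05-01T12:00:00"},                                    # has a parseable date
    {"title": "Old post", "body": "archive", "href": "https://example.com/b"},  # no 'published' key
]

for result in samples:
    # Same expression as in the diff: parse 'published' if present, else "now"
    result["date"] = date_parser.parse(result.get("published", datetime.now().isoformat()))

print([r["date"].date() for r in samples])
# First entry keeps its real date; the second defaults to the current date.
```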
@@ -420,55 +429,56 @@ After writing the document, please provide a list of sources used in your respon
     if not full_response:
         yield "I apologize, but I couldn't generate a response at this time. Please try again later."
 
+def rank_results(query, results):
+    # Sort by date, most recent first
+    results.sort(key=lambda x: x['date'], reverse=True)
+
+    # Calculate relevance scores
+    vectorizer = TfidfVectorizer().fit_transform([query] + [f"{r['title']} {r['body']}" for r in results])
+    relevance_scores = cosine_similarity(vectorizer[0:1], vectorizer[1:])[0]
+
+    # Combine date priority and relevance score
+    for i, result in enumerate(results):
+        days_old = (datetime.now() - result['date']).days
+        date_score = 1 / (days_old + 1)  # Newer articles get higher scores
+        result['combined_score'] = (date_score + relevance_scores[i]) / 2
+
+    # Sort by combined score and return top 3
+    return sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
+
 def create_web_search_vectors(search_results):
     embed = get_embeddings()
 
     documents = []
     for result in search_results:
         if 'body' in result:
-            content = f"{result['title']}\n{result['body']}\nSource: {result['href']}"
-            documents.append(Document(page_content=content, metadata={"source": result['href']}))
+            content = f"{result['title']}\n{result['body']}\nSource: {result['href']}\nDate: {result['date']}"
+            documents.append(Document(page_content=content, metadata={"source": result['href'], "date": result['date']}))
 
     return FAISS.from_documents(documents, embed)
 
-def rerank_web_results(query, documents, top_k=5):
-    # Initialize the cross-encoder model
-    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
-
-    # Prepare input pairs for the cross-encoder
-    pairs = [[query, doc.page_content] for doc in documents]
-
-    # Compute relevance scores
-    scores = cross_encoder.predict(pairs)
-
-    # Sort documents by score
-    reranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
-
-    # Return top_k documents
-    return [doc for doc, score in reranked_docs[:top_k]]
-
 def get_response_with_search(query, model, num_calls=3, temperature=0.2):
     search_results = duckduckgo_search(query)
-    web_search_database = create_web_search_vectors(search_results)
+    ranked_results = rank_results(query, search_results)
+    web_search_database = create_web_search_vectors(ranked_results)
 
     if not web_search_database:
         yield "No web search results available. Please try again.", ""
         return
 
-    retriever = web_search_database.as_retriever(search_kwargs={"k": 20})  # Retrieve more documents for reranking
+    retriever = web_search_database.as_retriever(search_kwargs={"k": 3})
     relevant_docs = retriever.get_relevant_documents(query)
 
-    # Rerank the documents
-    reranked_docs = rerank_web_results(query, relevant_docs, top_k=5)
-
     accumulated_response = ""
 
-    for i, doc in enumerate(reranked_docs, 1):
+    for i, doc in enumerate(relevant_docs, 1):
         context = doc.page_content
         source = doc.metadata.get('source', 'Unknown source')
+        date = doc.metadata.get('date', 'Unknown date')
 
         prompt = f"""Using the following context from a web search result:
 {context}
+This information is from {date}.
 You are an expert AI assistant. Write a detailed summary of the information provided in this source that is relevant to the following user request: '{query}'
 Base your summary strictly on the information from this source. Only include information that is directly supported by the given content.
 If any part of the information cannot be verified from this source, clearly state that it could not be confirmed."""
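The retrieval flow changes shape as well: instead of retrieving k=20 candidates and reranking them with cross-encoder/ms-marco-MiniLM-L-6-v2, the commit ranks the raw results before indexing, keeps only the top 3, and retrieves with k=3. A hypothetical driver for the updated generator, assuming app.py's definitions are in scope and that each yield is a (text, sources) pair as in the visible error path; the model string is a placeholder this commit does not define:

```python
# Hypothetical usage sketch, not part of the commit: stream summaries for a query.
# Assumes get_response_with_search from app.py is importable and yields
# (text, sources) pairs; the model identifier below is a placeholder.
query = "latest developments in vector databases"
for text, sources in get_response_with_search(
        query, model="mistralai/Mistral-7B-Instruct-v0.2", num_calls=1, temperature=0.2):
    print(text)
```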