Shreyas094 committed
Commit • 5a71f95
Parent(s): 1e878de
Update app.py
app.py CHANGED
@@ -25,6 +25,8 @@ import requests
 import random
 import datetime
 from groq import Groq
+import faiss
+import numpy as np
 
 # Automatically get the current year
 current_year = datetime.datetime.now().year
@@ -56,6 +58,9 @@ groq_client = Groq(api_key=GROQ_API_KEY)
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 
+# Global variables for the FAISS index and its backing document store
+faiss_index = None
+document_store = []
 
 # Set up a session with retry mechanism
 def requests_retry_session(
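
The hard-coded dimension used later in create_or_reset_faiss_index (384) has to match the output size of the model initialized above. A one-line sanity check against the sentence-transformers API, as a sketch rather than part of the commit:

# Assumes similarity_model is the SentenceTransformer loaded above.
dim = similarity_model.get_sentence_embedding_dimension()
assert dim == 384  # all-MiniLM-L6-v2 produces 384-dimensional embeddings
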
@@ -418,6 +423,46 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."
 
+def create_or_reset_faiss_index(dimension=384):  # 384 is the embedding dimension of 'all-MiniLM-L6-v2'
+    global faiss_index
+    faiss_index = faiss.IndexFlatL2(dimension)
+
+def add_documents_to_faiss(documents):
+    global faiss_index, document_store
+
+    # Clear previous documents
+    document_store.clear()
+
+    # Create embeddings for the documents
+    embeddings = []
+    for doc in documents:
+        # Combine title and content for embedding; limit content to the first 500 chars for efficiency
+        text_to_embed = f"{doc['title']} {doc['content'][:500]}"
+        embedding = similarity_model.encode(text_to_embed)
+        embeddings.append(embedding)
+        document_store.append(doc)
+
+    # Convert to a float32 numpy array, as FAISS requires
+    embeddings_array = np.array(embeddings).astype('float32')
+
+    # Add to the FAISS index
+    faiss_index.add(embeddings_array)
+
+def search_similar_documents(query, k=5):
+    global faiss_index, document_store
+
+    # Create the query embedding
+    query_embedding = similarity_model.encode(query)
+    query_embedding = np.array([query_embedding]).astype('float32')
+
+    # Search the FAISS index
+    distances, indices = faiss_index.search(query_embedding, k)
+
+    # Retrieve the matching documents
+    similar_docs = [document_store[i] for i in indices[0]]
+
+    return similar_docs
+
 def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
     try:
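
A minimal usage sketch for the three helpers above, assuming each document dict carries at least the 'title' and 'content' keys the code reads; the toy documents are hypothetical:

# Hypothetical documents; in the app these come from the reranked search results.
docs = [
    {"title": "Fed holds rates steady", "content": "The Federal Reserve left its target rate unchanged..."},
    {"title": "Tech earnings beat estimates", "content": "Quarterly results came in above expectations..."},
]

create_or_reset_faiss_index()         # fresh 384-dimensional IndexFlatL2
add_documents_to_faiss(docs)          # embeds both docs and fills document_store
nearest = search_similar_documents("interest rate decision", k=2)
print([d["title"] for d in nearest])  # titles ordered by L2 distance to the query
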
@@ -566,12 +611,17 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
 
         logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
 
-        # Step 5: Scrape full content for top documents
-
-
-
-
-
+        # After Step 5: Scrape full content for top documents
+        # Create or reset the FAISS index
+        create_or_reset_faiss_index()
+
+        # Add documents to the FAISS index
+        add_documents_to_faiss(reranked_docs[:num_results])
+
+        # Search for similar documents in the vector DB
+        similar_docs = search_similar_documents(query, k=num_results)
+
+        # Prepare JSON for the LLM, now including similar documents from the vector DB
         llm_input = {
             "query": query,
             "documents": [
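
Because the index was just reset and filled with at most num_results documents, querying it with k=num_results largely reorders that same set by L2 distance. If k ever exceeds faiss_index.ntotal, FAISS pads the result with index -1, and document_store[i] would then silently return the wrong entry; a hypothetical guard (not in the commit) could look like:

def search_similar_documents_safe(query, k=5):
    # Clamp k to the number of vectors actually stored in the index.
    k = min(k, faiss_index.ntotal)
    if k == 0:
        return []
    return search_similar_documents(query, k)
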
@@ -581,10 +631,17 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                     "summary": doc['summary'],
                     "full_content": doc['full_content']
                 } for doc in reranked_docs[:num_results]
+            ],
+            "similar_documents": [
+                {
+                    "title": doc['title'],
+                    "url": doc['url'],
+                    "content": doc['content'][:500]  # Limit content for brevity
+                } for doc in similar_docs
             ]
         }
 
-        # Step 6: LLM Summarization
+        # Step 6: LLM Summarization (keep as is)
         llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
 
         return llm_summary
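
For reference, the payload serialized for llm_summarize now has roughly this shape; the values are illustrative, and the documents entries may carry more keys than the two visible in this hunk:

llm_input = {
    "query": "latest Fed rate decision",
    "documents": [
        {"summary": "...", "full_content": "..."},         # top reranked results
    ],
    "similar_documents": [
        {"title": "...", "url": "...", "content": "..."},  # FAISS neighbours, content capped at 500 chars
    ],
}
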
@@ -593,7 +650,6 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
-
 def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
@@ -611,8 +667,8 @@ def chat_function(message, history, num_results, max_chars, time_range, language
         llm_temperature=llm_temperature,
         model=model
     )
-
-
+
+    yield response
 
 iface = gr.ChatInterface(
     chat_function,
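
With the trailing yield response, chat_function becomes a generator, and gr.ChatInterface renders each yielded value as the (possibly streaming) reply. A minimal stand-in showing the contract, with a hypothetical echo function in place of the real pipeline:

def echo_chat(message, history):
    # Stand-in for the real search_and_scrape call.
    response = f"echo: {message}"
    yield response

demo = gr.ChatInterface(echo_chat)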