Shreyas094
committed on
Commit
•
9b298f8
1
Parent(s):
0d492ce
Update app.py
Browse files
app.py
CHANGED
@@ -25,8 +25,6 @@ import requests
|
|
25 |
import random
|
26 |
import datetime
|
27 |
from groq import Groq
|
28 |
-
import faiss
|
29 |
-
import numpy as np
|
30 |
|
31 |
# Automatically get the current year
|
32 |
current_year = datetime.datetime.now().year
|
@@ -58,9 +56,6 @@ groq_client = Groq(api_key=GROQ_API_KEY)
|
|
58 |
# Initialize the similarity model
|
59 |
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
|
60 |
|
61 |
-
# Global variable to store the FAISS index
|
62 |
-
faiss_index = None
|
63 |
-
document_store = []
|
64 |
|
65 |
# Set up a session with retry mechanism
|
66 |
def requests_retry_session(
|
@@ -340,26 +335,8 @@ def scrape_full_content(url, max_chars=3000, timeout=5):
|
|
340 |
if url.lower().endswith('.pdf'):
|
341 |
return scrape_pdf_content(url, max_chars, timeout)
|
342 |
|
343 |
-
# Use
|
344 |
-
|
345 |
-
article.download()
|
346 |
-
article.parse()
|
347 |
-
|
348 |
-
# Combine title and text
|
349 |
-
content = f"Title: {article.title}\n\n"
|
350 |
-
content += article.text
|
351 |
-
|
352 |
-
# Add publish date if available
|
353 |
-
if article.publish_date:
|
354 |
-
content += f"\n\nPublish Date: {article.publish_date}"
|
355 |
-
|
356 |
-
# Add authors if available
|
357 |
-
if article.authors:
|
358 |
-
content += f"\n\nAuthors: {', '.join(article.authors)}"
|
359 |
-
|
360 |
-
# Add top image URL if available
|
361 |
-
if article.top_image:
|
362 |
-
content += f"\n\nTop Image URL: {article.top_image}"
|
363 |
|
364 |
# Limit the content to max_chars
|
365 |
return content[:max_chars] if content else ""
|
@@ -421,46 +398,6 @@ Your response should be detailed, informative, accurate, and directly relevant t
|
|
421 |
logger.error(f"Error in LLM summarization: {e}")
|
422 |
return "Error: Unable to generate a summary. Please try again."
|
423 |
|
424 |
-
def create_or_reset_faiss_index(dimension=384): # 384 is the dimension for 'all-MiniLM-L6-v2' model
|
425 |
-
global faiss_index
|
426 |
-
faiss_index = faiss.IndexFlatL2(dimension)
|
427 |
-
|
428 |
-
def add_documents_to_faiss(documents):
|
429 |
-
global faiss_index, document_store
|
430 |
-
|
431 |
-
# Clear previous documents
|
432 |
-
document_store.clear()
|
433 |
-
|
434 |
-
# Create embeddings for the documents
|
435 |
-
embeddings = []
|
436 |
-
for doc in documents:
|
437 |
-
# Combine title and content for embedding
|
438 |
-
text_to_embed = f"{doc['title']} {doc['content'][:500]}" # Limit content to first 500 chars for efficiency
|
439 |
-
embedding = embedding_model.encode(text_to_embed)
|
440 |
-
embeddings.append(embedding)
|
441 |
-
document_store.append(doc)
|
442 |
-
|
443 |
-
# Convert to numpy array
|
444 |
-
embeddings_array = np.array(embeddings).astype('float32')
|
445 |
-
|
446 |
-
# Add to FAISS index
|
447 |
-
faiss_index.add(embeddings_array)
|
448 |
-
|
449 |
-
def search_similar_documents(query, k=5):
|
450 |
-
global faiss_index, document_store
|
451 |
-
|
452 |
-
# Create query embedding
|
453 |
-
query_embedding = embedding_model.encode(query)
|
454 |
-
query_embedding = np.array([query_embedding]).astype('float32')
|
455 |
-
|
456 |
-
# Search in FAISS index
|
457 |
-
distances, indices = faiss_index.search(query_embedding, k)
|
458 |
-
|
459 |
-
# Retrieve similar documents
|
460 |
-
similar_docs = [document_store[i] for i in indices[0]]
|
461 |
-
|
462 |
-
return similar_docs
|
463 |
-
|
464 |
def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
|
465 |
engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
|
466 |
try:
|
@@ -609,17 +546,12 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
|
|
609 |
|
610 |
logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
|
611 |
|
612 |
-
#
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
# Search for similar documents in the vector DB
|
620 |
-
similar_docs = search_similar_documents(query, k=num_results)
|
621 |
-
|
622 |
-
# Prepare JSON for LLM, now including similar documents from vector DB
|
623 |
llm_input = {
|
624 |
"query": query,
|
625 |
"documents": [
|
@@ -627,15 +559,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
|
|
627 |
"title": doc['title'],
|
628 |
"url": doc['url'],
|
629 |
"summary": doc['summary'],
|
630 |
-
"
|
631 |
} for doc in reranked_docs[:num_results]
|
632 |
-
],
|
633 |
-
"similar_documents": [
|
634 |
-
{
|
635 |
-
"title": doc['title'],
|
636 |
-
"url": doc['url'],
|
637 |
-
"content": doc.get('content', '')[:500] # Use get() with a default value and limit content for brevity
|
638 |
-
} for doc in similar_docs
|
639 |
]
|
640 |
}
|
641 |
|
@@ -648,6 +573,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
|
|
648 |
logger.error(f"Unexpected error in search_and_scrape: {e}")
|
649 |
return f"An unexpected error occurred during the search and scrape process: {e}"
|
650 |
|
|
|
651 |
def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
|
652 |
chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
|
653 |
|
@@ -665,7 +591,7 @@ def chat_function(message, history, num_results, max_chars, time_range, language
|
|
665 |
llm_temperature=llm_temperature,
|
666 |
model=model
|
667 |
)
|
668 |
-
|
669 |
yield response
|
670 |
|
671 |
iface = gr.ChatInterface(
|
|
|
25 |
import random
|
26 |
import datetime
|
27 |
from groq import Groq
|
|
|
|
|
28 |
|
29 |
# Automatically get the current year
|
30 |
current_year = datetime.datetime.now().year
|
|
|
56 |
# Initialize the similarity model
|
57 |
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
|
58 |
|
|
|
|
|
|
|
59 |
|
60 |
# Set up a session with retry mechanism
|
61 |
def requests_retry_session(
|
|
|
335 |
if url.lower().endswith('.pdf'):
|
336 |
return scrape_pdf_content(url, max_chars, timeout)
|
337 |
|
338 |
+
# Use Newspaper3k for non-PDF content
|
339 |
+
content = scrape_with_newspaper(url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
|
341 |
# Limit the content to max_chars
|
342 |
return content[:max_chars] if content else ""
|
|
|
398 |
logger.error(f"Error in LLM summarization: {e}")
|
399 |
return "Error: Unable to generate a summary. Please try again."
|
400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
|
402 |
engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
|
403 |
try:
|
|
|
546 |
|
547 |
logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
|
548 |
|
549 |
+
# Step 5: Scrape full content for top documents (up to num_results)
|
550 |
+
for doc in reranked_docs[:num_results]:
|
551 |
+
full_content = scrape_full_content(doc['url'], max_chars)
|
552 |
+
doc['full_content'] = full_content
|
553 |
+
|
554 |
+
# Prepare JSON for LLM
|
|
|
|
|
|
|
|
|
|
|
555 |
llm_input = {
|
556 |
"query": query,
|
557 |
"documents": [
|
|
|
559 |
"title": doc['title'],
|
560 |
"url": doc['url'],
|
561 |
"summary": doc['summary'],
|
562 |
+
"full_content": doc['full_content']
|
563 |
} for doc in reranked_docs[:num_results]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
564 |
]
|
565 |
}
|
566 |
|
|
|
573 |
logger.error(f"Unexpected error in search_and_scrape: {e}")
|
574 |
return f"An unexpected error occurred during the search and scrape process: {e}"
|
575 |
|
576 |
+
|
577 |
def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
|
578 |
chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
|
579 |
|
|
|
591 |
llm_temperature=llm_temperature,
|
592 |
model=model
|
593 |
)
|
594 |
+
|
595 |
yield response
|
596 |
|
597 |
iface = gr.ChatInterface(
|