Shreyas094 committed on
Commit
9b298f8
1 Parent(s): 0d492ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -85
app.py CHANGED
@@ -25,8 +25,6 @@ import requests
25
  import random
26
  import datetime
27
  from groq import Groq
28
- import faiss
29
- import numpy as np
30
 
31
  # Automatically get the current year
32
  current_year = datetime.datetime.now().year
@@ -58,9 +56,6 @@ groq_client = Groq(api_key=GROQ_API_KEY)
58
  # Initialize the similarity model
59
  similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
60
 
61
- # Global variable to store the FAISS index
62
- faiss_index = None
63
- document_store = []
64
 
65
  # Set up a session with retry mechanism
66
  def requests_retry_session(
@@ -340,26 +335,8 @@ def scrape_full_content(url, max_chars=3000, timeout=5):
340
  if url.lower().endswith('.pdf'):
341
  return scrape_pdf_content(url, max_chars, timeout)
342
 
343
- # Use newspaper for non-PDF content
344
- article = Article(url)
345
- article.download()
346
- article.parse()
347
-
348
- # Combine title and text
349
- content = f"Title: {article.title}\n\n"
350
- content += article.text
351
-
352
- # Add publish date if available
353
- if article.publish_date:
354
- content += f"\n\nPublish Date: {article.publish_date}"
355
-
356
- # Add authors if available
357
- if article.authors:
358
- content += f"\n\nAuthors: {', '.join(article.authors)}"
359
-
360
- # Add top image URL if available
361
- if article.top_image:
362
- content += f"\n\nTop Image URL: {article.top_image}"
363
 
364
  # Limit the content to max_chars
365
  return content[:max_chars] if content else ""
@@ -421,46 +398,6 @@ Your response should be detailed, informative, accurate, and directly relevant t
421
  logger.error(f"Error in LLM summarization: {e}")
422
  return "Error: Unable to generate a summary. Please try again."
423
 
424
- def create_or_reset_faiss_index(dimension=384): # 384 is the dimension for 'all-MiniLM-L6-v2' model
425
- global faiss_index
426
- faiss_index = faiss.IndexFlatL2(dimension)
427
-
428
- def add_documents_to_faiss(documents):
429
- global faiss_index, document_store
430
-
431
- # Clear previous documents
432
- document_store.clear()
433
-
434
- # Create embeddings for the documents
435
- embeddings = []
436
- for doc in documents:
437
- # Combine title and content for embedding
438
- text_to_embed = f"{doc['title']} {doc['content'][:500]}" # Limit content to first 500 chars for efficiency
439
- embedding = embedding_model.encode(text_to_embed)
440
- embeddings.append(embedding)
441
- document_store.append(doc)
442
-
443
- # Convert to numpy array
444
- embeddings_array = np.array(embeddings).astype('float32')
445
-
446
- # Add to FAISS index
447
- faiss_index.add(embeddings_array)
448
-
449
- def search_similar_documents(query, k=5):
450
- global faiss_index, document_store
451
-
452
- # Create query embedding
453
- query_embedding = embedding_model.encode(query)
454
- query_embedding = np.array([query_embedding]).astype('float32')
455
-
456
- # Search in FAISS index
457
- distances, indices = faiss_index.search(query_embedding, k)
458
-
459
- # Retrieve similar documents
460
- similar_docs = [document_store[i] for i in indices[0]]
461
-
462
- return similar_docs
463
-
464
  def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
465
  engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
466
  try:
@@ -609,17 +546,12 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
609
 
610
  logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
611
 
612
- # After Step 5: Scrape full content for top documents
613
- # Create or reset FAISS index
614
- create_or_reset_faiss_index()
615
-
616
- # Add documents to FAISS index
617
- add_documents_to_faiss(reranked_docs[:num_results])
618
-
619
- # Search for similar documents in the vector DB
620
- similar_docs = search_similar_documents(query, k=num_results)
621
-
622
- # Prepare JSON for LLM, now including similar documents from vector DB
623
  llm_input = {
624
  "query": query,
625
  "documents": [
@@ -627,15 +559,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
627
  "title": doc['title'],
628
  "url": doc['url'],
629
  "summary": doc['summary'],
630
- "content": doc.get('content', '') # Use get() with a default value
631
  } for doc in reranked_docs[:num_results]
632
- ],
633
- "similar_documents": [
634
- {
635
- "title": doc['title'],
636
- "url": doc['url'],
637
- "content": doc.get('content', '')[:500] # Use get() with a default value and limit content for brevity
638
- } for doc in similar_docs
639
  ]
640
  }
641
 
@@ -648,6 +573,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
648
  logger.error(f"Unexpected error in search_and_scrape: {e}")
649
  return f"An unexpected error occurred during the search and scrape process: {e}"
650
 
 
651
  def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
652
  chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
653
 
@@ -665,7 +591,7 @@ def chat_function(message, history, num_results, max_chars, time_range, language
665
  llm_temperature=llm_temperature,
666
  model=model
667
  )
668
-
669
  yield response
670
 
671
  iface = gr.ChatInterface(
 
25
  import random
26
  import datetime
27
  from groq import Groq
 
 
28
 
29
  # Automatically get the current year
30
  current_year = datetime.datetime.now().year
 
56
  # Initialize the similarity model
57
  similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
58
 
 
 
 
59
 
60
  # Set up a session with retry mechanism
61
  def requests_retry_session(
 
335
  if url.lower().endswith('.pdf'):
336
  return scrape_pdf_content(url, max_chars, timeout)
337
 
338
+ # Use Newspaper3k for non-PDF content
339
+ content = scrape_with_newspaper(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
  # Limit the content to max_chars
342
  return content[:max_chars] if content else ""
 
398
  logger.error(f"Error in LLM summarization: {e}")
399
  return "Error: Unable to generate a summary. Please try again."
400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
402
  engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
403
  try:
 
546
 
547
  logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
548
 
549
+ # Step 5: Scrape full content for top documents (up to num_results)
550
+ for doc in reranked_docs[:num_results]:
551
+ full_content = scrape_full_content(doc['url'], max_chars)
552
+ doc['full_content'] = full_content
553
+
554
+ # Prepare JSON for LLM
 
 
 
 
 
555
  llm_input = {
556
  "query": query,
557
  "documents": [
 
559
  "title": doc['title'],
560
  "url": doc['url'],
561
  "summary": doc['summary'],
562
+ "full_content": doc['full_content']
563
  } for doc in reranked_docs[:num_results]
 
 
 
 
 
 
 
564
  ]
565
  }
566
 
 
573
  logger.error(f"Unexpected error in search_and_scrape: {e}")
574
  return f"An unexpected error occurred during the search and scrape process: {e}"
575
 
576
+
577
  def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
578
  chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
579
 
 
591
  llm_temperature=llm_temperature,
592
  model=model
593
  )
594
+
595
  yield response
596
 
597
  iface = gr.ChatInterface(