Shreyas094 committed
Commit 0ccfbeb · verified · 1 Parent(s): 4c65765

Update app.py

Files changed (1)
  1. app.py +22 -574
app.py CHANGED
@@ -5,7 +5,6 @@ import gradio as gr
5
  import pandas as pd
6
  import requests
7
  import random
8
- import feedparser
9
  import urllib.parse
10
  from tempfile import NamedTemporaryFile
11
  from typing import List
@@ -14,103 +13,19 @@ from langchain.prompts import PromptTemplate
14
  from langchain.chains import LLMChain
15
  from langchain_core.prompts import ChatPromptTemplate
16
  from langchain_community.vectorstores import FAISS
17
- from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
18
  from langchain_core.output_parsers import StrOutputParser
19
  from langchain_community.embeddings import HuggingFaceEmbeddings
20
- from langchain_text_splitters import RecursiveCharacterTextSplitter
21
  from langchain_community.llms import HuggingFaceHub
22
- from langchain_core.runnables import RunnableParallel, RunnablePassthrough
23
- from langchain_core.documents import Document
24
- from sklearn.feature_extraction.text import TfidfVectorizer
25
- from sklearn.metrics.pairwise import cosine_similarity
26
- from openpyxl import load_workbook
27
- from openpyxl.utils.dataframe import dataframe_to_rows
28
- import camelot
29
 
30
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
31
 
32
- # Memory database to store question-answer pairs
33
- memory_database = {}
34
- conversation_history = []
35
- news_database = []
36
-
37
- def load_and_split_document_basic(file):
38
  """Loads and splits the document into pages."""
39
  loader = PyPDFLoader(file.name)
40
- data = loader.load_and_split()
41
- return data
42
-
43
- def load_and_split_document_recursive(file: NamedTemporaryFile) -> List[Document]:
44
- """Loads and splits the document into chunks."""
45
- loader = PyPDFLoader(file.name)
46
- pages = loader.load()
47
-
48
- text_splitter = RecursiveCharacterTextSplitter(
49
- chunk_size=1000,
50
- chunk_overlap=200,
51
- length_function=len,
52
- )
53
-
54
- chunks = text_splitter.split_documents(pages)
55
- return chunks
56
-
57
- def load_and_split_document_basic(file: NamedTemporaryFile, parser: str) -> List[Document]:
58
- """Loads and splits the document into pages."""
59
- if parser == "PyPDF":
60
- loader = PyPDFLoader(file.name)
61
- elif parser == "PDFMiner":
62
- loader = PDFMinerLoader(file.name)
63
- elif parser == "Camelot":
64
- return load_and_split_document_camelot(file)
65
- else:
66
- raise ValueError(f"Unknown parser: {parser}")
67
-
68
  return loader.load_and_split()
69
 
70
- def load_and_split_document_recursive(file: NamedTemporaryFile, parser: str) -> List[Document]:
71
- """Loads and splits the document into chunks using recursive character text splitter."""
72
- if parser == "PyPDF":
73
- loader = PyPDFLoader(file.name)
74
- elif parser == "PDFMiner":
75
- loader = PDFMinerLoader(file.name)
76
- elif parser == "Camelot":
77
- return load_and_split_document_camelot(file)
78
- else:
79
- raise ValueError(f"Unknown parser: {parser}")
80
-
81
- pages = loader.load()
82
-
83
- text_splitter = RecursiveCharacterTextSplitter(
84
- chunk_size=1000,
85
- chunk_overlap=200,
86
- length_function=len,
87
- )
88
-
89
- chunks = text_splitter.split_documents(pages)
90
- return chunks
91
-
92
- def load_and_split_document_camelot(file: NamedTemporaryFile) -> List[Document]:
93
- """Loads and splits the document using Camelot for tables and charts."""
94
- tables = camelot.read_pdf(file.name, pages='all')
95
- documents = []
96
-
97
- for i, table in enumerate(tables):
98
- df = table.df
99
- content = df.to_string(index=False)
100
- documents.append(Document(page_content=content, metadata={"source": file.name, "table_number": i+1}))
101
-
102
- return documents
103
-
104
- def load_document(file: NamedTemporaryFile, parser: str, use_recursive_splitter: bool) -> List[Document]:
105
- """Loads the document using the specified parser and splitting method."""
106
- if parser == "Camelot":
107
- return load_and_split_document_camelot(file)
108
- elif use_recursive_splitter:
109
- return load_and_split_document_recursive(file, parser)
110
- else:
111
- return load_and_split_document_basic(file, parser)
112
-
113
- def update_vectors(files, use_recursive_splitter, selected_parser):
114
  if not files:
115
  return "Please upload at least one PDF file."
116
 
@@ -119,7 +34,7 @@ def update_vectors(files, use_recursive_splitter, selected_parser):
119
 
120
  all_data = []
121
  for file in files:
122
- data = load_document(file, selected_parser, use_recursive_splitter)
123
  all_data.extend(data)
124
  total_chunks += len(data)
125
 
@@ -131,20 +46,11 @@ def update_vectors(files, use_recursive_splitter, selected_parser):
131
 
132
  database.save_local("faiss_database")
133
 
134
- splitting_method = "recursive splitting" if use_recursive_splitter else "page-by-page splitting"
135
- return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {selected_parser} parser with {splitting_method}."
136
 
137
  def get_embeddings():
138
  return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
139
 
140
- def create_or_update_database(data, embeddings):
141
- if os.path.exists("faiss_database"):
142
- db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
143
- db.add_documents(data)
144
- else:
145
- db = FAISS.from_documents(data, embeddings)
146
- db.save_local("faiss_database")
147
-
148
  def clear_cache():
149
  if os.path.exists("faiss_database"):
150
  os.remove("faiss_database")
@@ -152,28 +58,6 @@ def clear_cache():
152
  else:
153
  return "No cache to clear."
154
 
155
- def get_similarity(text1, text2):
156
- vectorizer = TfidfVectorizer().fit_transform([text1, text2])
157
- return cosine_similarity(vectorizer[0:1], vectorizer[1:2])[0][0]
158
-
159
- prompt = """
160
- Answer the question based on the following information:
161
-
162
- Conversation History:
163
- {history}
164
-
165
- Context from documents:
166
- {context}
167
-
168
- Current Question: {question}
169
-
170
- If the question is referring to the conversation history, use that information to answer.
171
- If the question is not related to the conversation history, use the context from documents to answer.
172
- If you don't have enough information to answer, say so.
173
-
174
- Provide a concise and direct answer to the question:
175
- """
176
-
177
  def get_model(temperature, top_p, repetition_penalty):
178
  return HuggingFaceHub(
179
  repo_id="mistralai/Mistral-7B-Instruct-v0.3",
@@ -197,23 +81,10 @@ def generate_chunked_response(model, prompt, max_tokens=1000, max_chunks=5):
197
  full_response += chunk
198
  return full_response.strip()
199
 
200
- def manage_conversation_history(question, answer, history, max_history=5):
201
- history.append({"question": question, "answer": answer})
202
- if len(history) > max_history:
203
- history.pop(0)
204
- return history
205
-
206
- def is_related_to_history(question, history, threshold=0.5): # Increased threshold from 0.3 to 0.5
207
- if not history:
208
- return False
209
- history_text = " ".join([f"{h['question']} {h['answer']}" for h in history])
210
- similarity = get_similarity(question, history_text)
211
- return similarity > threshold
212
-
213
  def extract_text_from_webpage(html):
214
  soup = BeautifulSoup(html, 'html.parser')
215
  for script in soup(["script", "style"]):
216
- script.extract() # Remove scripts and styles
217
  text = soup.get_text()
218
  lines = (line.strip() for line in text.splitlines())
219
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
@@ -233,7 +104,7 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
233
  escaped_term = urllib.parse.quote_plus(term)
234
  start = 0
235
  all_results = []
236
- max_chars_per_page = 8000 # Limit the number of characters from each webpage to stay under the token limit
237
 
238
  print(f"Starting Google search for term: '{term}'")
239
 
@@ -292,338 +163,13 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
292
  start += len(result_block)
293
 
294
  print(f"Search completed. Total results: {len(all_results)}")
295
- print("Search results:")
296
- for i, result in enumerate(all_results, 1):
297
- print(f"Result {i}:")
298
- print(f" Link: {result['link']}")
299
- if result['text']:
300
- print(f" Text: {result['text'][:100]}...") # Print first 100 characters
301
- else:
302
- print(" Text: None")
303
- print("End of search results")
304
-
305
  if not all_results:
306
  print("No search results found. Returning a default message.")
307
  return [{"link": None, "text": "No information found in the web search results."}]
308
 
309
  return all_results
310
 
311
- def fetch_google_news_rss(query, num_results=10):
312
- base_url = "https://news.google.com/rss/search"
313
- params = {
314
- "q": query,
315
- "hl": "en-US",
316
- "gl": "US",
317
- "ceid": "US:en"
318
- }
319
- url = f"{base_url}?{urllib.parse.urlencode(params)}"
320
-
321
- try:
322
- feed = feedparser.parse(url)
323
- articles = []
324
-
325
- for entry in feed.entries[:num_results]:
326
- article = {
327
- "published_date": entry.get("published", "N/A"),
328
- "title": entry.get("title", "N/A"),
329
- "url": entry.get("link", "N/A"),
330
- "content": entry.get("summary", "N/A")
331
- }
332
- articles.append(article)
333
-
334
- return articles
335
- except Exception as e:
336
- print(f"Error fetching news: {str(e)}")
337
- return []
338
-
339
- def summarize_news_content(content, model):
340
- prompt_template = """
341
- Summarize the following news article in a concise manner:
342
- {content}
343
-
344
- Summary:
345
- """
346
- prompt = ChatPromptTemplate.from_template(prompt_template)
347
- formatted_prompt = prompt.format(content=content)
348
- full_response = generate_chunked_response(model, formatted_prompt, max_tokens=200)
349
-
350
- # Extract only the summary part
351
- summary_parts = full_response.split("Summary:")
352
- if len(summary_parts) > 1:
353
- summary = summary_parts[-1].strip()
354
- else:
355
- summary = full_response.strip()
356
-
357
- # Create a cleaned version of the summary
358
- lines = summary.split('\n')
359
- cleaned_lines = [line for line in lines if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
360
- cleaned_summary = ' '.join(cleaned_lines).strip()
361
-
362
- return summary, cleaned_summary
363
-
364
- def process_news(query, temperature, top_p, repetition_penalty, news_source):
365
- model = get_model(temperature, top_p, repetition_penalty)
366
- embed = get_embeddings()
367
-
368
- if news_source in website_configs:
369
- articles = fetch_news_from_website(news_source)
370
- else:
371
- return f"Invalid news source selected: {news_source}"
372
-
373
- if not articles:
374
- return f"No news articles found for {news_source}."
375
-
376
- processed_articles = []
377
-
378
- for article in articles:
379
- try:
380
- # Remove HTML tags from content
381
- clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
382
-
383
- # If content is very short, use the title as content
384
- if len(clean_content) < 50:
385
- clean_content = article["title"]
386
-
387
- full_summary, cleaned_summary = summarize_news_content(clean_content, model)
388
- relevance_score = calculate_relevance_score(cleaned_summary, model)
389
-
390
- processed_article = {
391
- "published_date": article["published_date"],
392
- "title": article["title"],
393
- "url": article["url"],
394
- "content": clean_content,
395
- "summary": full_summary,
396
- "cleaned_summary": cleaned_summary,
397
- "relevance_score": relevance_score
398
- }
399
- processed_articles.append(processed_article)
400
- except Exception as e:
401
- print(f"Error processing article: {str(e)}")
402
-
403
- if not processed_articles:
404
- return f"Failed to process any news articles from {news_source}. Please try again or check the summarization process."
405
-
406
- # Add processed articles to the database
407
- docs = [Document(page_content=article["cleaned_summary"], metadata={
408
- "source": article["url"],
409
- "title": article["title"],
410
- "published_date": article["published_date"],
411
- "relevance_score": article["relevance_score"]
412
- }) for article in processed_articles]
413
-
414
- try:
415
- if os.path.exists("faiss_database"):
416
- database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
417
- database.add_documents(docs)
418
- else:
419
- database = FAISS.from_documents(docs, embed)
420
-
421
- database.save_local("faiss_database")
422
-
423
- # Update news_database for excel export
424
- global news_database
425
- news_database = processed_articles
426
-
427
- return f"Processed and added {len(processed_articles)} news articles from {news_source} to the database."
428
- except Exception as e:
429
- return f"Error adding articles to the database: {str(e)}"
430
-
431
- website_configs = {
432
- "Golomt Bank": {
433
- "base_url": "https://golomtbank.com/en/rnews",
434
- "article_selector": 'div.entry-post.gt-box-shadow-2',
435
- "title_selector": 'h2.entry-title',
436
- "date_selector": 'div.entry-date.gt-meta',
437
- "link_selector": 'a',
438
- "content_selector": 'div.entry-content',
439
- "next_page_selector": 'a.next',
440
- "url_prefix": "https://golomtbank.com"
441
- },
442
- "Bank of America": {
443
- "base_url": "https://newsroom.bankofamerica.com/content/newsroom/press-releases.html?page=1&amp;year=all&amp;category=press-release-categories/corporate-and-financial-news&amp;categTitle=Corporate%20and%20Financial%20News",
444
- "article_selector": 'div.card bg-bank-gray-2',
445
- "title_selector": 'h2.pr-list-head',
446
- "date_selector": 'div.prlist-date',
447
- "link_selector": 'a',
448
- "content_selector": 'div.richtext text',
449
- "next_page_selector": 'a.brand-SystemRight',
450
- "url_prefix": "https://newsroom.bankofamerica.com"
451
- },
452
- # Add more banks as needed
453
- }
454
-
455
-
456
-
457
- def fetch_articles_from_page(url, config):
458
- response = requests.get(url)
459
- response.raise_for_status()
460
- soup = BeautifulSoup(response.content, 'html.parser')
461
- articles = soup.find_all(config['article_selector'].split('.')[0], class_=config['article_selector'].split('.')[-1])
462
- return articles, soup
463
-
464
- def extract_articles(articles, config):
465
- article_data = []
466
- for article in articles:
467
- title_div = article.find(config['title_selector'].split('.')[0], class_=config['title_selector'].split('.')[-1])
468
- title = title_div.get_text(strip=True) if title_div else "No Title"
469
-
470
- date_div = article.find(config['date_selector'].split('.')[0], class_=config['date_selector'].split('.')[-1])
471
- date = date_div.get_text(strip=True) if date_div else "No Date"
472
-
473
- link_tag = article.find(config['link_selector'])
474
- link = link_tag['href'] if link_tag else "No Link"
475
- if not link.startswith('http'):
476
- link = config['url_prefix'] + link
477
-
478
- article_response = requests.get(link)
479
- article_response.raise_for_status()
480
- article_soup = BeautifulSoup(article_response.content, 'html.parser')
481
- article_content_div = article_soup.find(config['content_selector'].split('.')[0], class_=config['content_selector'].split('.')[-1])
482
- article_content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
483
-
484
- article_data.append({
485
- 'title': title,
486
- 'date': date,
487
- 'link': link,
488
- 'content': article_content
489
- })
490
- return article_data
491
-
492
- def fetch_news_from_website(website_key, num_results=20):
493
- config = website_configs.get(website_key)
494
- if not config:
495
- return f"No configuration found for website: {website_key}"
496
-
497
- base_url = config['base_url']
498
- current_page_url = base_url
499
- all_articles = []
500
-
501
- try:
502
- while len(all_articles) < num_results:
503
- print(f"Fetching articles from: {current_page_url}")
504
- articles, soup = fetch_articles_from_page(current_page_url, config)
505
- if not articles:
506
- print("No articles found on this page.")
507
- break
508
- all_articles.extend(extract_articles(articles, config))
509
- print(f"Total articles fetched so far: {len(all_articles)}")
510
- if len(all_articles) >= num_results:
511
- all_articles = all_articles[:num_results]
512
- break
513
- next_page_link = soup.find(config['next_page_selector'])
514
- if not next_page_link:
515
- print("No next page link found.")
516
- break
517
- current_page_url = next_page_link['href']
518
- if not current_page_url.startswith('http'):
519
- current_page_url = config['url_prefix'] + current_page_url
520
-
521
- return [
522
- {
523
- "published_date": article['date'],
524
- "title": article['title'],
525
- "url": article['link'],
526
- "content": article['content']
527
- } for article in all_articles
528
- ]
529
- except Exception as e:
530
- print(f"Error fetching news from {website_key}: {str(e)}")
531
- return []
532
-
533
- def export_news_to_excel():
534
- global news_database
535
-
536
- if not news_database:
537
- return "No articles to export. Please fetch news first."
538
-
539
- print("Exporting the following articles:")
540
- for article in news_database:
541
- print(f"Title: {article['title']}, Score: {article.get('relevance_score', 'N/A')}")
542
-
543
- df = pd.DataFrame(news_database)
544
-
545
- # Ensure relevance_score is present and convert to float
546
- if 'relevance_score' not in df.columns:
547
- df['relevance_score'] = 0.0
548
- else:
549
- df['relevance_score'] = pd.to_numeric(df['relevance_score'], errors='coerce').fillna(0.0)
550
-
551
- # Use the cleaned summary for the Excel export
552
- if 'cleaned_summary' in df.columns:
553
- df['summary'] = df['cleaned_summary']
554
- df = df.drop(columns=['cleaned_summary'])
555
-
556
- # Reorder columns to put relevance_score after summary
557
- columns = ['published_date', 'title', 'url', 'content', 'summary', 'relevance_score']
558
- df = df[[col for col in columns if col in df.columns]]
559
-
560
- print("Final DataFrame before export:")
561
- print(df[['title', 'relevance_score']])
562
-
563
- with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
564
- excel_path = tmp.name
565
- df.to_excel(excel_path, index=False, engine='openpyxl')
566
- print(f"Excel file saved to: {excel_path}")
567
- print("Final relevance scores before export:")
568
- for article in news_database:
569
- print(f"Title: {article['title']}, Score: {article.get('relevance_score', 'N/A')}")
570
-
571
- return excel_path
572
-
573
- def calculate_relevance_score(summary, model):
574
- prompt_template = PromptTemplate(
575
- input_variables=["summary"],
576
- template="""You are a financial analyst tasked with providing a relevance score to news summaries.
577
- The score should be based on the financial significance and impact of the news.
578
-
579
- Consider the following factors when assigning relevance:
580
- - Earnings reports and financial performance
581
- - Debt issuance or restructuring
582
- - Mergers, acquisitions, or divestments
583
- - Changes in key leadership (e.g., CEO, CFO)
584
- - Regulatory changes or legal issues affecting the company
585
- - Major product launches or market expansion
586
- - Significant shifts in market share or competitive landscape
587
- - Macroeconomic factors directly impacting the company or industry
588
- - Stock price movements and trading volume changes
589
- - Dividend announcements or changes in capital allocation
590
- - Credit rating changes
591
- - Material financial events (e.g., bankruptcy, major contracts)
592
-
593
- Use the following scoring guide:
594
- - 0.00-0.20: Not relevant to finance or economics
595
- - 0.21-0.40: Slightly relevant, but minimal financial impact
596
- - 0.41-0.60: Moderately relevant, some financial implications
597
- - 0.61-0.80: Highly relevant, significant financial impact
598
- - 0.81-1.00: Extremely relevant, major financial implications
599
-
600
- Provide a score between 0.00 and 1.00, where 0.00 is not relevant at all, and 1.00 is extremely relevant from a financial perspective.
601
-
602
- Summary: {summary}
603
-
604
- Relevance Score:"""
605
- )
606
-
607
- chain = LLMChain(llm=model, prompt=prompt_template)
608
- response = chain.run(summary=summary)
609
-
610
- print(f"Raw relevance score response: {response}") # Debug print
611
-
612
- try:
613
- # Extract the score from the response
614
- score_match = re.search(r'Relevance Score:\s*(\d+\.\d+)', response)
615
- if score_match:
616
- score = float(score_match.group(1))
617
- final_score = min(max(score, 0.00), 1.00) # Ensure the score is between 0.00 and 1.00
618
- print(f"Processed relevance score: {final_score}") # Debug print
619
- return final_score
620
- else:
621
- raise ValueError("No relevance score found in the response")
622
- except ValueError as e:
623
- print(f"Error parsing relevance score: {e}")
624
- return 0.00
625
-
626
-
627
  def rephrase_for_search(query, model):
628
  rephrase_prompt = PromptTemplate(
629
  input_variables=["query"],
@@ -640,12 +186,9 @@ def rephrase_for_search(query, model):
640
  chain = LLMChain(llm=model, prompt=rephrase_prompt)
641
  response = chain.run(query=query).strip()
642
 
643
- # Remove any potential "Rephrased query:" prefix
644
  rephrased_query = response.replace("Rephrased query:", "").strip()
645
 
646
- # If the rephrased query is too similar to the original, extract keywords
647
  if rephrased_query.lower() == query.lower() or len(rephrased_query) > len(query) * 1.5:
648
- # Simple keyword extraction: remove common words and punctuation
649
  common_words = set(['the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'over', 'after'])
650
  keywords = [word.lower() for word in query.split() if word.lower() not in common_words]
651
  keywords = [word for word in keywords if word.isalnum()]
@@ -653,9 +196,7 @@ def rephrase_for_search(query, model):
653
 
654
  return rephrased_query
655
 
656
- def ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss):
657
- global conversation_history
658
-
659
  if not question:
660
  return "Please enter a question."
661
 
@@ -667,7 +208,6 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, g
667
  else:
668
  database = None
669
 
670
- # In the ask_question function:
671
  if web_search:
672
  original_query = question
673
  rephrased_query = rephrase_for_search(original_query, model)
@@ -700,48 +240,29 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, g
700
  """
701
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
702
  formatted_prompt = prompt_val.format(context=context_str, original_question=question, rephrased_query=rephrased_query)
703
-
704
- elif google_news_rss:
705
  if database is None:
706
- return "No news articles available. Please fetch news articles first."
707
 
708
  retriever = database.as_retriever()
709
  relevant_docs = retriever.get_relevant_documents(question)
710
- context_str = "\n".join([f"Title: {doc.metadata.get('title', 'N/A')}\nURL: {doc.metadata.get('source', 'N/A')}\nSummary: {doc.page_content}" for doc in relevant_docs])
711
 
712
  prompt_template = """
713
- Answer the question based on the following news summaries:
714
- News Summaries:
715
  {context}
716
  Current Question: {question}
717
- If the news summaries don't contain relevant information, state that the information is not available in the news articles.
718
- Provide a concise and direct answer to the question without mentioning the news summaries or these instructions:
719
  """
720
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
721
  formatted_prompt = prompt_val.format(context=context_str, question=question)
722
- else:
723
- if database is None:
724
- return "No documents available. Please upload documents, enable web search, or fetch news articles to answer questions."
725
-
726
- history_str = "\n".join([f"Q: {item['question']}\nA: {item['answer']}" for item in conversation_history])
727
-
728
- # Always retrieve relevant documents
729
- retriever = database.as_retriever()
730
- relevant_docs = retriever.get_relevant_documents(question)
731
- doc_context = "\n".join([doc.page_content for doc in relevant_docs])
732
-
733
- # Combine document context with conversation history
734
- context_str = f"Document context:\n{doc_context}\n\nConversation history:\n{history_str}"
735
-
736
- prompt_val = ChatPromptTemplate.from_template(prompt)
737
- formatted_prompt = prompt_val.format(history=history_str, context=context_str, question=question)
738
 
739
  full_response = generate_chunked_response(model, formatted_prompt)
740
 
741
- # Extract only the part after the last occurrence of a prompt-like sentence
742
  answer_patterns = [
743
  r"Provide a concise and direct answer to the question without mentioning the web search or these instructions:",
744
- r"Provide a concise and direct answer to the question without mentioning the news summaries or these instructions:",
745
  r"Provide a concise and direct answer to the question:",
746
  r"Answer:",
747
  r"Provide a concise and direct answer to the original question without mentioning the web search or these instructions:"
@@ -753,111 +274,38 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, g
753
  answer = match[-1].strip()
754
  break
755
  else:
756
- # If no pattern is found, return the full response
757
  answer = full_response.strip()
758
 
759
- if not web_search and not google_news_rss:
760
- memory_database[question] = answer
761
- conversation_history = manage_conversation_history(question, answer, conversation_history)
762
-
763
  return answer
764
 
765
- def extract_db_to_excel():
766
- embed = get_embeddings()
767
- database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
768
-
769
- documents = database.docstore._dict.values()
770
- data = [{"page_content": doc.page_content, "metadata": json.dumps(doc.metadata)} for doc in documents]
771
- df = pd.DataFrame(data)
772
-
773
- with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
774
- excel_path = tmp.name
775
- df.to_excel(excel_path, index=False)
776
-
777
- return excel_path
778
-
779
- def export_memory_db_to_excel():
780
- data = [{"question": question, "answer": answer} for question, answer in memory_database.items()]
781
- df_memory = pd.DataFrame(data)
782
-
783
- data_history = [{"question": item["question"], "answer": item["answer"]} for item in conversation_history]
784
- df_history = pd.DataFrame(data_history)
785
-
786
- with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
787
- excel_path = tmp.name
788
- with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
789
- df_memory.to_excel(writer, sheet_name='Memory Database', index=False)
790
- df_history.to_excel(writer, sheet_name='Conversation History', index=False)
791
-
792
- return excel_path
793
-
794
  # Gradio interface
795
  with gr.Blocks() as demo:
796
- gr.Markdown("# Chat with your PDF documents and News")
797
 
798
  with gr.Row():
799
  file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
800
  update_button = gr.Button("Update Vector Store")
801
- use_recursive_splitter = gr.Checkbox(label="Use Recursive Text Splitter", value=False)
802
- parser_dropdown = gr.Dropdown(
803
- choices=["PyPDF", "PDFMiner", "Camelot"],
804
- label="Select Parser",
805
- value="PyPDF"
806
- )
807
 
808
  update_output = gr.Textbox(label="Update Status")
809
- update_button.click(update_vectors, inputs=[file_input, use_recursive_splitter, parser_dropdown], outputs=update_output)
810
 
811
  with gr.Row():
812
  with gr.Column(scale=2):
813
  chatbot = gr.Chatbot(label="Conversation")
814
- question_input = gr.Textbox(label="Ask a question about your documents or news")
815
  submit_button = gr.Button("Submit")
816
  with gr.Column(scale=1):
817
  temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
818
  top_p_slider = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.1)
819
  repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
820
  web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
821
- google_news_rss_checkbox = gr.Checkbox(label="Google News RSS", value=False)
822
 
823
- def chat(question, history, temperature, top_p, repetition_penalty, web_search, google_news_rss):
824
- answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss)
825
  history.append((question, answer))
826
  return "", history
827
 
828
- submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox, google_news_rss_checkbox], outputs=[question_input, chatbot])
829
-
830
- with gr.Row():
831
- news_query_input = gr.Textbox(label="News Query")
832
- news_source_dropdown = gr.Dropdown(
833
- choices=list(website_configs.keys()),
834
- label="Select News Source",
835
- value=list(website_configs.keys())[0]
836
- )
837
- fetch_news_button = gr.Button("Fetch News")
838
-
839
- news_fetch_output = gr.Textbox(label="News Fetch Status")
840
-
841
- def fetch_news(query, temperature, top_p, repetition_penalty, news_source):
842
- return process_news(query, temperature, top_p, repetition_penalty, news_source)
843
-
844
- fetch_news_button.click(
845
- fetch_news,
846
- inputs=[news_query_input, temperature_slider, top_p_slider, repetition_penalty_slider, news_source_dropdown],
847
- outputs=news_fetch_output
848
- )
849
-
850
- extract_button = gr.Button("Extract Database to Excel")
851
- excel_output = gr.File(label="Download Excel File")
852
- extract_button.click(extract_db_to_excel, inputs=[], outputs=excel_output)
853
-
854
- export_memory_button = gr.Button("Export Memory Database to Excel")
855
- memory_excel_output = gr.File(label="Download Memory Excel File")
856
- export_memory_button.click(export_memory_db_to_excel, inputs=[], outputs=memory_excel_output)
857
-
858
- export_news_button = gr.Button("Download News Excel File")
859
- news_excel_output = gr.File(label="Download News Excel File")
860
- export_news_button.click(export_news_to_excel, inputs=[], outputs=news_excel_output)
861
 
862
  clear_button = gr.Button("Clear Cache")
863
  clear_output = gr.Textbox(label="Cache Status")
 
  import pandas as pd
  import requests
  import random

  import urllib.parse
  from tempfile import NamedTemporaryFile
  from typing import List

  from langchain.chains import LLMChain
  from langchain_core.prompts import ChatPromptTemplate
  from langchain_community.vectorstores import FAISS
+ from langchain_community.document_loaders import PyPDFLoader
  from langchain_core.output_parsers import StrOutputParser
  from langchain_community.embeddings import HuggingFaceEmbeddings

  from langchain_community.llms import HuggingFaceHub

  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

+ def load_document(file: NamedTemporaryFile) -> List[Document]:

  """Loads and splits the document into pages."""
  loader = PyPDFLoader(file.name)

  return loader.load_and_split()

+ def update_vectors(files):

  if not files:
  return "Please upload at least one PDF file."

  all_data = []
  for file in files:
+ data = load_document(file)
  all_data.extend(data)
  total_chunks += len(data)

  database.save_local("faiss_database")

+ return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."

  def get_embeddings():
  return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  def clear_cache():
  if os.path.exists("faiss_database"):
  os.remove("faiss_database")

  else:
  return "No cache to clear."

  def get_model(temperature, top_p, repetition_penalty):
  return HuggingFaceHub(
  repo_id="mistralai/Mistral-7B-Instruct-v0.3",

  full_response += chunk
  return full_response.strip()

  def extract_text_from_webpage(html):
  soup = BeautifulSoup(html, 'html.parser')
  for script in soup(["script", "style"]):
+ script.extract()
  text = soup.get_text()
  lines = (line.strip() for line in text.splitlines())
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))

  escaped_term = urllib.parse.quote_plus(term)
  start = 0
  all_results = []
+ max_chars_per_page = 8000

  print(f"Starting Google search for term: '{term}'")

  start += len(result_block)

  print(f"Search completed. Total results: {len(all_results)}")
+

  if not all_results:
  print("No search results found. Returning a default message.")
  return [{"link": None, "text": "No information found in the web search results."}]

  return all_results

  def rephrase_for_search(query, model):
  rephrase_prompt = PromptTemplate(
  input_variables=["query"],

  chain = LLMChain(llm=model, prompt=rephrase_prompt)
  response = chain.run(query=query).strip()

  rephrased_query = response.replace("Rephrased query:", "").strip()

  if rephrased_query.lower() == query.lower() or len(rephrased_query) > len(query) * 1.5:

  common_words = set(['the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'over', 'after'])
  keywords = [word.lower() for word in query.split() if word.lower() not in common_words]
  keywords = [word for word in keywords if word.isalnum()]

  return rephrased_query

+ def ask_question(question, temperature, top_p, repetition_penalty, web_search):

  if not question:
  return "Please enter a question."

  else:
  database = None

  if web_search:
  original_query = question
  rephrased_query = rephrase_for_search(original_query, model)

  """
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
  formatted_prompt = prompt_val.format(context=context_str, original_question=question, rephrased_query=rephrased_query)
+ else:

  if database is None:
+ return "No documents available. Please upload documents or enable web search to answer questions."

  retriever = database.as_retriever()
  relevant_docs = retriever.get_relevant_documents(question)
+ context_str = "\n".join([doc.page_content for doc in relevant_docs])

  prompt_template = """
+ Answer the question based on the following context:
+ Context:
  {context}
  Current Question: {question}
+ If the context doesn't contain relevant information, state that the information is not available.
+ Provide a concise and direct answer to the question:
  """
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
  formatted_prompt = prompt_val.format(context=context_str, question=question)

  full_response = generate_chunked_response(model, formatted_prompt)

  answer_patterns = [
  r"Provide a concise and direct answer to the question without mentioning the web search or these instructions:",

  r"Provide a concise and direct answer to the question:",
  r"Answer:",
  r"Provide a concise and direct answer to the original question without mentioning the web search or these instructions:"

  answer = match[-1].strip()
  break
  else:

  answer = full_response.strip()

  return answer

  # Gradio interface
  with gr.Blocks() as demo:
+ gr.Markdown("# Chat with your PDF documents and Web Search")

  with gr.Row():
  file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
  update_button = gr.Button("Update Vector Store")

  update_output = gr.Textbox(label="Update Status")
+ update_button.click(update_vectors, inputs=[file_input], outputs=update_output)

  with gr.Row():
  with gr.Column(scale=2):
  chatbot = gr.Chatbot(label="Conversation")
+ question_input = gr.Textbox(label="Ask a question about your documents or use web search")
  submit_button = gr.Button("Submit")
  with gr.Column(scale=1):
  temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
  top_p_slider = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.1)
  repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
  web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)

+ def chat(question, history, temperature, top_p, repetition_penalty, web_search):
+ answer = ask_question(question, temperature, top_p, repetition_penalty, web_search)
  history.append((question, answer))
  return "", history

+ submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox], outputs=[question_input, chatbot])

  clear_button = gr.Button("Clear Cache")
  clear_output = gr.Textbox(label="Cache Status")
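For orientation, the pipeline that survives this commit reduces to: split PDFs with PyPDFLoader, index the pages in a local FAISS store, and answer questions from the retriever. The sketch below is illustrative only and is not part of the commit; the PDF path and question are hypothetical placeholders, while the calls mirror those used in load_document, update_vectors and ask_question above.

# Minimal sketch of the retained PDF -> FAISS -> retriever flow (illustrative, not from the commit).
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Ingest: split one PDF page by page, as load_document() does ("example.pdf" is a placeholder path).
docs = PyPDFLoader("example.pdf").load_and_split()

# Create or update the local vector store, mirroring the update_vectors() pattern.
if os.path.exists("faiss_database"):
    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
    database.add_documents(docs)
else:
    database = FAISS.from_documents(docs, embed)
database.save_local("faiss_database")

# Retrieve: build the context string that ask_question() interpolates into its prompt template.
retriever = database.as_retriever()
relevant_docs = retriever.get_relevant_documents("What is this document about?")  # placeholder question
context_str = "\n".join(doc.page_content for doc in relevant_docs)
print(context_str[:500])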