Shreyas094 committed
Commit 1e878de
Parent: 128980b

Update app.py

Files changed (1)
  1. app.py +61 -209
app.py CHANGED
@@ -5,10 +5,7 @@ import logging
 from urllib.parse import urlparse
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
-from trafilatura import fetch_url, extract
-from trafilatura import extract
 from requests.exceptions import Timeout
-from trafilatura.settings import use_config
 from urllib.request import urlopen, Request
 import json
 from huggingface_hub import InferenceClient
@@ -21,16 +18,10 @@ import os
 from dotenv import load_dotenv
 import certifi
 import requests
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from scrapy import signals
-from scrapy.signalmanager import dispatcher
-from scrapy.utils.log import configure_logging
 from newspaper import Article
 import PyPDF2
 import io
 import requests
-from duckduckgo_search import DDGS
 import random
 import datetime
 from groq import Groq
@@ -60,49 +51,12 @@ client = InferenceClient(
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")

 # Initialize Groq client
-groq_client = Groq(api_key=GROQ_API_KEY)
+groq_client = Groq(api_key=GROQ_API_KEY)

 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')


-def duckduckgo_search(query, num_results=10, time_range="", language="", safesearch=2):
-    try:
-        ddgs = DDGS()
-
-        # Convert time_range to DuckDuckGo format
-        if time_range == "day":
-            timelimit = "d"
-        elif time_range == "week":
-            timelimit = "w"
-        elif time_range == "month":
-            timelimit = "m"
-        elif time_range == "year":
-            timelimit = "y"
-        else:
-            timelimit = None
-
-        # Convert safesearch to DuckDuckGo format
-        if safesearch == 0:
-            safesearch_setting = "off"
-        elif safesearch == 1:
-            safesearch_setting = "moderate"
-        else:
-            safesearch_setting = "strict"
-
-        results = ddgs.text(
-            query,
-            region='wt-wt',
-            safesearch=safesearch_setting,
-            timelimit=timelimit,
-            max_results=num_results
-        )
-
-        return [{"url": result["href"], "title": result["title"]} for result in results]
-    except Exception as e:
-        logger.error(f"Error in DuckDuckGo search: {e}")
-        return []
-
 # Set up a session with retry mechanism
 def requests_retry_session(
     retries=0,
@@ -155,45 +109,6 @@ def scrape_pdf_content(url, max_chars=3000, timeout=5):
         logger.error(f"Error scraping PDF content from {url}: {e}")
         return ""

-
-class NewsSpider(scrapy.Spider):
-    name = 'news_spider'
-
-    def __init__(self, url=None, *args, **kwargs):
-        super(NewsSpider, self).__init__(*args, **kwargs)
-        self.start_urls = [url] if url else []
-
-    def parse(self, response):
-        content = ' '.join(response.css('p::text').getall())
-        self.logger.info(f"Scraped content length: {len(content)}")
-        return {'content': content}
-
-def scrape_with_scrapy(url, timeout=30):
-    logger.info(f"Starting to scrape with Scrapy: {url}")
-    configure_logging(install_root_handler=False)
-    logging.getLogger('scrapy').setLevel(logging.WARNING)
-
-    results = []
-
-    def spider_results(signal, sender, item, response, spider):
-        results.append(item)
-
-    process = CrawlerProcess(settings={
-        'LOG_ENABLED': True,
-        'LOG_LEVEL': 'WARNING',
-        'DOWNLOAD_TIMEOUT': timeout
-    })
-
-    dispatcher.connect(spider_results, signal=signals.item_scraped)
-
-    process.crawl(NewsSpider, url=url)
-    process.start()
-
-    # Get the content from results
-    if results:
-        return results[0]['content']
-    return ''
-
 def scrape_with_newspaper(url):
     if url.lower().endswith('.pdf'):
         return scrape_pdf_content(url)
@@ -225,55 +140,6 @@ def scrape_with_newspaper(url):
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""

-def scrape_with_bs4(url, session, max_chars=None):
-    try:
-        response = session.get(url, timeout=5)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-
-        if main_content:
-            content = main_content.get_text(strip=True, separator='\n')
-        else:
-            content = soup.get_text(strip=True, separator='\n')
-
-        return content[:max_chars] if max_chars else content
-    except Exception as e:
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
-
-def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
-    try:
-        response = requests.get(url, timeout=timeout)
-        response.raise_for_status()
-        downloaded = response.text
-        content = ""
-
-        if use_beautifulsoup:
-            soup = BeautifulSoup(downloaded, "lxml")
-            # Convert BeautifulSoup object to a string
-            html_string = str(soup)
-            # Use Trafilatura's extract function directly on the HTML string
-            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
-
-        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
-        if not content and use_beautifulsoup:
-            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
-
-        # If still no content, use the URL directly
-        if not content:
-            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
-
-        return (content or "")[:max_chars] if max_chars else (content or "")
-    except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url} with Trafilatura")
-        return ""
-    except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
-        return ""
-
 def rephrase_query(chat_history, query, temperature=0.2):
     system_prompt = f"""
     You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
@@ -552,8 +418,8 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."

-def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, use_duckduckgo=False, model="huggingface"):
+def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
@@ -564,78 +430,73 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
             return "No search needed for the provided input."

         # Step 2: Perform search
-        if use_duckduckgo:
-            search_results = duckduckgo_search(rephrased_query, num_results, time_range, language, safesearch)
-            results = search_results  # Assign DuckDuckGo results directly
-        else:
-            # Search query parameters
-            params = {
-                'q': rephrased_query,
-                'format': 'json',
-                'time_range': time_range,
-                'language': language,
-                'category': category,
-                'engines': ','.join(engines),
-                'safesearch': safesearch
-            }
-
-            # Remove empty parameters
-            params = {k: v for k, v in params.items() if v != ""}
+        # Search query parameters
+        params = {
+            'q': rephrased_query,
+            'format': 'json',
+            'time_range': time_range,
+            'language': language,
+            'category': category,
+            'engines': ','.join(engines),
+            'safesearch': safesearch
+        }

-            # If no engines are specified, set default engines
-            if 'engines' not in params:
-                params['engines'] = 'google'  # Default to 'google' or any preferred engine
-                logger.info("No engines specified. Defaulting to 'google'.")
-
-            # Headers for SearXNG request
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                'Accept': 'application/json, text/javascript, */*; q=0.01',
-                'Accept-Language': 'en-US,en;q=0.5',
-                'Origin': 'https://shreyas094-searxng-local.hf.space',
-                'Referer': 'https://shreyas094-searxng-local.hf.space/',
-                'DNT': '1',
-                'Connection': 'keep-alive',
-                'Sec-Fetch-Dest': 'empty',
-                'Sec-Fetch-Mode': 'cors',
-                'Sec-Fetch-Site': 'same-origin',
-            }
+        # Remove empty parameters
+        params = {k: v for k, v in params.items() if v != ""}
+
+        # If no engines are specified, set default engines
+        if 'engines' not in params:
+            params['engines'] = 'google'  # Default to 'google' or any preferred engine
+            logger.info("No engines specified. Defaulting to 'google'.")
+
+        # Headers for SearXNG request
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Origin': 'https://shreyas094-searxng-local.hf.space',
+            'Referer': 'https://shreyas094-searxng-local.hf.space/',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Sec-Fetch-Dest': 'empty',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-origin',
+        }

         scraped_content = []
         page = 1
         while len(scraped_content) < num_results:
-            if not use_duckduckgo:
-                # Update params with current page
-                params['pageno'] = page
+            # Update params with current page
+            params['pageno'] = page

-                # Send request to SearXNG
-                logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
-                session = requests_retry_session()
+            # Send request to SearXNG
+            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
+            session = requests_retry_session()

-                try:
-                    if method.upper() == "GET":
-                        response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
-                    else:  # POST
-                        response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
-
-                    response.raise_for_status()
-                except requests.exceptions.RequestException as e:
-                    logger.error(f"Error during SearXNG request: {e}")
-                    return f"An error occurred during the search request: {e}"
+            try:
+                if method.upper() == "GET":
+                    response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
+                else:  # POST
+                    response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
+
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Error during SearXNG request: {e}")
+                return f"An error occurred during the search request: {e}"

-                search_results = response.json()
-                logger.debug(f"SearXNG Response: {search_results}")
+            search_results = response.json()
+            logger.debug(f"SearXNG Response: {search_results}")

-                results = search_results.get('results', [])
-                if not results:
-                    logger.warning(f"No more results returned from SearXNG on page {page}.")
-                    break
+            results = search_results.get('results', [])
+            if not results:
+                logger.warning(f"No more results returned from SearXNG on page {page}.")
+                break

             for result in results:
                 if len(scraped_content) >= num_results:
                     break

-                url = result.get('url', '') if not use_duckduckgo else result.get('url', '')
+                url = result.get('url', '')
                 title = result.get('title', 'No title')

                 if not is_valid_url(url):
@@ -645,7 +506,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                 try:
                     logger.info(f"Processing content from: {url}")

-                    content = scrape_full_content(url, scraper, max_chars, timeout)
+                    content = scrape_full_content(url, max_chars, timeout)

                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
@@ -655,7 +516,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else scraper
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
                     })
                     logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
                 except requests.exceptions.RequestException as e:
@@ -663,10 +524,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                 except Exception as e:
                     logger.error(f"Unexpected error while scraping {url}: {e}")

-            if use_duckduckgo:
-                break  # DuckDuckGo search doesn't support pagination in this implementation
-            else:
-                page += 1
+            page += 1

         if not scraped_content:
             logger.warning("No content scraped from search results.")
@@ -678,7 +536,6 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         relevant_documents = []
         unique_summaries = []
         for doc in scraped_content:
-            # In the search_and_scrape function
             assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
             relevance, summary = assessment.split('\n', 1)

@@ -699,7 +556,6 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
             return "No relevant and unique financial news found for the given query."
-        logger.debug(f"Assessment result: {assessment}")

         # Step 4: Rerank documents based on similarity to query
         reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
@@ -712,7 +568,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha

         # Step 5: Scrape full content for top documents (up to num_results)
         for doc in reranked_docs[:num_results]:
-            full_content = scrape_full_content(doc['url'], scraper, max_chars)
+            full_content = scrape_full_content(doc['url'], max_chars)
             doc['full_content'] = full_content

         # Prepare JSON for LLM
@@ -738,14 +594,13 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         return f"An unexpected error occurred during the search and scrape process: {e}"


-def chat_function(message, history, num_results, scraper, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, use_duckduckgo, model):
+def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
    chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])

    response = search_and_scrape(
        query=message,
        chat_history=chat_history,
        num_results=num_results,
-        scraper=scraper,
        max_chars=max_chars,
        time_range=time_range,
        language=language,
@@ -754,7 +609,6 @@ def chat_function(message, history, num_results, scraper, max_chars, time_range,
        safesearch=safesearch,
        method=method,
        llm_temperature=llm_temperature,
-        use_duckduckgo=use_duckduckgo,
        model=model
    )

@@ -767,7 +621,6 @@ iface = gr.ChatInterface(
    theme=gr.Theme.from_hub("allenai/gradio-theme"),
    additional_inputs=[
        gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
-        gr.Dropdown(["bs4", "trafilatura", "scrapy", "newspaper"], value="newspaper", label="Scraping Method"),
        gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
        gr.Dropdown(["", "day", "week", "month", "year"], value="", label="Time Range"),
        gr.Dropdown(["", "all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="", label="Language"),
@@ -781,7 +634,6 @@ iface = gr.ChatInterface(
        gr.Slider(0, 2, value=2, step=1, label="Safe Search Level"),
        gr.Radio(["GET", "POST"], value="POST", label="HTTP Method"),
        gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
-        gr.Checkbox(label="Use DuckDuckGo Search", value=False),
        gr.Dropdown(["huggingface", "groq"], value="huggingface", label="LLM Model"),
    ],
    additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
 