Shreyas094 committed
Commit • 1e878de
1 Parent(s): 128980b
Update app.py
app.py CHANGED
@@ -5,10 +5,7 @@ import logging
 from urllib.parse import urlparse
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
-from trafilatura import fetch_url, extract
-from trafilatura import extract
 from requests.exceptions import Timeout
-from trafilatura.settings import use_config
 from urllib.request import urlopen, Request
 import json
 from huggingface_hub import InferenceClient
@@ -21,16 +18,10 @@ import os
 from dotenv import load_dotenv
 import certifi
 import requests
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from scrapy import signals
-from scrapy.signalmanager import dispatcher
-from scrapy.utils.log import configure_logging
 from newspaper import Article
 import PyPDF2
 import io
 import requests
-from duckduckgo_search import DDGS
 import random
 import datetime
 from groq import Groq
@@ -60,49 +51,12 @@ client = InferenceClient(
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 
 # Initialize Groq client
-groq_client =
+groq_client = Groq(api_key=GROQ_API_KEY)
 
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 
 
-def duckduckgo_search(query, num_results=10, time_range="", language="", safesearch=2):
-    try:
-        ddgs = DDGS()
-
-        # Convert time_range to DuckDuckGo format
-        if time_range == "day":
-            timelimit = "d"
-        elif time_range == "week":
-            timelimit = "w"
-        elif time_range == "month":
-            timelimit = "m"
-        elif time_range == "year":
-            timelimit = "y"
-        else:
-            timelimit = None
-
-        # Convert safesearch to DuckDuckGo format
-        if safesearch == 0:
-            safesearch_setting = "off"
-        elif safesearch == 1:
-            safesearch_setting = "moderate"
-        else:
-            safesearch_setting = "strict"
-
-        results = ddgs.text(
-            query,
-            region='wt-wt',
-            safesearch=safesearch_setting,
-            timelimit=timelimit,
-            max_results=num_results
-        )
-
-        return [{"url": result["href"], "title": result["title"]} for result in results]
-    except Exception as e:
-        logger.error(f"Error in DuckDuckGo search: {e}")
-        return []
-
 # Set up a session with retry mechanism
 def requests_retry_session(
     retries=0,
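The retained context around this hunk imports HTTPAdapter and Retry and defines requests_retry_session for resilient HTTP calls. A minimal sketch of that retry-session pattern, with an illustrative helper name and retry defaults rather than the file's actual values:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retry_session(retries=3, backoff_factor=0.3):
    # Mount an adapter that retries transient server errors with exponential backoff.
    session = requests.Session()
    retry = Retry(total=retries, backoff_factor=backoff_factor,
                  status_forcelist=(500, 502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session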
@@ -155,45 +109,6 @@ def scrape_pdf_content(url, max_chars=3000, timeout=5):
         logger.error(f"Error scraping PDF content from {url}: {e}")
         return ""
 
-
-class NewsSpider(scrapy.Spider):
-    name = 'news_spider'
-
-    def __init__(self, url=None, *args, **kwargs):
-        super(NewsSpider, self).__init__(*args, **kwargs)
-        self.start_urls = [url] if url else []
-
-    def parse(self, response):
-        content = ' '.join(response.css('p::text').getall())
-        self.logger.info(f"Scraped content length: {len(content)}")
-        return {'content': content}
-
-def scrape_with_scrapy(url, timeout=30):
-    logger.info(f"Starting to scrape with Scrapy: {url}")
-    configure_logging(install_root_handler=False)
-    logging.getLogger('scrapy').setLevel(logging.WARNING)
-
-    results = []
-
-    def spider_results(signal, sender, item, response, spider):
-        results.append(item)
-
-    process = CrawlerProcess(settings={
-        'LOG_ENABLED': True,
-        'LOG_LEVEL': 'WARNING',
-        'DOWNLOAD_TIMEOUT': timeout
-    })
-
-    dispatcher.connect(spider_results, signal=signals.item_scraped)
-
-    process.crawl(NewsSpider, url=url)
-    process.start()
-
-    # Get the content from results
-    if results:
-        return results[0]['content']
-    return ''
-
 def scrape_with_newspaper(url):
     if url.lower().endswith('.pdf'):
         return scrape_pdf_content(url)
@@ -225,55 +140,6 @@ def scrape_with_newspaper(url):
         logger.error(f"Error scraping {url} with Newspaper3k: {e}")
         return ""
 
-def scrape_with_bs4(url, session, max_chars=None):
-    try:
-        response = session.get(url, timeout=5)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-
-        if main_content:
-            content = main_content.get_text(strip=True, separator='\n')
-        else:
-            content = soup.get_text(strip=True, separator='\n')
-
-        return content[:max_chars] if max_chars else content
-    except Exception as e:
-        logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
-        return ""
-
-def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
-    try:
-        response = requests.get(url, timeout=timeout)
-        response.raise_for_status()
-        downloaded = response.text
-        content = ""
-
-        if use_beautifulsoup:
-            soup = BeautifulSoup(downloaded, "lxml")
-            # Convert BeautifulSoup object to a string
-            html_string = str(soup)
-            # Use Trafilatura's extract function directly on the HTML string
-            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
-
-        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
-        if not content and use_beautifulsoup:
-            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
-
-        # If still no content, use the URL directly
-        if not content:
-            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
-
-        return (content or "")[:max_chars] if max_chars else (content or "")
-    except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url} with Trafilatura")
-        return ""
-    except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
-        return ""
-
 def rephrase_query(chat_history, query, temperature=0.2):
     system_prompt = f"""
     You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
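With the BeautifulSoup, Trafilatura, and Scrapy paths removed above, Newspaper3k (via scrape_with_newspaper) is left as the only HTML scraper. Its body is not shown in this diff; a minimal sketch of the Newspaper3k pattern it presumably follows, using an illustrative helper name:

from newspaper import Article

def extract_article_text(url, max_chars=None):
    # Download and parse the page with Newspaper3k, returning plain article text
    # (optionally truncated to max_chars).
    article = Article(url)
    article.download()
    article.parse()
    text = article.text or ""
    return text[:max_chars] if max_chars else text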
@@ -552,8 +418,8 @@ Your response should be detailed, informative, accurate, and directly relevant t
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."
 
-def search_and_scrape(query, chat_history, num_results=5,
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5,
+def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
@@ -564,78 +430,73 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
             return "No search needed for the provided input."
 
         # Step 2: Perform search
-
-
-
-
-
-
-
-
-
-
-                'category': category,
-                'engines': ','.join(engines),
-                'safesearch': safesearch
-            }
-
-            # Remove empty parameters
-            params = {k: v for k, v in params.items() if v != ""}
+        # Search query parameters
+        params = {
+            'q': rephrased_query,
+            'format': 'json',
+            'time_range': time_range,
+            'language': language,
+            'category': category,
+            'engines': ','.join(engines),
+            'safesearch': safesearch
+        }
 
-
-
-
-
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
+        # Remove empty parameters
+        params = {k: v for k, v in params.items() if v != ""}
+
+        # If no engines are specified, set default engines
+        if 'engines' not in params:
+            params['engines'] = 'google' # Default to 'google' or any preferred engine
+            logger.info("No engines specified. Defaulting to 'google'.")
+
+        # Headers for SearXNG request
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Origin': 'https://shreyas094-searxng-local.hf.space',
+            'Referer': 'https://shreyas094-searxng-local.hf.space/',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Sec-Fetch-Dest': 'empty',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-origin',
+        }
 
         scraped_content = []
         page = 1
         while len(scraped_content) < num_results:
-
-
-            params['pageno'] = page
+            # Update params with current page
+            params['pageno'] = page
 
-
-
-
+            # Send request to SearXNG
+            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
+            session = requests_retry_session()
 
-
-
-
-
-
-
-
-
-
-
+            try:
+                if method.upper() == "GET":
+                    response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
+                else: # POST
+                    response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
+
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Error during SearXNG request: {e}")
+                return f"An error occurred during the search request: {e}"
 
-
-
+            search_results = response.json()
+            logger.debug(f"SearXNG Response: {search_results}")
 
-
-
-
-
+            results = search_results.get('results', [])
+            if not results:
+                logger.warning(f"No more results returned from SearXNG on page {page}.")
+                break
 
             for result in results:
                 if len(scraped_content) >= num_results:
                     break
 
-                url = result.get('url', '')
+                url = result.get('url', '')
                 title = result.get('title', 'No title')
 
                 if not is_valid_url(url):
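The added block builds SearXNG query parameters and browser-like headers, then pages through JSON results inside the while loop. A self-contained sketch of that request flow, using a placeholder instance URL and default engine (the app's real SEARXNG_URL is defined elsewhere in app.py and is not part of this diff):

import requests

SEARXNG_URL = "https://searxng.example.org/search"  # placeholder; the app defines its own instance URL

def searxng_search(query, page=1, engines="google", safesearch=2, method="GET"):
    # Mirror the params used in search_and_scrape and return the raw result list.
    params = {"q": query, "format": "json", "engines": engines,
              "safesearch": safesearch, "pageno": page}
    headers = {"User-Agent": "Mozilla/5.0",
               "Accept": "application/json, text/javascript, */*; q=0.01"}
    if method.upper() == "GET":
        response = requests.get(SEARXNG_URL, params=params, headers=headers, timeout=10)
    else:
        response = requests.post(SEARXNG_URL, data=params, headers=headers, timeout=10)
    response.raise_for_status()
    return response.json().get("results", [])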
@@ -645,7 +506,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                 try:
                     logger.info(f"Processing content from: {url}")
 
-                    content = scrape_full_content(url,
+                    content = scrape_full_content(url, max_chars, timeout)
 
                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
@@ -655,7 +516,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
                     })
                     logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
                 except requests.exceptions.RequestException as e:
@@ -663,10 +524,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
                 except Exception as e:
                     logger.error(f"Unexpected error while scraping {url}: {e}")
 
-
-            break # DuckDuckGo search doesn't support pagination in this implementation
-        else:
-            page += 1
+            page += 1
 
         if not scraped_content:
             logger.warning("No content scraped from search results.")
@@ -678,7 +536,6 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         relevant_documents = []
         unique_summaries = []
         for doc in scraped_content:
-            # In the search_and_scrape function
             assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
             relevance, summary = assessment.split('\n', 1)
 
@@ -699,7 +556,6 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
             return "No relevant and unique financial news found for the given query."
-        logger.debug(f"Assessment result: {assessment}")
 
         # Step 4: Rerank documents based on similarity to query
         reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
@@ -712,7 +568,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
 
         # Step 5: Scrape full content for top documents (up to num_results)
         for doc in reranked_docs[:num_results]:
-            full_content = scrape_full_content(doc['url'],
+            full_content = scrape_full_content(doc['url'], max_chars)
             doc['full_content'] = full_content
 
         # Prepare JSON for LLM
@@ -738,14 +594,13 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
 
-def chat_function(message, history, num_results,
+def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
     response = search_and_scrape(
         query=message,
         chat_history=chat_history,
         num_results=num_results,
-        scraper=scraper,
         max_chars=max_chars,
         time_range=time_range,
         language=language,
@@ -754,7 +609,6 @@ def chat_function(message, history, num_results, scraper, max_chars, time_range,
         safesearch=safesearch,
         method=method,
         llm_temperature=llm_temperature,
-        use_duckduckgo=use_duckduckgo,
         model=model
     )
 
@@ -767,7 +621,6 @@ iface = gr.ChatInterface(
     theme=gr.Theme.from_hub("allenai/gradio-theme"),
     additional_inputs=[
         gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
-        gr.Dropdown(["bs4", "trafilatura", "scrapy", "newspaper"], value="newspaper", label="Scraping Method"),
         gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
         gr.Dropdown(["", "day", "week", "month", "year"], value="", label="Time Range"),
         gr.Dropdown(["", "all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="", label="Language"),
@@ -781,7 +634,6 @@ iface = gr.ChatInterface(
         gr.Slider(0, 2, value=2, step=1, label="Safe Search Level"),
         gr.Radio(["GET", "POST"], value="POST", label="HTTP Method"),
         gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
-        gr.Checkbox(label="Use DuckDuckGo Search", value=False),
         gr.Dropdown(["huggingface", "groq"], value="huggingface", label="LLM Model"),
     ],
     additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
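These two additional_inputs removals pair with the chat_function signature change above: gr.ChatInterface passes each component in additional_inputs, in order, as an extra positional argument after message and history, so dropping the "Scraping Method" dropdown and the "Use DuckDuckGo Search" checkbox must be matched by dropping the corresponding parameters. A trimmed illustration of that positional mapping (only two inputs shown; names are illustrative, not the app's actual definitions):

import gradio as gr

def chat_fn(message, history, num_results, max_chars):
    # num_results and max_chars arrive positionally from additional_inputs, in order.
    return f"num_results={num_results}, max_chars={max_chars}"

demo = gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[
        gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
        gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
    ],
)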