Shreyas094 committed on
Commit
7980de8
·
verified ·
1 Parent(s): 9368554

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +440 -112
app.py CHANGED
@@ -16,37 +16,335 @@ from sentence_transformers.util import pytorch_cos_sim
16
  from enum import Enum
17
  from groq import Groq
18
  import os
19
- from typing import List, Dict, Any, Set
20
  from dotenv import load_dotenv
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  # Load environment variables from .env file
23
  load_dotenv()
 
24
 
25
  # Initialize Groq client
26
  groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
27
 
28
  class ScoringMethod(Enum):
29
  BM25 = "bm25"
30
  TFIDF = "tfidf"
31
  COMBINED = "combined"
32
 
33
- # First define the SafeSearch enum
34
  class SafeSearch(Enum):
35
  STRICT = 2
36
  MODERATE = 1
37
  NONE = 0
38
 
39
- # Then use it to define the options
 
 
 
40
  SAFE_SEARCH_OPTIONS = [
41
  ("Strict (2)", SafeSearch.STRICT.value),
42
  ("Moderate (1)", SafeSearch.MODERATE.value),
43
  ("None (0)", SafeSearch.NONE.value)
44
  ]
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  async def get_available_engines(session, base_url, headers):
47
  """Fetch available search engines from SearxNG instance."""
 
48
  try:
49
- # First try the search endpoint to get engines
50
  params = {
51
  "q": "test",
52
  "format": "json",
@@ -55,84 +353,22 @@ async def get_available_engines(session, base_url, headers):
55
  async with session.get(f"{base_url}/search", headers=headers, params=params) as response:
56
  data = await response.json()
57
  available_engines = set()
58
- # Extract unique engine names from the response
59
  if "search" in data:
60
  for engine_data in data["search"]:
61
  if isinstance(engine_data, dict) and "engine" in engine_data:
62
  available_engines.add(engine_data["engine"])
63
 
64
- # If no engines found, try alternate endpoint
65
  if not available_engines:
66
  async with session.get(f"{base_url}/engines", headers=headers) as response:
67
  engines_data = await response.json()
68
  available_engines = set(engine["name"] for engine in engines_data if engine.get("enabled", True))
69
 
 
70
  return list(available_engines)
71
  except Exception as e:
72
- logging.error(f'Error fetching search engines: {e}')
73
- # Return default engines if unable to fetch
74
  return ["google", "bing", "duckduckgo", "brave", "wikipedia"]
75
 
76
- def select_search_engines(available_engines: List[str]) -> Set[str]:
77
- """Let user select search engines from available options."""
78
- print("\nAvailable search engines:")
79
- engines_list = sorted(available_engines)
80
- for i, engine in enumerate(engines_list, 1):
81
- print(f"{i}. {engine}")
82
-
83
- print("\nEnter the numbers of engines you want to use (comma-separated), or 'all' for all engines:")
84
- selection = input("Your selection: ").strip().lower()
85
-
86
- if selection == 'all':
87
- return set(engines_list)
88
-
89
- try:
90
- selected_indices = [int(idx.strip()) - 1 for idx in selection.split(',')]
91
- return {engines_list[idx] for idx in selected_indices if 0 <= idx < len(engines_list)}
92
- except (ValueError, IndexError):
93
- logging.error("Invalid selection, using all engines as fallback")
94
- return set(engines_list)
95
-
96
-
97
- logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
98
-
99
- async def scrape_url(url, max_chars):
100
- logging.info(f'Scraping URL: {url}')
101
- if url.endswith(".pdf"):
102
- return await scrape_pdf(url, max_chars)
103
- else:
104
- return await scrape_html(url, max_chars)
105
-
106
- async def scrape_html(url, max_chars):
107
- try:
108
- article = Article(url)
109
- article.download()
110
- article.parse()
111
- text = article.text[:max_chars]
112
- publish_date = article.publish_date
113
- logging.info(f'Scraped HTML content from {url}')
114
- return {"content": text, "publish_date": publish_date.isoformat() if publish_date else None}
115
- except Exception as e:
116
- logging.error(f'Error scraping HTML content from {url}: {e}')
117
- return None
118
-
119
- async def scrape_pdf(url, max_chars):
120
- try:
121
- async with aiohttp.ClientSession() as session:
122
- async with session.get(url) as response:
123
- pdf_bytes = await response.read()
124
- pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
125
- text = ""
126
- for page_num in range(len(pdf_reader.pages)):
127
- page = pdf_reader.pages[page_num]
128
- text += page.extract_text()
129
- text = text[:max_chars]
130
- logging.info(f'Scraped PDF content from {url}')
131
- return {"content": text, "publish_date": None}
132
- except Exception as e:
133
- logging.error(f'Error scraping PDF content from {url}: {e}')
134
- return None
135
-
136
  def normalize_scores(scores):
137
  """Normalize scores to [0, 1] range using min-max normalization"""
138
  if not isinstance(scores, np.ndarray):
@@ -153,41 +389,46 @@ def normalize_scores(scores):
153
 
154
  async def calculate_bm25(query, documents):
155
  """Calculate BM25 scores for documents."""
 
156
  try:
157
  if not documents:
158
  return []
159
 
160
  bm25 = BM25Okapi([doc.split() for doc in documents])
161
  scores = bm25.get_scores(query.split())
162
- return normalize_scores(scores)
 
 
163
 
164
  except Exception as e:
165
- logging.error(f'Error calculating BM25 scores: {e}')
166
  return [0] * len(documents)
167
 
168
  async def calculate_tfidf(query, documents, measure="cosine"):
169
  """Calculate TF-IDF based similarity scores."""
 
170
  try:
171
  if not documents:
172
  return []
173
 
174
- model = SentenceTransformer('BAAI/bge-base-en-v1.5')
 
175
  query_embedding = model.encode(query)
176
  document_embeddings = model.encode(documents)
177
 
178
- # Normalize embeddings
179
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
180
  document_embeddings = document_embeddings / np.linalg.norm(document_embeddings, axis=1)[:, np.newaxis]
181
 
182
  if measure == "cosine":
183
- # Calculate cosine similarity
184
  scores = np.dot(document_embeddings, query_embedding)
185
- return normalize_scores(scores)
 
 
186
  else:
187
  raise ValueError("Unsupported similarity measure.")
188
 
189
  except Exception as e:
190
- logging.error(f'Error calculating TF-IDF scores: {e}')
191
  return [0] * len(documents)
192
 
193
  def combine_scores(bm25_score, tfidf_score, weights=(0.5, 0.5)):
@@ -221,11 +462,9 @@ def get_total_score(scores, scoring_method: ScoringMethod):
221
  return combine_scores(bm25_score, tfidf_score)
222
 
223
  async def generate_summary(query: str, articles: List[Dict[str, Any]], temperature: float = 0.7) -> str:
224
- """
225
- Generate a summary of the articles using Groq's LLama 3.1 8b model.
226
- """
227
  try:
228
- # Format the articles into a structured JSON string
229
  json_input = json.dumps(articles, indent=2)
230
 
231
  system_prompt = """You are Sentinel, a world-class AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
@@ -251,6 +490,7 @@ Instructions:
251
  12. Make sure the answer is not short and is informative.
252
  13. Your response should be detailed, informative, accurate, and directly relevant to the user's query."""
253
 
 
254
  messages = [
255
  {"role": "system", "content": system_prompt},
256
  {"role": "user", "content": user_prompt}
@@ -258,7 +498,7 @@ Instructions:
258
 
259
  response = groq_client.chat.completions.create(
260
  messages=messages,
261
- model="llama-3.1-70b-versatile", # Using LLama 3.1 8b model
262
  max_tokens=5000,
263
  temperature=temperature,
264
  top_p=0.9,
@@ -266,21 +506,23 @@ Instructions:
266
  stream=False
267
  )
268
 
 
269
  return response.choices[0].message.content.strip()
270
 
271
  except Exception as e:
272
- logging.error(f'Error generating summary: {e}')
273
  return f"Error generating summary: {str(e)}"
274
 
275
  class ChatBot:
276
  def __init__(self):
 
277
  self.scoring_method = ScoringMethod.COMBINED
278
  self.num_results = 10
279
  self.max_chars = 10000
280
  self.score_threshold = 0.8
281
  self.temperature = 0.1
282
- self.history = []
283
- self.base_url = "https://shreyas094-searxng-local.hf.space\search"
284
  self.headers = {
285
  "X-Searx-API-Key": "f9f07f93b37b8483aadb5ba717f556f3a4ac507b281b4ca01e6c6288aa3e3ae5"
286
  }
@@ -298,35 +540,52 @@ class ChatBot:
298
  "ja": "Japanese",
299
  "ko": "Korean"
300
  }
 
 
 
 
 
 
 
 
 
 
 
301
 
302
  async def get_search_results(self,
303
  query: str,
 
304
  num_results: int,
305
  max_chars: int,
306
  score_threshold: float,
307
  temperature: float,
308
- scoring_method_str: str,
309
  selected_engines: List[str],
310
  safe_search: str,
311
  language: str) -> str:
 
312
  try:
 
 
 
 
 
313
  scoring_method_map = {
314
  "BM25": ScoringMethod.BM25,
315
  "TF-IDF": ScoringMethod.TFIDF,
316
  "Combined": ScoringMethod.COMBINED
317
  }
318
- self.scoring_method = scoring_method_map[scoring_method_str]
319
 
320
  safe_search_map = dict(SAFE_SEARCH_OPTIONS)
321
  safe_search_value = safe_search_map.get(safe_search, SafeSearch.MODERATE.value)
322
 
 
 
 
323
  async with aiohttp.ClientSession() as session:
324
- logging.info(f'Using engines: {", ".join(selected_engines)}')
325
- logging.info(f'Parameters: Results={num_results}, Chars={max_chars}, Threshold={score_threshold}, '
326
- f'Temp={temperature}, Method={scoring_method_str}, SafeSearch={safe_search_value}, Language={language}')
327
-
328
  params = {
329
- "q": query,
330
  "format": "json",
331
  "engines": ",".join(selected_engines),
332
  "limit": num_results,
@@ -336,24 +595,30 @@ class ChatBot:
336
  if language != "all":
337
  params["language"] = language
338
 
 
339
  try:
340
  async with session.get(f"{self.base_url}/search", headers=self.headers, params=params) as response:
341
  data = await response.json()
342
  except Exception as e:
 
343
  return f"Error: Could not connect to search service. Please check if SearxNG is running at {self.base_url}. Error: {str(e)}"
344
 
345
  if "results" not in data or not data["results"]:
 
346
  return "No results found."
347
 
348
  results = data["results"][:num_results]
 
349
  valid_results = await scrape_urls_parallel(results, max_chars)
350
 
351
  if not valid_results:
 
352
  return "No valid articles found after scraping."
353
 
354
  results, scraped_data = zip(*valid_results)
355
  contents = [article["content"] for article in scraped_data]
356
 
 
357
  scores = await get_document_scores(query, contents, self.scoring_method)
358
 
359
  scored_articles = []
@@ -381,15 +646,15 @@ class ChatBot:
381
  unique_articles.append(article)
382
 
383
  # Generate summary using Groq API
384
- summary = await generate_summary(query, unique_articles, self.temperature)
385
 
386
- # Update the response format to include new parameters
387
  response = f"**Search Parameters:**\n"
388
  response += f"- Results: {num_results}\n"
389
  response += f"- Max Characters: {max_chars}\n"
390
  response += f"- Score Threshold: {score_threshold}\n"
391
  response += f"- Temperature: {temperature}\n"
392
- response += f"- Scoring Method: {scoring_method_str}\n"
393
  response += f"- Search Engines: {', '.join(selected_engines)}\n"
394
  response += f"- Safe Search: Level {safe_search_value}\n"
395
  response += f"- Language: {self.available_languages.get(language, language)}\n\n"
@@ -404,9 +669,65 @@ class ChatBot:
404
  return response
405
 
406
  except Exception as e:
407
- logging.error(f'Error in search_and_summarize: {e}')
408
  return f"Error occurred: {str(e)}"
409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  def chat(self,
411
  message: str,
412
  history: List[List[str]],
@@ -417,15 +738,18 @@ class ChatBot:
417
  scoring_method: str,
418
  engines: List[str],
419
  safe_search: str,
420
- language: str) -> str:
421
- """
422
- Process chat messages and return responses with custom parameters.
423
- """
424
- # Extract language code from the selection (e.g., "en - English" -> "en")
425
  language_code = language.split(" - ")[0]
426
 
427
- response = asyncio.run(self.get_search_results(
 
 
 
428
  message,
 
429
  num_results,
430
  max_chars,
431
  score_threshold,
@@ -433,7 +757,8 @@ class ChatBot:
433
  scoring_method,
434
  engines,
435
  safe_search,
436
- language_code
 
437
  ))
438
  return response
439
 
@@ -442,17 +767,7 @@ def create_gradio_interface() -> gr.Interface:
442
 
443
  # Define language options
444
  language_choices = [
445
- "all", # All languages
446
- "en", # English
447
- "es", # Spanish
448
- "fr", # French
449
- "de", # German
450
- "it", # Italian
451
- "pt", # Portuguese
452
- "ru", # Russian
453
- "zh", # Chinese
454
- "ja", # Japanese
455
- "ko" # Korean
456
  ]
457
 
458
  # Create mapping for language display names
@@ -526,11 +841,21 @@ def create_gradio_interface() -> gr.Interface:
526
  value="all - All Languages",
527
  label="Language",
528
  info="Select the preferred language for search results"
 
 
 
 
 
 
529
  )
530
  ],
531
  additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
 
 
 
532
  chatbot=gr.Chatbot(
533
  show_copy_button=True,
 
534
  layout="bubble",
535
  height=500,
536
  )
@@ -558,6 +883,9 @@ def create_parameter_description():
558
  - **Language**: Preferred language for search results
559
  - All languages: No language restriction
560
  - Specific languages: Filter results to selected language
 
 
 
561
  """
562
 
563
  if __name__ == "__main__":
 
16
  from enum import Enum
17
  from groq import Groq
18
  import os
19
+ from typing import List, Dict, Any, Set, Optional
20
  from dotenv import load_dotenv
21
+ from concurrent.futures import ThreadPoolExecutor
22
+ from datetime import datetime
23
+
24
+ # Configure logging
25
+ logging.basicConfig(
26
+ level=logging.INFO,
27
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
28
+ )
29
+ logger = logging.getLogger(__name__)
30
+
31
+ logger.info("Starting application initialization")
32
 
33
  # Load environment variables from .env file
34
  load_dotenv()
35
+ logger.info("Environment variables loaded")
36
 
37
  # Initialize Groq client
38
  groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
39
+ logger.info("Groq client initialized")
40
 
41
class ScoringMethod(Enum):
    """Identifiers for the document-scoring strategies the app can apply."""

    # Lexical scoring (see calculate_bm25).
    BM25 = "bm25"
    # Similarity scoring (see calculate_tfidf).
    TFIDF = "tfidf"
    # NOTE(review): presumably blends both scores via combine_scores — confirm
    # against get_total_score, which is only partly visible here.
    COMBINED = "combined"
45
 
 
46
class SafeSearch(Enum):
    """Safe-search levels; the integer values are what SAFE_SEARCH_OPTIONS exposes."""

    STRICT = 2
    # Used as the fallback level when a UI label is not recognised.
    MODERATE = 1
    NONE = 0
50
 
51
class QueryType(Enum):
    """Routing decision returned by determine_query_type.

    The member values match the two literal answers the classifier prompt
    asks the model to reply with.
    """

    KNOWLEDGE_BASE = "knowledge_base"
    WEB_SEARCH = "web_search"
54
+
55
# (display label, numeric level) pairs for the safe-search selector, ordered
# from most to least restrictive.
SAFE_SEARCH_OPTIONS = [
    (label, level.value)
    for label, level in (
        ("Strict (2)", SafeSearch.STRICT),
        ("Moderate (1)", SafeSearch.MODERATE),
        ("None (0)", SafeSearch.NONE),
    )
]
60
 
61
async def determine_query_type(query: str, chat_history: List[List[str]], temperature: float = 0.1) -> QueryType:
    """
    Decide whether a query needs a web search or can be answered from model knowledge.

    The last five conversation turns are rendered as plain text and sent,
    together with the query, to the Groq chat model, which is instructed to
    answer with either "knowledge_base" or "web_search".

    Args:
        query: The user's current message.
        chat_history: Prior turns as (user_message, assistant_message) pairs;
            ``None`` or an empty list means there is no context.
        temperature: Sampling temperature for the classification call.

    Returns:
        QueryType.WEB_SEARCH when the reply contains "web_search" or when the
        classification fails; QueryType.KNOWLEDGE_BASE otherwise.
    """
    logger.info(f'Determining query type for: {query}')
    try:
        # Format chat history into a more natural conversation format
        formatted_history = []
        for i, (user_msg, assistant_msg) in enumerate((chat_history or [])[-5:], 1):  # Last 5 turns
            formatted_history.append(f"Turn {i}:")
            formatted_history.append(f"User: {user_msg}")
            if assistant_msg:
                formatted_history.append(f"Assistant: {assistant_msg}")

        chat_context = "\n".join(formatted_history)

        # Tell the model the real current date rather than a fixed month, so
        # "recent" keeps meaning recent as time passes.
        current_date = datetime.now().strftime("%B %Y")

        system_prompt = f"""You are Sentinel, an intelligent AI agent tasked with determining whether a user query requires a web search or can be answered using your existing knowledge base. Your knowledge cutoff date is April 2024, and the current date is {current_date}.

Rules for Classification:

1. RESPOND WITH ONLY "knowledge_base" OR "web_search" - NO OTHER TEXT

2. Consider conversation context:
- Look for references to previous turns in the conversation
- Check if the query is a follow-up to earlier discussion
- Consider if previous context requires updated information

3. Classify as "web_search" if:
- Query explicitly asks for current/latest/recent information
- References events or data after April 2024
- Requires real-time information (prices, weather, news)
- Uses words like "current", "latest", "now", "today"
- Asks about ongoing events or situations
- Needs verification of recent claims
- Is a follow-up question about current events
- Previous context involves recent/ongoing events

4. Classify as "knowledge_base" if:
- Query is about historical events or facts before April 2024
- Involves general knowledge, concepts, or theories
- Is casual conversation or greeting
- Asks for explanations of established topics
- Requires logical reasoning or analysis
- Is about personal opinions or hypotheticals
- Is a follow-up to a knowledge-base discussion
- Previous context is about historical or conceptual topics"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Previous conversation:\n{chat_context}\n\nCurrent query: {query}\n\nClassify this query based on the rules above, considering the conversation context."}
        ]

        response = groq_client.chat.completions.create(
            messages=messages,
            model="llama-3.1-70b-versatile",
            temperature=temperature,
            max_tokens=10,
            stream=False
        )

        result = response.choices[0].message.content.strip().lower()
        logger.info(f'Query type determined as: {result} with context')

        # Substring match: the model sometimes wraps the label in quotes or
        # adds punctuation, which an exact comparison would misroute.
        if QueryType.WEB_SEARCH.value in result:
            return QueryType.WEB_SEARCH
        return QueryType.KNOWLEDGE_BASE

    except Exception as e:
        # Fall back to searching: a stale answer is worse than an extra search.
        logger.error(f'Error determining query type: {e}')
        return QueryType.WEB_SEARCH
130
+
131
async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
    """Answer a query from the model's own knowledge, using recent chat turns as context.

    Args:
        query: The user's current message.
        chat_history: Prior turns as (user_message, assistant_message) pairs;
            only the last five are included in the prompt.
        temperature: Sampling temperature for the completion.

    Returns:
        The model's reply with surrounding whitespace removed, or an apology
        string containing the error text if anything fails.
    """
    logger.info(f'Processing knowledge base query: {query}')
    try:
        # Render the most recent turns as a numbered transcript.
        transcript = []
        recent_turns = chat_history[-5:]
        for turn_no, (user_msg, assistant_msg) in enumerate(recent_turns, 1):
            transcript += [f"Turn {turn_no}:", f"User: {user_msg}"]
            if assistant_msg:
                transcript.append(f"Assistant: {assistant_msg}")
        chat_context = "\n".join(transcript)

        system_prompt = """You are Sentinel, a highly knowledgeable AI assistant with expertise through April 2024. You provide accurate, informative responses based on your knowledge base while maintaining conversation context.

Guidelines:
1. Use the conversation history to provide contextually relevant responses
2. Reference previous turns when appropriate
3. Maintain consistency with previous responses
4. Use markdown formatting for better readability
5. Be clear about historical facts vs. analysis
6. Note if information might be outdated
7. Stay within knowledge cutoff date of April 2024
8. Be direct and conversational
9. Acknowledge and build upon previous context when relevant"""

        user_prompt = f"Previous conversation:\n{chat_context}\n\nCurrent query: {query}\n\nProvide a comprehensive response based on your knowledge base and the conversation context."

        completion = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model="llama-3.1-70b-versatile",
            temperature=temperature,
            max_tokens=2000,
            stream=False,
        )

        answer = completion.choices[0].message.content
        return answer.strip()

    except Exception as e:
        logger.error(f'Error processing knowledge base query: {e}')
        return f"I apologize, but I encountered an error while processing your query: {str(e)}"
176
+
177
async def rephrase_query(chat_history, query, temperature=0.2) -> str:
    """Rephrase a search query, using recent chat history for context.

    Args:
        chat_history: Prior turns as (user_message, assistant_message) pairs;
            ``None`` or an empty list means there is no context.
        query: The raw query to rephrase.
        temperature: Sampling temperature for the rephrasing call.

    Returns:
        The rephrased query, or the original ``query`` unchanged if the
        rephrasing call fails for any reason.
    """
    logger.info(f'Rephrasing query: {query}')
    try:
        # Convert the most recent turns into chat messages so the model can
        # resolve follow-up references ("it", "that company", ...). The list
        # was previously built but never sent, so history had no effect.
        # Capped at five turns, like the other prompt builders in this file.
        formatted_history = []
        for user_msg, assistant_msg in (chat_history or [])[-5:]:
            formatted_history.append({"role": "user", "content": user_msg})
            if assistant_msg:  # Only add if there's an assistant message
                formatted_history.append({"role": "assistant", "content": assistant_msg})

        current_year = datetime.now().year
        system_prompt = """You are a highly intelligent and context-aware query rephrasing assistant. Your task is to rephrase search queries while following these strict rules:

1. Entity Handling:
- Identify main entities (organizations, brands, products, locations)
- Enclose ONLY the entity names in double quotes
- Example: "Apple" stock price, not "Apple stock price"

2. Date Handling Rules (VERY IMPORTANT):
- For queries about current/latest/recent information:
* If query contains words like "latest", "current", "recent", "now", "today":
- Keep these words in the query
- ALWAYS append "after: YYYY" (current year) at the end
* Example: "latest news on "Apple"" becomes "latest news on "Apple" after: 2024"

- For queries with specific time periods:
* Keep the original time reference
* Add appropriate "after: YYYY" based on the mentioned year
* Example: "How did "Bank of America" perform in Q2 2023" becomes
"How did "Bank of America" perform in Q2 2023 after: 2023"

- For queries without any time reference:
* ALWAYS append "after: YYYY" (current year) at the end
* Example: ""Toyota" market share" becomes ""Toyota" market share after: 2024"

3. Output Format:
- First letter should be capitalized
- No period at the end
- Include all specified date operators
- Maintain the entire original query's meaning and context

Remember: EVERY query must end with a date operator unless it explicitly references a past date/year."""

        # System rules first, then the conversation so far, then the request.
        messages = [
            {"role": "system", "content": system_prompt},
            *formatted_history,
            {"role": "user", "content": f"Current year is {current_year}. Rephrase this query: {query}"}
        ]

        # Call Groq API
        response = groq_client.chat.completions.create(
            messages=messages,
            model="llama-3.1-70b-versatile",
            temperature=temperature,
            max_tokens=200,
            stream=False
        )

        rephrased_query = response.choices[0].message.content.strip()
        logger.info(f'Query rephrased to: {rephrased_query}')
        return rephrased_query

    except Exception as e:
        logger.error(f'Error rephrasing query: {e}')
        return query  # Return original query if rephrasing fails
243
+
244
class ParallelScraper:
    """Async context manager that scrapes HTML and PDF URLs concurrently.

    Blocking work (newspaper downloads/parsing and PDF text extraction) runs
    in a thread pool; PDF bytes are fetched through a shared aiohttp session
    that exists only while the context manager is entered.
    """

    def __init__(self, max_workers: int = 5):
        logger.info(f"Initializing ParallelScraper with {max_workers} workers")
        self._max_workers = max_workers
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        self._executor_closed = False
        self.session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self):
        # Allow the same instance to be entered again after a previous exit.
        if self._executor_closed:
            self.executor = ThreadPoolExecutor(max_workers=self._max_workers)
            self._executor_closed = False
        logger.info("Creating aiohttp session")
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        try:
            if self.session:
                logger.info("Closing aiohttp session")
                await self.session.close()
        finally:
            self.session = None
            # Release the worker threads; wait=False avoids blocking the
            # event loop if a download is still in flight on error paths.
            self.executor.shutdown(wait=False)
            self._executor_closed = True

    @staticmethod
    def _is_pdf_url(url: str) -> bool:
        """Return True when the URL (with or without its query/fragment) ends in .pdf."""
        lowered = url.lower()
        path_only = lowered.split("#", 1)[0].split("?", 1)[0]
        return lowered.endswith(".pdf") or path_only.endswith(".pdf")

    def parse_article(self, article: Article) -> Optional[Dict[str, Any]]:
        """Parse a downloaded newspaper Article; meant to run in a worker thread.

        Returns a dict with "content" and "publish_date" (ISO string or None),
        or None if parsing fails.
        """
        try:
            logger.info("Parsing article")
            article.parse()
            return {
                "content": article.text,
                "publish_date": article.publish_date.isoformat() if article.publish_date else None
            }
        except Exception as e:
            logger.error(f'Error parsing article: {e}')
            return None

    async def download_and_parse_html(self, url: str, max_chars: int) -> Optional[Dict[str, Any]]:
        """Download and parse an HTML page, truncating its text to max_chars.

        Returns the parsed dict, or None if download or parsing fails.
        """
        logger.info(f'Processing HTML URL: {url}')
        try:
            loop = asyncio.get_running_loop()
            article = Article(url)
            await loop.run_in_executor(self.executor, article.download)
            result = await loop.run_in_executor(self.executor, self.parse_article, article)

            if result:
                result["content"] = result["content"][:max_chars]
                logger.info(f'Successfully processed HTML from {url}')
            return result
        except Exception as e:
            logger.error(f'Error processing HTML from {url}: {e}')
            return None

    async def download_and_parse_pdf(self, url: str, max_chars: int) -> Optional[Dict[str, Any]]:
        """Download a PDF and extract up to max_chars of text from it.

        Returns {"content": text, "publish_date": None}, or None on any
        failure (including use outside the ``async with`` block).
        """
        logger.info(f'Processing PDF URL: {url}')
        try:
            if not self.session:
                raise RuntimeError("Session not initialized")

            async with self.session.get(url) as response:
                # Fail early on HTTP errors instead of parsing an error page.
                response.raise_for_status()
                pdf_bytes = await response.read()

            def process_pdf():
                logger.info("Processing PDF content")
                pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
                # `or ""` guards against pages that yield no text.
                text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
                return text[:max_chars]

            loop = asyncio.get_running_loop()
            text = await loop.run_in_executor(self.executor, process_pdf)
            logger.info(f'Successfully processed PDF from {url}')
            return {"content": text, "publish_date": None}
        except Exception as e:
            logger.error(f'Error processing PDF from {url}: {e}')
            return None

    async def scrape_url(self, url: str, max_chars: int) -> Optional[Dict[str, Any]]:
        """Scrape one URL, dispatching to the PDF or HTML handler."""
        logger.info(f'Starting to scrape URL: {url}')
        if self._is_pdf_url(url):
            return await self.download_and_parse_pdf(url, max_chars)
        return await self.download_and_parse_html(url, max_chars)

    async def scrape_urls(self, urls: list, max_chars: int) -> list:
        """Scrape several URLs concurrently.

        Returns one entry per input URL, in order; failed URLs yield None.
        """
        logger.info(f'Starting parallel scraping of {len(urls)} URLs')
        tasks = [self.scrape_url(url, max_chars) for url in urls]
        return await asyncio.gather(*tasks)
327
+
328
async def scrape_urls_parallel(results: list, max_chars: int) -> list:
    """Scrape the URL of every search result and keep the ones that succeeded.

    Args:
        results: Search-result dicts, each carrying a "url" key.
        max_chars: Maximum number of characters to keep per scraped page.

    Returns:
        A list of (result, scraped_article) tuples for every result whose
        scrape did not return None, in the original order.
    """
    logger.info(f'Initializing parallel scraping for {len(results)} results')
    async with ParallelScraper() as scraper:
        scraped_data = await scraper.scrape_urls(
            [entry["url"] for entry in results], max_chars
        )

        # Pair each result with its scrape and drop the failures.
        valid_results = [
            (entry, article)
            for entry, article in zip(results, scraped_data)
            if article is not None
        ]

        logger.info(f'Successfully scraped {len(valid_results)} valid results')
        return valid_results
343
+
344
  async def get_available_engines(session, base_url, headers):
345
  """Fetch available search engines from SearxNG instance."""
346
+ logger.info("Fetching available search engines")
347
  try:
 
348
  params = {
349
  "q": "test",
350
  "format": "json",
 
353
  async with session.get(f"{base_url}/search", headers=headers, params=params) as response:
354
  data = await response.json()
355
  available_engines = set()
 
356
  if "search" in data:
357
  for engine_data in data["search"]:
358
  if isinstance(engine_data, dict) and "engine" in engine_data:
359
  available_engines.add(engine_data["engine"])
360
 
 
361
  if not available_engines:
362
  async with session.get(f"{base_url}/engines", headers=headers) as response:
363
  engines_data = await response.json()
364
  available_engines = set(engine["name"] for engine in engines_data if engine.get("enabled", True))
365
 
366
+ logger.info(f'Found {len(available_engines)} available engines')
367
  return list(available_engines)
368
  except Exception as e:
369
+ logger.error(f'Error fetching search engines: {e}')
 
370
  return ["google", "bing", "duckduckgo", "brave", "wikipedia"]
371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  def normalize_scores(scores):
373
  """Normalize scores to [0, 1] range using min-max normalization"""
374
  if not isinstance(scores, np.ndarray):
 
389
 
390
async def calculate_bm25(query, documents):
    """Score each document against the query with BM25, normalized via normalize_scores.

    Returns an empty list when there are no documents, and a list of zeros
    (one per document) if scoring raises.
    """
    logger.info("Calculating BM25 scores")
    try:
        if not documents:
            return []

        # Whitespace tokenization for both corpus and query.
        tokenized_corpus = [text.split() for text in documents]
        ranker = BM25Okapi(tokenized_corpus)
        raw_scores = ranker.get_scores(query.split())

        normalized_scores = normalize_scores(raw_scores)
        logger.info("BM25 scores calculated successfully")
        return normalized_scores

    except Exception as e:
        logger.error(f'Error calculating BM25 scores: {e}')
        return [0] * len(documents)
406
 
407
# Loaded SentenceTransformer instances keyed by model name, so the model is
# loaded once per process instead of on every scoring call.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return a cached SentenceTransformer for model_name, loading it on first use."""
    model = _EMBEDDING_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODEL_CACHE[model_name] = model
    return model


async def calculate_tfidf(query, documents, measure="cosine"):
    """Calculate similarity scores between the query and each document.

    Despite the name, this uses sentence embeddings
    ('all-MiniLM-L6-v2') and cosine similarity, not TF-IDF vectors.

    Args:
        query: The search query text.
        documents: List of document texts to score.
        measure: Similarity measure; only "cosine" is supported.

    Returns:
        An empty list when there are no documents; otherwise the scores
        passed through normalize_scores. On any error (including an
        unsupported ``measure``) a list of zeros, one per document.
    """
    logger.info("Calculating TF-IDF scores")
    try:
        if not documents:
            return []

        # Reject unsupported measures before paying for the encoding.
        if measure != "cosine":
            raise ValueError("Unsupported similarity measure.")

        model = _get_embedding_model('all-MiniLM-L6-v2')
        logger.info("Encoding query and documents")
        query_embedding = model.encode(query)
        document_embeddings = model.encode(documents)

        # Unit-normalize so the dot product below equals cosine similarity.
        query_embedding = query_embedding / np.linalg.norm(query_embedding)
        document_embeddings = document_embeddings / np.linalg.norm(document_embeddings, axis=1)[:, np.newaxis]

        scores = np.dot(document_embeddings, query_embedding)
        normalized_scores = normalize_scores(scores)
        logger.info("TF-IDF scores calculated successfully")
        return normalized_scores

    except Exception as e:
        logger.error(f'Error calculating TF-IDF scores: {e}')
        return [0] * len(documents)
433
 
434
  def combine_scores(bm25_score, tfidf_score, weights=(0.5, 0.5)):
 
462
  return combine_scores(bm25_score, tfidf_score)
463
 
464
  async def generate_summary(query: str, articles: List[Dict[str, Any]], temperature: float = 0.7) -> str:
465
+ """Generate a summary of the articles using Groq's LLama 3.1 8b model."""
466
+ logger.info(f'Generating summary for query: {query}')
 
467
  try:
 
468
  json_input = json.dumps(articles, indent=2)
469
 
470
  system_prompt = """You are Sentinel, a world-class AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
 
490
  12. Make sure the answer is not short and is informative.
491
  13. Your response should be detailed, informative, accurate, and directly relevant to the user's query."""
492
 
493
+ logger.info("Sending request to Groq API")
494
  messages = [
495
  {"role": "system", "content": system_prompt},
496
  {"role": "user", "content": user_prompt}
 
498
 
499
  response = groq_client.chat.completions.create(
500
  messages=messages,
501
+ model="llama-3.1-70b-versatile",
502
  max_tokens=5000,
503
  temperature=temperature,
504
  top_p=0.9,
 
506
  stream=False
507
  )
508
 
509
+ logger.info("Summary generated successfully")
510
  return response.choices[0].message.content.strip()
511
 
512
  except Exception as e:
513
+ logger.error(f'Error generating summary: {e}')
514
  return f"Error generating summary: {str(e)}"
515
 
516
  class ChatBot:
517
  def __init__(self):
518
+ logger.info("Initializing ChatBot")
519
  self.scoring_method = ScoringMethod.COMBINED
520
  self.num_results = 10
521
  self.max_chars = 10000
522
  self.score_threshold = 0.8
523
  self.temperature = 0.1
524
+ self.conversation_history = []
525
+ self.base_url = "https://shreyas094-searxng-local.hf.space"
526
  self.headers = {
527
  "X-Searx-API-Key": "f9f07f93b37b8483aadb5ba717f556f3a4ac507b281b4ca01e6c6288aa3e3ae5"
528
  }
 
540
  "ja": "Japanese",
541
  "ko": "Korean"
542
  }
543
+ logger.info("ChatBot initialized successfully")
544
+
545
def format_chat_history(self, history: List[List[str]]) -> str:
    """Render the chat history as numbered, speaker-labelled lines.

    Each entry of ``history`` is unpacked as a ``(user message, assistant
    message)`` pair. Turns are numbered starting at 1.

    Args:
        history: Sequence of two-item entries, user message first.

    Returns:
        The rendered lines joined with newlines; an empty string when
        ``history`` is empty.
    """
    lines = []
    turn_number = 0
    for user_msg, assistant_msg in history:
        turn_number += 1
        lines.append(f"Turn {turn_number}:")
        lines.append(f"User: {user_msg}")
        # A falsy assistant message (None or "") means there is no reply
        # to show for this turn, so its line is left out.
        if not assistant_msg:
            continue
        lines.append(f"Assistant: {assistant_msg}")
    return "\n".join(lines)
554
 
555
  async def get_search_results(self,
556
  query: str,
557
+ history: List[List[str]],
558
  num_results: int,
559
  max_chars: int,
560
  score_threshold: float,
561
  temperature: float,
562
+ scoring_method: str,
563
  selected_engines: List[str],
564
  safe_search: str,
565
  language: str) -> str:
566
+ logger.info(f'Processing search request for query: {query}')
567
  try:
568
+ # First, rephrase the query using chat history
569
+ rephrased_query = await rephrase_query(history, query, temperature=0.2)
570
+ logger.info(f'Original query: {query}')
571
+ logger.info(f'Rephrased query: {rephrased_query}')
572
+
573
  scoring_method_map = {
574
  "BM25": ScoringMethod.BM25,
575
  "TF-IDF": ScoringMethod.TFIDF,
576
  "Combined": ScoringMethod.COMBINED
577
  }
578
+ self.scoring_method = scoring_method_map[scoring_method]
579
 
580
  safe_search_map = dict(SAFE_SEARCH_OPTIONS)
581
  safe_search_value = safe_search_map.get(safe_search, SafeSearch.MODERATE.value)
582
 
583
+ logger.info(f'Search parameters - Engines: {selected_engines}, Results: {num_results}, Method: {scoring_method}')
584
+
585
+ # Use the rephrased query for the search
586
  async with aiohttp.ClientSession() as session:
 
 
 
 
587
  params = {
588
+ "q": rephrased_query, # Use rephrased query here
589
  "format": "json",
590
  "engines": ",".join(selected_engines),
591
  "limit": num_results,
 
595
  if language != "all":
596
  params["language"] = language
597
 
598
+ logger.info("Sending search request to SearxNG")
599
  try:
600
  async with session.get(f"{self.base_url}/search", headers=self.headers, params=params) as response:
601
  data = await response.json()
602
  except Exception as e:
603
+ logger.error(f'SearxNG connection error: {e}')
604
  return f"Error: Could not connect to search service. Please check if SearxNG is running at {self.base_url}. Error: {str(e)}"
605
 
606
  if "results" not in data or not data["results"]:
607
+ logger.info("No search results found")
608
  return "No results found."
609
 
610
  results = data["results"][:num_results]
611
+ logger.info(f'Processing {len(results)} search results')
612
  valid_results = await scrape_urls_parallel(results, max_chars)
613
 
614
  if not valid_results:
615
+ logger.info("No valid articles found after scraping")
616
  return "No valid articles found after scraping."
617
 
618
  results, scraped_data = zip(*valid_results)
619
  contents = [article["content"] for article in scraped_data]
620
 
621
+ logger.info("Calculating document scores")
622
  scores = await get_document_scores(query, contents, self.scoring_method)
623
 
624
  scored_articles = []
 
646
  unique_articles.append(article)
647
 
648
  # Generate summary using Groq API
649
+ summary = await generate_summary(query, unique_articles, temperature)
650
 
651
+ # Update the response format to use scoring_method instead of scoring_method_str
652
  response = f"**Search Parameters:**\n"
653
  response += f"- Results: {num_results}\n"
654
  response += f"- Max Characters: {max_chars}\n"
655
  response += f"- Score Threshold: {score_threshold}\n"
656
  response += f"- Temperature: {temperature}\n"
657
+ response += f"- Scoring Method: {scoring_method}\n" # Updated this line
658
  response += f"- Search Engines: {', '.join(selected_engines)}\n"
659
  response += f"- Safe Search: Level {safe_search_value}\n"
660
  response += f"- Language: {self.available_languages.get(language, language)}\n\n"
 
669
  return response
670
 
671
  except Exception as e:
672
+ logger.error(f'Error in search_and_summarize: {e}')
673
  return f"Error occurred: {str(e)}"
674
 
675
async def get_response(self,
                       query: str,
                       history: List[List[str]],
                       num_results: int,
                       max_chars: int,
                       score_threshold: float,
                       temperature: float,
                       scoring_method: str,
                       selected_engines: List[str],
                       safe_search: str,
                       language: str,
                       force_web_search: bool = False) -> str:
    """Answer ``query`` via the knowledge-base handler or via web search.

    The formatted chat history is logged for context. Unless
    ``force_web_search`` is set, ``determine_query_type`` is awaited to
    classify the query; a ``QueryType.KNOWLEDGE_BASE`` result is routed to
    ``process_knowledge_base_query``, anything else to
    ``self.get_search_results``.

    Args:
        query: The user's current message.
        history: Prior conversation entries, passed through to the handlers.
        num_results, max_chars, score_threshold, scoring_method,
        selected_engines, safe_search, language: Forwarded unchanged to
            ``get_search_results`` on the web-search path.
        temperature: Forwarded to whichever handler is used, and to
            ``determine_query_type``.
        force_web_search: When true, skip classification and always use
            web search.

    Returns:
        The handler's response text, or an apology string containing the
        error message if anything inside the routing logic raises.
    """
    logger.info(f'Processing query: {query}')
    try:
        # The formatted history is only used for logging here; the raw
        # ``history`` is what the handlers receive.
        conversation_context = self.format_chat_history(history)
        logger.info(f'Current conversation context:\n{conversation_context}')

        if not force_web_search:
            # Classify the query using the conversation as context.
            query_type = await determine_query_type(query, history, temperature)
        else:
            logger.info('Force web search mode enabled - bypassing query type determination')
            query_type = QueryType.WEB_SEARCH

        answer_from_knowledge_base = (
            query_type == QueryType.KNOWLEDGE_BASE and not force_web_search
        )

        if not answer_from_knowledge_base:
            logger.info('Using web search to answer query')
            response = await self.get_search_results(
                query=query,
                history=history,
                num_results=num_results,
                max_chars=max_chars,
                score_threshold=score_threshold,
                temperature=temperature,
                scoring_method=scoring_method,
                selected_engines=selected_engines,
                safe_search=safe_search,
                language=language
            )
        else:
            logger.info('Using knowledge base to answer query')
            response = await process_knowledge_base_query(
                query=query,
                chat_history=history,
                temperature=temperature
            )

        logger.info(f'Generated response type: {query_type}')
        return response

    except Exception as e:
        # Top-level boundary for the chat UI: report the failure as a
        # message instead of propagating it.
        logger.error(f'Error in get_response: {e}')
        return f"I apologize, but I encountered an error: {str(e)}"
730
+
731
  def chat(self,
732
  message: str,
733
  history: List[List[str]],
 
738
  scoring_method: str,
739
  engines: List[str],
740
  safe_search: str,
741
+ language: str,
742
+ force_web_search: bool) -> str:
743
+ """Process chat messages with context and return responses."""
744
+ # Extract language code and process response
 
745
  language_code = language.split(" - ")[0]
746
 
747
+ # Update conversation history from the Gradio history
748
+ self.conversation_history = history
749
+
750
+ response = asyncio.run(self.get_response(
751
  message,
752
+ self.conversation_history,
753
  num_results,
754
  max_chars,
755
  score_threshold,
 
757
  scoring_method,
758
  engines,
759
  safe_search,
760
+ language_code,
761
+ force_web_search
762
  ))
763
  return response
764
 
 
767
 
768
  # Define language options
769
  language_choices = [
770
+ "all", "en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"
 
 
 
 
 
 
 
 
 
 
771
  ]
772
 
773
  # Create mapping for language display names
 
841
  value="all - All Languages",
842
  label="Language",
843
  info="Select the preferred language for search results"
844
+ ),
845
+ gr.Radio(
846
+ choices=["Auto (Knowledge Base + Web)", "Web Search Only"],
847
+ value="Auto (Knowledge Base + Web)",
848
+ label="Search Mode",
849
+ info="Choose whether to use both knowledge base and web search, or force web search only"
850
  )
851
  ],
852
  additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
853
+ retry_btn="Retry",
854
+ undo_btn="Undo",
855
+ clear_btn="Clear",
856
  chatbot=gr.Chatbot(
857
  show_copy_button=True,
858
+ likeable=True,
859
  layout="bubble",
860
  height=500,
861
  )
 
883
  - **Language**: Preferred language for search results
884
  - All languages: No language restriction
885
  - Specific languages: Filter results to selected language
886
+ - **Search Mode**: Control how queries are processed
887
+ - Auto: Automatically choose between knowledge base and web search
888
+ - Web Search Only: Always use web search regardless of query type
889
  """
890
 
891
  if __name__ == "__main__":