Shreyas094 committed on
Commit
4ea7d03
·
verified ·
1 Parent(s): 986478e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -70
app.py CHANGED
@@ -20,6 +20,8 @@ from typing import List, Dict, Any, Set, Optional
20
  from dotenv import load_dotenv
21
  from concurrent.futures import ThreadPoolExecutor
22
  from datetime import datetime
 
 
23
 
24
  # Configure logging
25
  logging.basicConfig(
@@ -127,6 +129,19 @@ Classify as "knowledge_base" if the query:
127
  logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
128
  return QueryType.KNOWLEDGE_BASE
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
131
  """Handle queries that can be answered from the knowledge base, with context."""
132
  logger.info(f'Processing knowledge base query: {query}')
@@ -174,23 +189,10 @@ Guidelines:
174
  return f"I apologize, but I encountered an error while processing your query: {str(e)}"
175
 
176
  async def rephrase_query(chat_history, query, temperature=0.2) -> str:
177
- """Rephrase the query based on chat history and context while preserving URLs."""
178
  logger.info(f'Rephrasing query: {query}')
179
 
180
  try:
181
- # Extract URLs from the query
182
- url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
183
- urls = re.findall(url_pattern, query)
184
-
185
- # If URLs are found, store them and replace with placeholders
186
- url_placeholders = {}
187
- modified_query = query
188
- if urls:
189
- for idx, url in enumerate(urls):
190
- placeholder = f"__URL_{idx}__"
191
- url_placeholders[placeholder] = url
192
- modified_query = modified_query.replace(url, placeholder)
193
-
194
  # Format recent conversation history (last 3 turns for context)
195
  formatted_history = []
196
  for i, (user_msg, assistant_msg) in enumerate(chat_history[-3:], 1):
@@ -206,41 +208,42 @@ async def rephrase_query(chat_history, query, temperature=0.2) -> str:
206
 
207
  Key Rules:
208
  1. For follow-up questions or queries referencing previous conversation:
209
- - Extract the main topic/subject from previous messages
210
- - Combine previous context with the current query
211
- - Example: Previous: "What is the structure of German banking industry?"
212
- Current: "can you do more latest web search on my previous query"
213
- Should become: "Latest structure and developments in German banking industry after: 2024"
214
 
215
  2. Entity Handling:
216
- - Identify and preserve main entities from context
217
- - Enclose ONLY entity names in double quotes
218
- - Example: "Deutsche Bank" profits, not "Deutsche Bank profits"
219
- - Preserve URL placeholders exactly as they appear (marked with __URL_N__)
220
 
221
  3. Date and Time Context:
222
- - For queries about current/latest information:
223
- * Keep time-related words (latest, current, recent, now)
224
- * ALWAYS append "after: YYYY" (current year)
225
- - For specific time periods:
226
- * Preserve the original time reference
227
- * Add appropriate "after: YYYY" based on context
228
- - For queries without time reference:
229
- * Add "after: YYYY" if about current state/status
230
 
231
  4. Query Formatting:
232
- - Capitalize first letter
233
- - No period at end
234
- - Include all relevant context
235
- - Maintain clear and searchable structure
236
- - IMPORTANT: Keep URL placeholders (__URL_N__) exactly as they appear"""
 
237
 
238
  messages = [
239
  {"role": "system", "content": system_prompt},
240
  {"role": "user", "content": f"""Current year: {current_year}
241
  Recent conversation history:
242
  {chat_context}
243
- Current query: {modified_query}
 
244
  Please rephrase this query into a complete, contextual search query following the rules above. The rephrased query should be clear and complete even without the conversation context."""}
245
  ]
246
 
@@ -251,18 +254,14 @@ Please rephrase this query into a complete, contextual search query following th
251
  max_tokens=200,
252
  stream=False
253
  )
254
-
255
  rephrased_query = response.choices[0].message.content.strip()
256
-
257
- # Replace placeholders with original URLs
258
- for placeholder, url in url_placeholders.items():
259
- rephrased_query = rephrased_query.replace(placeholder, url)
260
-
261
  logger.info(f'Query rephrased to: {rephrased_query}')
262
  return rephrased_query
263
 
264
  except Exception as e:
265
  logger.error(f'Error rephrasing query: {e}')
 
266
  # If rephrasing fails, construct a basic contextual query
267
  try:
268
  last_query = chat_history[-1][0] if chat_history else ""
@@ -583,6 +582,7 @@ class ChatBot:
583
  formatted_history.append(f"Assistant: {assistant_msg}")
584
  return "\n".join(formatted_history)
585
 
 
586
  async def get_search_results(self,
587
  query: str,
588
  history: List[List[str]],
@@ -704,46 +704,59 @@ class ChatBot:
704
  return f"Error occurred: {str(e)}"
705
 
706
  async def get_response(self,
707
- query: str,
708
- history: List[List[str]],
709
- num_results: int,
710
- max_chars: int,
711
- score_threshold: float,
712
- temperature: float,
713
- scoring_method: str,
714
- selected_engines: List[str],
715
- safe_search: str,
716
- language: str,
717
- force_web_search: bool = False) -> str:
718
- """Determine query type and route to appropriate handler with context."""
719
  logger.info(f'Processing query: {query}')
720
  try:
721
- # Update conversation history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  formatted_history = self.format_chat_history(history)
723
- logger.info(f'Current conversation context:\n{formatted_history}')
724
-
725
- # Convert the force_web_search radio button value to boolean
726
  force_web_search = force_web_search == "Web Search Only"
727
- logger.info(f'Force web search mode: {force_web_search}')
728
-
729
- # If force_web_search is True, skip query type determination
730
  if force_web_search:
731
- logger.info('Force web search mode enabled - bypassing query type determination')
732
  query_type = QueryType.WEB_SEARCH
733
  else:
734
- # Determine query type with context
735
  query_type = await determine_query_type(query, history, temperature)
736
- logger.info(f'Query type determined as: {query_type}')
737
 
738
  if query_type == QueryType.KNOWLEDGE_BASE and not force_web_search:
739
- logger.info('Using knowledge base to answer query')
740
  response = await process_knowledge_base_query(
741
  query=query,
742
  chat_history=history,
743
  temperature=temperature
744
  )
745
  else:
746
- logger.info('Using web search to answer query')
747
  response = await self.get_search_results(
748
  query=query,
749
  history=history,
@@ -756,10 +769,9 @@ class ChatBot:
756
  safe_search=safe_search,
757
  language=language
758
  )
759
-
760
- logger.info(f'Generated response type: {query_type}')
761
  return response
762
-
763
  except Exception as e:
764
  logger.error(f'Error in get_response: {e}')
765
  return f"I apologize, but I encountered an error: {str(e)}"
 
20
  from dotenv import load_dotenv
21
  from concurrent.futures import ThreadPoolExecutor
22
  from datetime import datetime
23
+ import re
24
+ from urllib.parse import urlparse
25
 
26
  # Configure logging
27
  logging.basicConfig(
 
129
  logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
130
  return QueryType.KNOWLEDGE_BASE
131
 
132
def is_valid_url(url: str) -> bool:
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: Candidate URL string, e.g. ``"https://example.com/page"``.

    Returns:
        ``True`` if ``urlparse`` yields a non-empty scheme and netloc,
        ``False`` otherwise (including when the value cannot be parsed).
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed input such as an unterminated IPv6 literal.
        # AttributeError/TypeError: a non-string value was passed in.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return False
    return bool(parsed.scheme and parsed.netloc)
139
+
140
def extract_urls(text: str) -> List[str]:
    """Extract complete http(s) URLs from free-form text.

    The whole URL is captured, including path, query string and fragment,
    so callers fetch the page the user actually referenced rather than
    just the site root. Punctuation that belongs to the surrounding
    sentence is trimmed from the end of each match.

    Args:
        text: Arbitrary text that may contain URLs.

    Returns:
        The URLs in order of appearance; an empty list when *text* is
        empty or contains no URLs.
    """
    if not text:
        return []

    # Everything up to whitespace or a character that cannot appear
    # unescaped in a URL (angle brackets, quotes, backtick).
    url_pattern = r'https?://[^\s<>"\'`]+'
    sentence_punctuation = '.,;:!?'
    closing_to_opening = {')': '(', ']': '[', '}': '{'}

    urls = []
    for url in re.findall(url_pattern, text):
        while url:
            last = url[-1]
            if last in sentence_punctuation:
                url = url[:-1]
            elif (last in closing_to_opening
                  and url.count(last) > url.count(closing_to_opening[last])):
                # Unbalanced closer: it wraps the URL ("(see https://x.y)")
                # rather than being part of it ("/wiki/Foo_(bar)").
                url = url[:-1]
            else:
                break
        # Drop matches that are nothing but a scheme after trimming.
        if url.split('://', 1)[-1]:
            urls.append(url)
    return urls
144
+
145
  async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
146
  """Handle queries that can be answered from the knowledge base, with context."""
147
  logger.info(f'Processing knowledge base query: {query}')
 
189
  return f"I apologize, but I encountered an error while processing your query: {str(e)}"
190
 
191
  async def rephrase_query(chat_history, query, temperature=0.2) -> str:
192
+ """Rephrase the query based on chat history and context."""
193
  logger.info(f'Rephrasing query: {query}')
194
 
195
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  # Format recent conversation history (last 3 turns for context)
197
  formatted_history = []
198
  for i, (user_msg, assistant_msg) in enumerate(chat_history[-3:], 1):
 
208
 
209
  Key Rules:
210
  1. For follow-up questions or queries referencing previous conversation:
211
+ - Extract the main topic/subject from previous messages
212
+ - Combine previous context with the current query
213
+ - Example: Previous: "What is the structure of German banking industry?"
214
+ Current: "can you do more latest web search on my previous query"
215
+ Should become: "Latest structure and developments in German banking industry after: 2024"
216
 
217
  2. Entity Handling:
218
+ - Identify and preserve main entities from context
219
+ - Enclose ONLY entity names in double quotes
220
+ - Example: "Deutsche Bank" profits, not "Deutsche Bank profits"
 
221
 
222
  3. Date and Time Context:
223
+ - For queries about current/latest information:
224
+ * Keep time-related words (latest, current, recent, now)
225
+ * ALWAYS append "after: YYYY" (current year)
226
+ - For specific time periods:
227
+ * Preserve the original time reference
228
+ * Add appropriate "after: YYYY" based on context
229
+ - For queries without time reference:
230
+ * Add "after: YYYY" if about current state/status
231
 
232
  4. Query Formatting:
233
+ - Capitalize first letter
234
+ - No period at end
235
+ - Include all relevant context
236
+ - Maintain clear and searchable structure
237
+
238
+ Remember: Your goal is to create a complete, self-contained query that includes all necessary context from the conversation history."""
239
 
240
  messages = [
241
  {"role": "system", "content": system_prompt},
242
  {"role": "user", "content": f"""Current year: {current_year}
243
  Recent conversation history:
244
  {chat_context}
245
+ Current query: {query}
246
+
247
  Please rephrase this query into a complete, contextual search query following the rules above. The rephrased query should be clear and complete even without the conversation context."""}
248
  ]
249
 
 
254
  max_tokens=200,
255
  stream=False
256
  )
257
+
258
  rephrased_query = response.choices[0].message.content.strip()
 
 
 
 
 
259
  logger.info(f'Query rephrased to: {rephrased_query}')
260
  return rephrased_query
261
 
262
  except Exception as e:
263
  logger.error(f'Error rephrasing query: {e}')
264
+
265
  # If rephrasing fails, construct a basic contextual query
266
  try:
267
  last_query = chat_history[-1][0] if chat_history else ""
 
582
  formatted_history.append(f"Assistant: {assistant_msg}")
583
  return "\n".join(formatted_history)
584
 
585
+
586
  async def get_search_results(self,
587
  query: str,
588
  history: List[List[str]],
 
704
  return f"Error occurred: {str(e)}"
705
 
706
  async def get_response(self,
707
+ query: str,
708
+ history: List[List[str]],
709
+ num_results: int,
710
+ max_chars: int,
711
+ score_threshold: float,
712
+ temperature: float,
713
+ scoring_method: str,
714
+ selected_engines: List[str],
715
+ safe_search: str,
716
+ language: str,
717
+ force_web_search: bool = False) -> str:
718
+ """Enhanced get_response method with URL scraping capability."""
719
  logger.info(f'Processing query: {query}')
720
  try:
721
+ # Extract URLs from the query
722
+ urls = extract_urls(query)
723
+
724
+ # If valid URLs are found in the query, directly scrape them
725
+ if urls:
726
+ logger.info(f'Found URLs in query: {urls}')
727
+ articles = await self.scrape_specific_urls(urls, max_chars)
728
+
729
+ if not articles:
730
+ return "I couldn't extract valid content from the provided URLs. Please check if the URLs are accessible."
731
+
732
+ # Generate summary using only the scraped content
733
+ summary = await generate_summary(query, articles, temperature)
734
+
735
+ # Format response
736
+ response = "**Direct URL Scraping Results:**\n\n"
737
+ response += summary + "\n\n"
738
+ response += "**Scraped URLs:**\n"
739
+ for i, article in enumerate(articles, 1):
740
+ response += f"{i}. [{urlparse(article['url']).netloc}]({article['url']})\n"
741
+
742
+ return response
743
+
744
+ # If no URLs found, proceed with regular query processing
745
  formatted_history = self.format_chat_history(history)
 
 
 
746
  force_web_search = force_web_search == "Web Search Only"
747
+
 
 
748
  if force_web_search:
 
749
  query_type = QueryType.WEB_SEARCH
750
  else:
 
751
  query_type = await determine_query_type(query, history, temperature)
 
752
 
753
  if query_type == QueryType.KNOWLEDGE_BASE and not force_web_search:
 
754
  response = await process_knowledge_base_query(
755
  query=query,
756
  chat_history=history,
757
  temperature=temperature
758
  )
759
  else:
 
760
  response = await self.get_search_results(
761
  query=query,
762
  history=history,
 
769
  safe_search=safe_search,
770
  language=language
771
  )
772
+
 
773
  return response
774
+
775
  except Exception as e:
776
  logger.error(f'Error in get_response: {e}')
777
  return f"I apologize, but I encountered an error: {str(e)}"