Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -20,6 +20,8 @@ from typing import List, Dict, Any, Set, Optional
|
|
20 |
from dotenv import load_dotenv
|
21 |
from concurrent.futures import ThreadPoolExecutor
|
22 |
from datetime import datetime
|
|
|
|
|
23 |
|
24 |
# Configure logging
|
25 |
logging.basicConfig(
|
@@ -127,6 +129,19 @@ Classify as "knowledge_base" if the query:
|
|
127 |
logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
|
128 |
return QueryType.KNOWLEDGE_BASE
|
129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
|
131 |
"""Handle queries that can be answered from the knowledge base, with context."""
|
132 |
logger.info(f'Processing knowledge base query: {query}')
|
@@ -174,23 +189,10 @@ Guidelines:
|
|
174 |
return f"I apologize, but I encountered an error while processing your query: {str(e)}"
|
175 |
|
176 |
async def rephrase_query(chat_history, query, temperature=0.2) -> str:
|
177 |
-
"""Rephrase the query based on chat history and context
|
178 |
logger.info(f'Rephrasing query: {query}')
|
179 |
|
180 |
try:
|
181 |
-
# Extract URLs from the query
|
182 |
-
url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
|
183 |
-
urls = re.findall(url_pattern, query)
|
184 |
-
|
185 |
-
# If URLs are found, store them and replace with placeholders
|
186 |
-
url_placeholders = {}
|
187 |
-
modified_query = query
|
188 |
-
if urls:
|
189 |
-
for idx, url in enumerate(urls):
|
190 |
-
placeholder = f"__URL_{idx}__"
|
191 |
-
url_placeholders[placeholder] = url
|
192 |
-
modified_query = modified_query.replace(url, placeholder)
|
193 |
-
|
194 |
# Format recent conversation history (last 3 turns for context)
|
195 |
formatted_history = []
|
196 |
for i, (user_msg, assistant_msg) in enumerate(chat_history[-3:], 1):
|
@@ -206,41 +208,42 @@ async def rephrase_query(chat_history, query, temperature=0.2) -> str:
|
|
206 |
|
207 |
Key Rules:
|
208 |
1. For follow-up questions or queries referencing previous conversation:
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
|
215 |
2. Entity Handling:
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
- Preserve URL placeholders exactly as they appear (marked with __URL_N__)
|
220 |
|
221 |
3. Date and Time Context:
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
|
231 |
4. Query Formatting:
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
|
|
237 |
|
238 |
messages = [
|
239 |
{"role": "system", "content": system_prompt},
|
240 |
{"role": "user", "content": f"""Current year: {current_year}
|
241 |
Recent conversation history:
|
242 |
{chat_context}
|
243 |
-
Current query: {
|
|
|
244 |
Please rephrase this query into a complete, contextual search query following the rules above. The rephrased query should be clear and complete even without the conversation context."""}
|
245 |
]
|
246 |
|
@@ -251,18 +254,14 @@ Please rephrase this query into a complete, contextual search query following th
|
|
251 |
max_tokens=200,
|
252 |
stream=False
|
253 |
)
|
254 |
-
|
255 |
rephrased_query = response.choices[0].message.content.strip()
|
256 |
-
|
257 |
-
# Replace placeholders with original URLs
|
258 |
-
for placeholder, url in url_placeholders.items():
|
259 |
-
rephrased_query = rephrased_query.replace(placeholder, url)
|
260 |
-
|
261 |
logger.info(f'Query rephrased to: {rephrased_query}')
|
262 |
return rephrased_query
|
263 |
|
264 |
except Exception as e:
|
265 |
logger.error(f'Error rephrasing query: {e}')
|
|
|
266 |
# If rephrasing fails, construct a basic contextual query
|
267 |
try:
|
268 |
last_query = chat_history[-1][0] if chat_history else ""
|
@@ -583,6 +582,7 @@ class ChatBot:
|
|
583 |
formatted_history.append(f"Assistant: {assistant_msg}")
|
584 |
return "\n".join(formatted_history)
|
585 |
|
|
|
586 |
async def get_search_results(self,
|
587 |
query: str,
|
588 |
history: List[List[str]],
|
@@ -704,46 +704,59 @@ class ChatBot:
|
|
704 |
return f"Error occurred: {str(e)}"
|
705 |
|
706 |
async def get_response(self,
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
718 |
-
"""
|
719 |
logger.info(f'Processing query: {query}')
|
720 |
try:
|
721 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
722 |
formatted_history = self.format_chat_history(history)
|
723 |
-
logger.info(f'Current conversation context:\n{formatted_history}')
|
724 |
-
|
725 |
-
# Convert the force_web_search radio button value to boolean
|
726 |
force_web_search = force_web_search == "Web Search Only"
|
727 |
-
|
728 |
-
|
729 |
-
# If force_web_search is True, skip query type determination
|
730 |
if force_web_search:
|
731 |
-
logger.info('Force web search mode enabled - bypassing query type determination')
|
732 |
query_type = QueryType.WEB_SEARCH
|
733 |
else:
|
734 |
-
# Determine query type with context
|
735 |
query_type = await determine_query_type(query, history, temperature)
|
736 |
-
logger.info(f'Query type determined as: {query_type}')
|
737 |
|
738 |
if query_type == QueryType.KNOWLEDGE_BASE and not force_web_search:
|
739 |
-
logger.info('Using knowledge base to answer query')
|
740 |
response = await process_knowledge_base_query(
|
741 |
query=query,
|
742 |
chat_history=history,
|
743 |
temperature=temperature
|
744 |
)
|
745 |
else:
|
746 |
-
logger.info('Using web search to answer query')
|
747 |
response = await self.get_search_results(
|
748 |
query=query,
|
749 |
history=history,
|
@@ -756,10 +769,9 @@ class ChatBot:
|
|
756 |
safe_search=safe_search,
|
757 |
language=language
|
758 |
)
|
759 |
-
|
760 |
-
logger.info(f'Generated response type: {query_type}')
|
761 |
return response
|
762 |
-
|
763 |
except Exception as e:
|
764 |
logger.error(f'Error in get_response: {e}')
|
765 |
return f"I apologize, but I encountered an error: {str(e)}"
|
|
|
20 |
from dotenv import load_dotenv
|
21 |
from concurrent.futures import ThreadPoolExecutor
|
22 |
from datetime import datetime
|
23 |
+
import re
|
24 |
+
from urllib.parse import urlparse
|
25 |
|
26 |
# Configure logging
|
27 |
logging.basicConfig(
|
|
|
129 |
logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
|
130 |
return QueryType.KNOWLEDGE_BASE
|
131 |
|
132 |
+
def is_valid_url(url: str) -> bool:
    """Return True when *url* parses with both a scheme and a network location.

    The string is handed to ``urllib.parse.urlparse``; it counts as valid only
    if the parsed result has a non-empty ``scheme`` and a non-empty ``netloc``.
    No network access is performed and reachability is not checked.

    Args:
        url: Candidate URL string.

    Returns:
        True if both scheme and netloc are present, otherwise False. Input
        that ``urlparse`` rejects yields False instead of raising.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed netlocs (e.g. an unclosed
        # IPv6 bracket); non-string input surfaces as TypeError/AttributeError.
        # Catch only these so KeyboardInterrupt/SystemExit still propagate.
        return False
    return bool(parsed.scheme and parsed.netloc)
|
139 |
+
|
140 |
+
def extract_urls(text: str) -> List[str]:
    """Extract http(s) URLs from free text, including path, query and fragment.

    Each URL is matched from its ``http://`` or ``https://`` prefix up to the
    next whitespace, quote or angle bracket. Trailing sentence punctuation and
    closing brackets that have no matching opener inside the URL are trimmed,
    so ``"(see https://example.com/page)."`` yields
    ``https://example.com/page``.

    Args:
        text: Text to scan. An empty or None value yields an empty list.

    Returns:
        The URLs found, in order of appearance (duplicates are kept).
    """
    if not text:
        return []

    # Match everything up to whitespace or a delimiter that cannot appear
    # unescaped in a URL, so path, query string and fragment are kept.
    url_pattern = r'https?://[^\s<>"\']+'
    trailing_punctuation = '.,;:!?'
    closing_to_opening = {')': '(', ']': '[', '}': '{'}

    urls = []
    for candidate in re.findall(url_pattern, text):
        url = candidate.rstrip(trailing_punctuation)
        # Drop a closing bracket only when it is unbalanced; URLs such as
        # .../Python_(programming_language) legitimately end with ")".
        while (url and url[-1] in closing_to_opening
               and url.count(url[-1]) > url.count(closing_to_opening[url[-1]])):
            url = url[:-1].rstrip(trailing_punctuation)
        # Skip matches that were nothing but a scheme plus punctuation.
        if url.partition('://')[2]:
            urls.append(url)
    return urls
|
144 |
+
|
145 |
async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
|
146 |
"""Handle queries that can be answered from the knowledge base, with context."""
|
147 |
logger.info(f'Processing knowledge base query: {query}')
|
|
|
189 |
return f"I apologize, but I encountered an error while processing your query: {str(e)}"
|
190 |
|
191 |
async def rephrase_query(chat_history, query, temperature=0.2) -> str:
|
192 |
+
"""Rephrase the query based on chat history and context."""
|
193 |
logger.info(f'Rephrasing query: {query}')
|
194 |
|
195 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
# Format recent conversation history (last 3 turns for context)
|
197 |
formatted_history = []
|
198 |
for i, (user_msg, assistant_msg) in enumerate(chat_history[-3:], 1):
|
|
|
208 |
|
209 |
Key Rules:
|
210 |
1. For follow-up questions or queries referencing previous conversation:
|
211 |
+
- Extract the main topic/subject from previous messages
|
212 |
+
- Combine previous context with the current query
|
213 |
+
- Example: Previous: "What is the structure of German banking industry?"
|
214 |
+
Current: "can you do more latest web search on my previous query"
|
215 |
+
Should become: "Latest structure and developments in German banking industry after: 2024"
|
216 |
|
217 |
2. Entity Handling:
|
218 |
+
- Identify and preserve main entities from context
|
219 |
+
- Enclose ONLY entity names in double quotes
|
220 |
+
- Example: "Deutsche Bank" profits, not "Deutsche Bank profits"
|
|
|
221 |
|
222 |
3. Date and Time Context:
|
223 |
+
- For queries about current/latest information:
|
224 |
+
* Keep time-related words (latest, current, recent, now)
|
225 |
+
* ALWAYS append "after: YYYY" (current year)
|
226 |
+
- For specific time periods:
|
227 |
+
* Preserve the original time reference
|
228 |
+
* Add appropriate "after: YYYY" based on context
|
229 |
+
- For queries without time reference:
|
230 |
+
* Add "after: YYYY" if about current state/status
|
231 |
|
232 |
4. Query Formatting:
|
233 |
+
- Capitalize first letter
|
234 |
+
- No period at end
|
235 |
+
- Include all relevant context
|
236 |
+
- Maintain clear and searchable structure
|
237 |
+
|
238 |
+
Remember: Your goal is to create a complete, self-contained query that includes all necessary context from the conversation history."""
|
239 |
|
240 |
messages = [
|
241 |
{"role": "system", "content": system_prompt},
|
242 |
{"role": "user", "content": f"""Current year: {current_year}
|
243 |
Recent conversation history:
|
244 |
{chat_context}
|
245 |
+
Current query: {query}
|
246 |
+
|
247 |
Please rephrase this query into a complete, contextual search query following the rules above. The rephrased query should be clear and complete even without the conversation context."""}
|
248 |
]
|
249 |
|
|
|
254 |
max_tokens=200,
|
255 |
stream=False
|
256 |
)
|
257 |
+
|
258 |
rephrased_query = response.choices[0].message.content.strip()
|
|
|
|
|
|
|
|
|
|
|
259 |
logger.info(f'Query rephrased to: {rephrased_query}')
|
260 |
return rephrased_query
|
261 |
|
262 |
except Exception as e:
|
263 |
logger.error(f'Error rephrasing query: {e}')
|
264 |
+
|
265 |
# If rephrasing fails, construct a basic contextual query
|
266 |
try:
|
267 |
last_query = chat_history[-1][0] if chat_history else ""
|
|
|
582 |
formatted_history.append(f"Assistant: {assistant_msg}")
|
583 |
return "\n".join(formatted_history)
|
584 |
|
585 |
+
|
586 |
async def get_search_results(self,
|
587 |
query: str,
|
588 |
history: List[List[str]],
|
|
|
704 |
return f"Error occurred: {str(e)}"
|
705 |
|
706 |
async def get_response(self,
|
707 |
+
query: str,
|
708 |
+
history: List[List[str]],
|
709 |
+
num_results: int,
|
710 |
+
max_chars: int,
|
711 |
+
score_threshold: float,
|
712 |
+
temperature: float,
|
713 |
+
scoring_method: str,
|
714 |
+
selected_engines: List[str],
|
715 |
+
safe_search: str,
|
716 |
+
language: str,
|
717 |
+
force_web_search: bool = False) -> str:
|
718 |
+
"""Enhanced get_response method with URL scraping capability."""
|
719 |
logger.info(f'Processing query: {query}')
|
720 |
try:
|
721 |
+
# Extract URLs from the query
|
722 |
+
urls = extract_urls(query)
|
723 |
+
|
724 |
+
# If valid URLs are found in the query, directly scrape them
|
725 |
+
if urls:
|
726 |
+
logger.info(f'Found URLs in query: {urls}')
|
727 |
+
articles = await self.scrape_specific_urls(urls, max_chars)
|
728 |
+
|
729 |
+
if not articles:
|
730 |
+
return "I couldn't extract valid content from the provided URLs. Please check if the URLs are accessible."
|
731 |
+
|
732 |
+
# Generate summary using only the scraped content
|
733 |
+
summary = await generate_summary(query, articles, temperature)
|
734 |
+
|
735 |
+
# Format response
|
736 |
+
response = "**Direct URL Scraping Results:**\n\n"
|
737 |
+
response += summary + "\n\n"
|
738 |
+
response += "**Scraped URLs:**\n"
|
739 |
+
for i, article in enumerate(articles, 1):
|
740 |
+
response += f"{i}. [{urlparse(article['url']).netloc}]({article['url']})\n"
|
741 |
+
|
742 |
+
return response
|
743 |
+
|
744 |
+
# If no URLs found, proceed with regular query processing
|
745 |
formatted_history = self.format_chat_history(history)
|
|
|
|
|
|
|
746 |
force_web_search = force_web_search == "Web Search Only"
|
747 |
+
|
|
|
|
|
748 |
if force_web_search:
|
|
|
749 |
query_type = QueryType.WEB_SEARCH
|
750 |
else:
|
|
|
751 |
query_type = await determine_query_type(query, history, temperature)
|
|
|
752 |
|
753 |
if query_type == QueryType.KNOWLEDGE_BASE and not force_web_search:
|
|
|
754 |
response = await process_knowledge_base_query(
|
755 |
query=query,
|
756 |
chat_history=history,
|
757 |
temperature=temperature
|
758 |
)
|
759 |
else:
|
|
|
760 |
response = await self.get_search_results(
|
761 |
query=query,
|
762 |
history=history,
|
|
|
769 |
safe_search=safe_search,
|
770 |
language=language
|
771 |
)
|
772 |
+
|
|
|
773 |
return response
|
774 |
+
|
775 |
except Exception as e:
|
776 |
logger.error(f'Error in get_response: {e}')
|
777 |
return f"I apologize, but I encountered an error: {str(e)}"
|