Spaces:

Shreyas094
/

SearXNG-AI-v2

Running

App Files Files Community

Shreyas094 committed on Nov 13, 2024

Commit

4ea7d03

verified ·

1 Parent(s): 986478e

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -70

app.py CHANGED Viewed

@@ -20,6 +20,8 @@ from typing import List, Dict, Any, Set, Optional
 from dotenv import load_dotenv
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 # Configure logging
 logging.basicConfig(
@@ -127,6 +129,19 @@ Classify as "knowledge_base" if the query:
         logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
         return QueryType.KNOWLEDGE_BASE
 async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
     """Handle queries that can be answered from the knowledge base, with context."""
     logger.info(f'Processing knowledge base query: {query}')
@@ -174,23 +189,10 @@ Guidelines:
         return f"I apologize, but I encountered an error while processing your query: {str(e)}"
 async def rephrase_query(chat_history, query, temperature=0.2) -> str:
-    """Rephrase the query based on chat history and context while preserving URLs."""
     logger.info(f'Rephrasing query: {query}')
     try:
-        # Extract URLs from the query
-        url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
-        urls = re.findall(url_pattern, query)
-        # If URLs are found, store them and replace with placeholders
-        url_placeholders = {}
-        modified_query = query
-        if urls:
-            for idx, url in enumerate(urls):
-                placeholder = f"__URL_{idx}__"
-                url_placeholders[placeholder] = url
-                modified_query = modified_query.replace(url, placeholder)
         # Format recent conversation history (last 3 turns for context)
         formatted_history = []
         for i, (user_msg, assistant_msg) in enumerate(chat_history[-3:], 1):
@@ -206,41 +208,42 @@ async def rephrase_query(chat_history, query, temperature=0.2) -> str:
 Key Rules:
 1. For follow-up questions or queries referencing previous conversation:
- - Extract the main topic/subject from previous messages
- - Combine previous context with the current query
- - Example: Previous: "What is the structure of German banking industry?"
-   Current: "can you do more latest web search on my previous query"
-   Should become: "Latest structure and developments in German banking industry after: 2024"
 2. Entity Handling:
- - Identify and preserve main entities from context
- - Enclose ONLY entity names in double quotes
- - Example: "Deutsche Bank" profits, not "Deutsche Bank profits"
- - Preserve URL placeholders exactly as they appear (marked with __URL_N__)
 3. Date and Time Context:
- - For queries about current/latest information:
-   * Keep time-related words (latest, current, recent, now)
-   * ALWAYS append "after: YYYY" (current year)
- - For specific time periods:
-   * Preserve the original time reference
-   * Add appropriate "after: YYYY" based on context
- - For queries without time reference:
-   * Add "after: YYYY" if about current state/status
 4. Query Formatting:
- - Capitalize first letter
- - No period at end
- - Include all relevant context
- - Maintain clear and searchable structure
- - IMPORTANT: Keep URL placeholders (__URL_N__) exactly as they appear"""
         messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": f"""Current year: {current_year}
 Recent conversation history:
 {chat_context}
-Current query: {modified_query}
 Please rephrase this query into a complete, contextual search query following the rules above. The rephrased query should be clear and complete even without the conversation context."""}
         ]
@@ -251,18 +254,14 @@ Please rephrase this query into a complete, contextual search query following th
             max_tokens=200,
             stream=False
         )
         rephrased_query = response.choices[0].message.content.strip()
-        # Replace placeholders with original URLs
-        for placeholder, url in url_placeholders.items():
-            rephrased_query = rephrased_query.replace(placeholder, url)
         logger.info(f'Query rephrased to: {rephrased_query}')
         return rephrased_query
     except Exception as e:
         logger.error(f'Error rephrasing query: {e}')
         # If rephrasing fails, construct a basic contextual query
         try:
             last_query = chat_history[-1][0] if chat_history else ""
@@ -583,6 +582,7 @@ class ChatBot:
                 formatted_history.append(f"Assistant: {assistant_msg}")
         return "\n".join(formatted_history)
     async def get_search_results(self,
                                query: str,
                                history: List[List[str]],
@@ -704,46 +704,59 @@ class ChatBot:
             return f"Error occurred: {str(e)}"
     async def get_response(self,
-                      query: str,
-                      history: List[List[str]],
-                      num_results: int,
-                      max_chars: int,
-                      score_threshold: float,
-                      temperature: float,
-                      scoring_method: str,
-                      selected_engines: List[str],
-                      safe_search: str,
-                      language: str,
-                      force_web_search: bool = False) -> str:
-        """Determine query type and route to appropriate handler with context."""
         logger.info(f'Processing query: {query}')
         try:
-            # Update conversation history
             formatted_history = self.format_chat_history(history)
-            logger.info(f'Current conversation context:\n{formatted_history}')
-            # Convert the force_web_search radio button value to boolean
             force_web_search = force_web_search == "Web Search Only"
-            logger.info(f'Force web search mode: {force_web_search}')
-            # If force_web_search is True, skip query type determination
             if force_web_search:
-                logger.info('Force web search mode enabled - bypassing query type determination')
                 query_type = QueryType.WEB_SEARCH
             else:
-                # Determine query type with context
                 query_type = await determine_query_type(query, history, temperature)
-                logger.info(f'Query type determined as: {query_type}')
             if query_type == QueryType.KNOWLEDGE_BASE and not force_web_search:
-                logger.info('Using knowledge base to answer query')
                 response = await process_knowledge_base_query(
                     query=query,
                     chat_history=history,
                     temperature=temperature
                 )
             else:
-                logger.info('Using web search to answer query')
                 response = await self.get_search_results(
                     query=query,
                     history=history,
@@ -756,10 +769,9 @@ class ChatBot:
                     safe_search=safe_search,
                     language=language
                 )
-            logger.info(f'Generated response type: {query_type}')
             return response
         except Exception as e:
             logger.error(f'Error in get_response: {e}')
             return f"I apologize, but I encountered an error: {str(e)}"

 from dotenv import load_dotenv
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
+import re
+from urllib.parse import urlparse
 # Configure logging
 logging.basicConfig(
         logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
         return QueryType.KNOWLEDGE_BASE
def is_valid_url(url: str) -> bool:
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: Candidate URL string, e.g. ``"https://example.com/page"``.

    Returns:
        True if ``urlparse`` yields a non-empty scheme and netloc. False for
        relative paths, bare host names, empty input, or input that
        ``urlparse`` cannot process.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed input such as an unbalanced IPv6 bracket.
        # TypeError / AttributeError: non-string input.
        # Deliberately not a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return bool(parsed.scheme and parsed.netloc)
def extract_urls(text: str) -> List[str]:
    """Extract complete http(s) URLs from free-form text.

    The match covers the whole URL (host, port, path, query string and
    fragment), not just the host name, so callers that fetch the result hit
    the page the user actually referenced.

    Args:
        text: Arbitrary text that may contain URLs. Empty or ``None`` input
            yields an empty list.

    Returns:
        URLs in order of appearance (duplicates are kept), with punctuation
        that merely follows the URL in the sentence removed.
    """
    if not text:
        return []

    sentence_punctuation = '.,;:!?'
    urls = []
    # A URL runs until whitespace or a character that cannot appear unescaped
    # in one (angle brackets, quotes).
    for candidate in re.findall(r'https?://[^\s<>"\']+', text):
        url = candidate.rstrip(sentence_punctuation)
        # Drop a closing parenthesis only when it is unbalanced, so links such
        # as ".../wiki/Foo_(bar)" keep theirs while "(see https://x.com)" does not.
        while url.endswith(')') and url.count(')') > url.count('('):
            url = url[:-1].rstrip(sentence_punctuation)
        # Skip a bare scheme that is left with nothing after "://".
        if url.split('://', 1)[1]:
            urls.append(url)
    return urls
 async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
     """Handle queries that can be answered from the knowledge base, with context."""
     logger.info(f'Processing knowledge base query: {query}')
         return f"I apologize, but I encountered an error while processing your query: {str(e)}"
 async def rephrase_query(chat_history, query, temperature=0.2) -> str:
+    """Rephrase the query based on chat history and context."""
     logger.info(f'Rephrasing query: {query}')
     try:
         # Format recent conversation history (last 3 turns for context)
         formatted_history = []
         for i, (user_msg, assistant_msg) in enumerate(chat_history[-3:], 1):
 Key Rules:
 1. For follow-up questions or queries referencing previous conversation:
+    - Extract the main topic/subject from previous messages
+    - Combine previous context with the current query
+    - Example: Previous: "What is the structure of German banking industry?"
+      Current: "can you do more latest web search on my previous query"
+      Should become: "Latest structure and developments in German banking industry after: 2024"
 2. Entity Handling:
+    - Identify and preserve main entities from context
+    - Enclose ONLY entity names in double quotes
+    - Example: "Deutsche Bank" profits, not "Deutsche Bank profits"
 3. Date and Time Context:
+    - For queries about current/latest information:
+        * Keep time-related words (latest, current, recent, now)
+        * ALWAYS append "after: YYYY" (current year)
+    - For specific time periods:
+        * Preserve the original time reference
+        * Add appropriate "after: YYYY" based on context
+    - For queries without time reference:
+        * Add "after: YYYY" if about current state/status
 4. Query Formatting:
+    - Capitalize first letter
+    - No period at end
+    - Include all relevant context
+    - Maintain clear and searchable structure
+Remember: Your goal is to create a complete, self-contained query that includes all necessary context from the conversation history."""
         messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": f"""Current year: {current_year}
 Recent conversation history:
 {chat_context}
+Current query: {query}
 Please rephrase this query into a complete, contextual search query following the rules above. The rephrased query should be clear and complete even without the conversation context."""}
         ]
             max_tokens=200,
             stream=False
         )
         rephrased_query = response.choices[0].message.content.strip()
         logger.info(f'Query rephrased to: {rephrased_query}')
         return rephrased_query
     except Exception as e:
         logger.error(f'Error rephrasing query: {e}')
         # If rephrasing fails, construct a basic contextual query
         try:
             last_query = chat_history[-1][0] if chat_history else ""
                 formatted_history.append(f"Assistant: {assistant_msg}")
         return "\n".join(formatted_history)
     async def get_search_results(self,
                                query: str,
                                history: List[List[str]],
             return f"Error occurred: {str(e)}"
     async def get_response(self,
+                          query: str,
+                          history: List[List[str]],
+                          num_results: int,
+                          max_chars: int,
+                          score_threshold: float,
+                          temperature: float,
+                          scoring_method: str,
+                          selected_engines: List[str],
+                          safe_search: str,
+                          language: str,
+                          force_web_search: bool = False) -> str:
+        """Enhanced get_response method with URL scraping capability."""
         logger.info(f'Processing query: {query}')
         try:
+            # Extract URLs from the query
+            urls = extract_urls(query)
+            # If valid URLs are found in the query, directly scrape them
+            if urls:
+                logger.info(f'Found URLs in query: {urls}')
+                articles = await self.scrape_specific_urls(urls, max_chars)
+                if not articles:
+                    return "I couldn't extract valid content from the provided URLs. Please check if the URLs are accessible."
+                # Generate summary using only the scraped content
+                summary = await generate_summary(query, articles, temperature)
+                # Format response
+                response = "**Direct URL Scraping Results:**\n\n"
+                response += summary + "\n\n"
+                response += "**Scraped URLs:**\n"
+                for i, article in enumerate(articles, 1):
+                    response += f"{i}. [{urlparse(article['url']).netloc}]({article['url']})\n"
+                return response
+            # If no URLs found, proceed with regular query processing
             formatted_history = self.format_chat_history(history)
             force_web_search = force_web_search == "Web Search Only"
             if force_web_search:
                 query_type = QueryType.WEB_SEARCH
             else:
                 query_type = await determine_query_type(query, history, temperature)
             if query_type == QueryType.KNOWLEDGE_BASE and not force_web_search:
                 response = await process_knowledge_base_query(
                     query=query,
                     chat_history=history,
                     temperature=temperature
                 )
             else:
                 response = await self.get_search_results(
                     query=query,
                     history=history,
                     safe_search=safe_search,
                     language=language
                 )
             return response
         except Exception as e:
             logger.error(f'Error in get_response: {e}')
             return f"I apologize, but I encountered an error: {str(e)}"