Upload 5 files
- Self_Improving_Search.py +431 -0
- llm_config.py +39 -0
- llm_response_parser.py +177 -0
- llm_wrapper.py +69 -0
- web_scraper.py +149 -0
Self_Improving_Search.py
ADDED
@@ -0,0 +1,431 @@
import time
import re
import os
from typing import List, Dict, Tuple, Union
from colorama import Fore, Style
import logging
import sys
from io import StringIO
from web_scraper import get_web_content, can_fetch
from llm_config import get_llm_config
from llm_response_parser import UltimateLLMResponseParser
from llm_wrapper import LLMWrapper
from urllib.parse import urlparse

# Set up logging
log_directory = 'logs'
if not os.path.exists(log_directory):
    os.makedirs(log_directory)

# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
log_file = os.path.join(log_directory, 'llama_output.log')
file_handler = logging.FileHandler(log_file)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.handlers = []
logger.addHandler(file_handler)
logger.propagate = False

# Suppress other loggers
for name in ['root', 'duckduckgo_search', 'requests', 'urllib3']:
    logging.getLogger(name).setLevel(logging.WARNING)
    logging.getLogger(name).handlers = []
    logging.getLogger(name).propagate = False

class OutputRedirector:
    def __init__(self, stream=None):
        self.stream = stream or StringIO()
        self.original_stdout = sys.stdout
        self.original_stderr = sys.stderr

    def __enter__(self):
        sys.stdout = self.stream
        sys.stderr = self.stream
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self.original_stdout
        sys.stderr = self.original_stderr

class EnhancedSelfImprovingSearch:
    def __init__(self, llm: LLMWrapper, parser: UltimateLLMResponseParser, max_attempts: int = 5):
        self.llm = llm
        self.parser = parser
        self.max_attempts = max_attempts
        self.llm_config = get_llm_config()

    @staticmethod
    def initialize_llm():
        llm_wrapper = LLMWrapper()
        return llm_wrapper

    def print_thinking(self):
        print(Fore.MAGENTA + "🧠 Thinking..." + Style.RESET_ALL)

    def print_searching(self):
        print(Fore.MAGENTA + "📝 Searching..." + Style.RESET_ALL)

    def search_and_improve(self, user_query: str) -> str:
        attempt = 0
        while attempt < self.max_attempts:
            print(f"\n{Fore.CYAN}Search attempt {attempt + 1}:{Style.RESET_ALL}")
            self.print_searching()

            try:
                formulated_query, time_range = self.formulate_query(user_query, attempt)

                print(f"{Fore.YELLOW}Original query: {user_query}{Style.RESET_ALL}")
                print(f"{Fore.YELLOW}Formulated query: {formulated_query}{Style.RESET_ALL}")
                print(f"{Fore.YELLOW}Time range: {time_range}{Style.RESET_ALL}")

                if not formulated_query:
                    print(f"{Fore.RED}Error: Empty search query. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                search_results = self.perform_search(formulated_query, time_range)

                if not search_results:
                    print(f"{Fore.RED}No results found. Retrying with a different query...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                self.display_search_results(search_results)

                selected_urls = self.select_relevant_pages(search_results, user_query)

                if not selected_urls:
                    print(f"{Fore.RED}No relevant URLs found. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                print(Fore.MAGENTA + "⚙️ Scraping selected pages..." + Style.RESET_ALL)
                # Scraping is done without OutputRedirector so that progress messages stay visible
                scraped_content = self.scrape_content(selected_urls)

                if not scraped_content:
                    print(f"{Fore.RED}Failed to scrape content. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                self.display_scraped_content(scraped_content)

                self.print_thinking()

                with OutputRedirector() as output:
                    evaluation, decision = self.evaluate_scraped_content(user_query, scraped_content)
                    llm_output = output.getvalue()
                logger.info(f"LLM Output in evaluate_scraped_content:\n{llm_output}")

                print(f"{Fore.MAGENTA}Evaluation: {evaluation}{Style.RESET_ALL}")
                print(f"{Fore.MAGENTA}Decision: {decision}{Style.RESET_ALL}")

                if decision == "answer":
                    return self.generate_final_answer(user_query, scraped_content)
                elif decision == "refine":
                    print(f"{Fore.YELLOW}Refining search...{Style.RESET_ALL}")
                    attempt += 1
                else:
                    print(f"{Fore.RED}Unexpected decision. Proceeding to answer.{Style.RESET_ALL}")
                    return self.generate_final_answer(user_query, scraped_content)

            except Exception as e:
                print(f"{Fore.RED}An error occurred during search attempt. Check the log file for details.{Style.RESET_ALL}")
                logger.error(f"An error occurred during search: {str(e)}", exc_info=True)
                attempt += 1

        return self.synthesize_final_answer(user_query)

    def evaluate_scraped_content(self, user_query: str, scraped_content: Dict[str, str]) -> Tuple[str, str]:
        user_query_short = user_query[:200]
        prompt = f"""
Evaluate if the following scraped content contains sufficient information to answer the user's question comprehensively:

User's question: "{user_query_short}"

Scraped Content:
{self.format_scraped_content(scraped_content)}

Your task:
1. Determine if the scraped content provides enough relevant and detailed information to answer the user's question thoroughly.
2. If the information is sufficient, decide to 'answer'. If more information or clarification is needed, decide to 'refine' the search.

Respond using EXACTLY this format:
Evaluation: [Your evaluation of the scraped content]
Decision: [ONLY 'answer' if content is sufficient, or 'refine' if more information is needed]
"""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
                evaluation, decision = self.parse_evaluation_response(response_text)
                if decision in ['answer', 'refine']:
                    return evaluation, decision
            except Exception as e:
                logger.warning(f"Error in evaluate_scraped_content (attempt {attempt + 1}): {str(e)}")

        logger.warning("Failed to get a valid decision in evaluate_scraped_content. Defaulting to 'refine'.")
        return "Failed to evaluate content.", "refine"

    def parse_evaluation_response(self, response: str) -> Tuple[str, str]:
        evaluation = ""
        decision = ""
        for line in response.strip().split('\n'):
            if line.startswith('Evaluation:'):
                evaluation = line.split(':', 1)[1].strip()
            elif line.startswith('Decision:'):
                decision = line.split(':', 1)[1].strip().lower()
        return evaluation, decision

    def formulate_query(self, user_query: str, attempt: int) -> Tuple[str, str]:
        user_query_short = user_query[:200]
        prompt = f"""
Based on the following user question, formulate a concise and effective search query:
"{user_query_short}"
Your task:
1. Create a search query of 2-5 words that will yield relevant results.
2. Determine if a specific time range is needed for the search.
Time range options:
- 'd': Limit results to the past day. Use for very recent events or rapidly changing information.
- 'w': Limit results to the past week. Use for recent events or topics with frequent updates.
- 'm': Limit results to the past month. Use for relatively recent information or ongoing events.
- 'y': Limit results to the past year. Use for annual events or information that changes yearly.
- 'none': No time limit. Use for historical information or topics not tied to a specific time frame.
Respond in the following format:
Search query: [Your 2-5 word query]
Time range: [d/w/m/y/none]
Do not provide any additional information or explanation.
"""
        max_retries = 3
        for retry in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=50, stop=None)
                llm_output = output.getvalue()
            logger.info(f"LLM Output in formulate_query:\n{llm_output}")
            query, time_range = self.parse_query_response(response_text)
            if query and time_range:
                return query, time_range
        return self.fallback_query(user_query), "none"

    def parse_query_response(self, response: str) -> Tuple[str, str]:
        query = ""
        time_range = "none"
        for line in response.strip().split('\n'):
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()
                if "query" in key:
                    query = self.clean_query(value)
                elif "time" in key or "range" in key:
                    time_range = self.validate_time_range(value)
        return query, time_range

    def clean_query(self, query: str) -> str:
        query = re.sub(r'["\'\[\]]', '', query)
        query = re.sub(r'\s+', ' ', query)
        return query.strip()[:100]

    def validate_time_range(self, time_range: str) -> str:
        valid_ranges = ['d', 'w', 'm', 'y', 'none']
        time_range = time_range.lower()
        return time_range if time_range in valid_ranges else 'none'

    def fallback_query(self, user_query: str) -> str:
        words = user_query.split()
        return " ".join(words[:5])

    def perform_search(self, query: str, time_range: str) -> List[Dict]:
        if not query:
            return []

        from duckduckgo_search import DDGS

        with DDGS() as ddgs:
            try:
                with OutputRedirector() as output:
                    if time_range and time_range != 'none':
                        results = list(ddgs.text(query, timelimit=time_range, max_results=10))
                    else:
                        results = list(ddgs.text(query, max_results=10))
                    ddg_output = output.getvalue()
                logger.info(f"DDG Output in perform_search:\n{ddg_output}")
                print(f"{Fore.GREEN}Search query sent to DuckDuckGo: {query}{Style.RESET_ALL}")
                print(f"{Fore.GREEN}Time range sent to DuckDuckGo: {time_range}{Style.RESET_ALL}")
                print(f"{Fore.GREEN}Number of results: {len(results)}{Style.RESET_ALL}")
                return [{'number': i + 1, **result} for i, result in enumerate(results)]
            except Exception as e:
                print(f"{Fore.RED}Search error: {str(e)}{Style.RESET_ALL}")
                return []

    def display_search_results(self, results: List[Dict]):
        print(f"\n{Fore.CYAN}Search Results:{Style.RESET_ALL}")
        for result in results:
            print(f"{Fore.GREEN}Result {result['number']}:{Style.RESET_ALL}")
            print(f"Title: {result.get('title', 'N/A')}")
            print(f"Snippet: {result.get('body', 'N/A')[:200]}...")
            print(f"URL: {result.get('href', 'N/A')}\n")

    def select_relevant_pages(self, search_results: List[Dict], user_query: str) -> List[str]:
        prompt = f"""
Given the following search results for the user's question: "{user_query}"
Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection.

Search Results:
{self.format_results(search_results)}

Instructions:
1. You MUST select exactly 2 result numbers from the search results.
2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question.
3. Provide a brief reason for each selection.

You MUST respond using EXACTLY this format and nothing else:

Selected Results: [Two numbers corresponding to the selected results]
Reasoning: [Your reasoning for the selections]
"""
        max_retries = 3
        for retry in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
                llm_output = output.getvalue()
            logger.info(f"LLM Output in select_relevant_pages:\n{llm_output}")

            parsed_response = self.parse_page_selection_response(response_text)
            if parsed_response and self.validate_page_selection_response(parsed_response, len(search_results)):
                selected_urls = [result['href'] for result in search_results if result['number'] in parsed_response['selected_results']]

                allowed_urls = [url for url in selected_urls if can_fetch(url)]
                if allowed_urls:
                    return allowed_urls
                else:
                    print(f"{Fore.YELLOW}Warning: All selected URLs are disallowed by robots.txt. Retrying selection.{Style.RESET_ALL}")
            else:
                print(f"{Fore.YELLOW}Warning: Invalid page selection. Retrying.{Style.RESET_ALL}")

        print(f"{Fore.YELLOW}Warning: All attempts to select relevant pages failed. Falling back to top allowed results.{Style.RESET_ALL}")
        allowed_urls = [result['href'] for result in search_results if can_fetch(result['href'])][:2]
        return allowed_urls

    def parse_page_selection_response(self, response: str) -> Dict[str, Union[List[int], str]]:
        lines = response.strip().split('\n')
        parsed = {}
        for line in lines:
            if line.startswith('Selected Results:'):
                parsed['selected_results'] = [int(num.strip()) for num in re.findall(r'\d+', line)]
            elif line.startswith('Reasoning:'):
                parsed['reasoning'] = line.split(':', 1)[1].strip()
        return parsed if 'selected_results' in parsed and 'reasoning' in parsed else None

    def validate_page_selection_response(self, parsed_response: Dict[str, Union[List[int], str]], num_results: int) -> bool:
        if len(parsed_response['selected_results']) != 2:
            return False
        if any(num < 1 or num > num_results for num in parsed_response['selected_results']):
            return False
        return True

    def format_results(self, results: List[Dict]) -> str:
        formatted_results = []
        for result in results:
            formatted_result = f"{result['number']}. Title: {result.get('title', 'N/A')}\n"
            formatted_result += f" Snippet: {result.get('body', 'N/A')[:200]}...\n"
            formatted_result += f" URL: {result.get('href', 'N/A')}\n"
            formatted_results.append(formatted_result)
        return "\n".join(formatted_results)

    def scrape_content(self, urls: List[str]) -> Dict[str, str]:
        scraped_content = {}
        blocked_urls = []
        for url in urls:
            robots_allowed = can_fetch(url)
            if robots_allowed:
                content = get_web_content([url])
                if content:
                    scraped_content.update(content)
                    print(Fore.YELLOW + f"Successfully scraped: {url}" + Style.RESET_ALL)
                    logger.info(f"Successfully scraped: {url}")
                else:
                    # robots.txt already allowed this URL; an empty result means the scrape itself failed
                    print(Fore.RED + f"Failed to scrape content from {url}" + Style.RESET_ALL)
                    logger.warning(f"Failed to scrape content from {url}")
            else:
                blocked_urls.append(url)
                print(Fore.RED + f"Warning: Robots.txt disallows scraping of {url}" + Style.RESET_ALL)
                logger.warning(f"Robots.txt disallows scraping of {url}")

        print(Fore.CYAN + f"Scraped content received for {len(scraped_content)} URLs" + Style.RESET_ALL)
        logger.info(f"Scraped content received for {len(scraped_content)} URLs")

        if blocked_urls:
            print(Fore.RED + f"Warning: {len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions." + Style.RESET_ALL)
            logger.warning(f"{len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions: {', '.join(blocked_urls)}")

        return scraped_content

    def display_scraped_content(self, scraped_content: Dict[str, str]):
        print(f"\n{Fore.CYAN}Scraped Content:{Style.RESET_ALL}")
        for url, content in scraped_content.items():
            print(f"{Fore.GREEN}URL: {url}{Style.RESET_ALL}")
            print(f"Content: {content[:4000]}...\n")

    def generate_final_answer(self, user_query: str, scraped_content: Dict[str, str]) -> str:
        user_query_short = user_query[:200]
        prompt = f"""
You are an AI assistant. Provide a comprehensive and detailed answer to the following question using ONLY the information provided in the scraped content. Do not include any references or mention any sources. Answer directly and thoroughly.

Question: "{user_query_short}"

Scraped Content:
{self.format_scraped_content(scraped_content)}

Important Instructions:
1. Do not use phrases like "Based on the absence of selected results" or similar.
2. If the scraped content does not contain enough information to answer the question, say so explicitly and explain what information is missing.
3. Provide as much relevant detail as possible from the scraped content.

Answer:
"""
        max_retries = 3
        for attempt in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=1024, stop=None)
                llm_output = output.getvalue()
            logger.info(f"LLM Output in generate_final_answer:\n{llm_output}")
            if response_text:
                logger.info(f"LLM Response:\n{response_text}")
                return response_text

        error_message = "I apologize, but I couldn't generate a satisfactory answer based on the available information."
        logger.warning(f"Failed to generate a response after {max_retries} attempts. Returning error message.")
        return error_message

    def format_scraped_content(self, scraped_content: Dict[str, str]) -> str:
        formatted_content = []
        for url, content in scraped_content.items():
            content = re.sub(r'\s+', ' ', content)
            formatted_content.append(f"Content from {url}:\n{content}\n")
        return "\n".join(formatted_content)

    def synthesize_final_answer(self, user_query: str) -> str:
        prompt = f"""
After multiple search attempts, we couldn't find a fully satisfactory answer to the user's question: "{user_query}"

Please provide the best possible answer you can, acknowledging any limitations or uncertainties.
If appropriate, suggest ways the user might refine their question or where they might find more information.

Respond in a clear, concise, and informative manner.
"""
        try:
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=self.llm_config.get('max_tokens', 1024), stop=self.llm_config.get('stop', None))
                llm_output = output.getvalue()
            logger.info(f"LLM Output in synthesize_final_answer:\n{llm_output}")
            if response_text:
                return response_text.strip()
        except Exception as e:
            logger.error(f"Error in synthesize_final_answer: {str(e)}", exc_info=True)
        return "I apologize, but after multiple attempts, I wasn't able to find a satisfactory answer to your question. Please try rephrasing your question or breaking it down into smaller, more specific queries."

# End of EnhancedSelfImprovingSearch class
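For orientation, here is a minimal driver showing how the class above is typically wired to the other uploaded modules. This is an illustrative sketch, not part of the upload: the script itself, the example query, and the assumption that llm_config.py already points at a working model are all hypothetical.

# Hypothetical driver script (illustration only), assuming llm_config.py is configured.
from llm_wrapper import LLMWrapper
from llm_response_parser import UltimateLLMResponseParser
from Self_Improving_Search import EnhancedSelfImprovingSearch

if __name__ == "__main__":
    llm = LLMWrapper()                    # loads the backend selected in llm_config.py
    parser = UltimateLLMResponseParser()  # multi-strategy parser for LLM replies
    searcher = EnhancedSelfImprovingSearch(llm, parser, max_attempts=5)

    # search_and_improve() formulates a query, searches DuckDuckGo, scrapes the two
    # most relevant robots.txt-allowed pages, then answers or refines, repeating up
    # to max_attempts before falling back to synthesize_final_answer().
    answer = searcher.search_and_improve("latest breakthroughs in battery technology")  # example query
    print(answer)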
llm_config.py
ADDED
@@ -0,0 +1,39 @@
# llm_config.py

LLM_TYPE = "llama_cpp"  # Options: 'llama_cpp', 'ollama'

# LLM settings for llama_cpp
MODEL_PATH = None  # "/filepath/to/your/llama.cpp/model"  # Replace with your llama.cpp model's filepath

LLM_CONFIG_LLAMA_CPP = {
    "llm_type": "llama_cpp",
    "model_path": MODEL_PATH,
    "n_ctx": 20000,  # context size
    "n_gpu_layers": 0,  # number of layers to offload to GPU (-1 for all, 0 for none)
    "n_threads": 8,  # number of threads to use
    "temperature": 0.7,  # temperature for sampling
    "top_p": 0.9,  # top p for sampling
    "top_k": 40,  # top k for sampling
    "repeat_penalty": 1.1,  # repeat penalty
    "max_tokens": 1024,  # max tokens to generate
    "stop": ["User:", "\n\n"]  # stop sequences
}

# LLM settings for Ollama
LLM_CONFIG_OLLAMA = {
    "llm_type": "ollama",
    "base_url": "http://localhost:11434",  # default Ollama server URL
    "model_name": "ollama model name",  # Replace with your Ollama model name
    "temperature": 0.7,
    "top_p": 0.9,
    "n_ctx": 20000,  # context size
    "stop": ["User:", "\n\n"]
}

def get_llm_config():
    if LLM_TYPE == "llama_cpp":
        return LLM_CONFIG_LLAMA_CPP
    elif LLM_TYPE == "ollama":
        return LLM_CONFIG_OLLAMA
    else:
        raise ValueError(f"Invalid LLM_TYPE: {LLM_TYPE}")
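A short sketch of how the rest of the code consumes this module; the printed values assume the defaults above and are only illustrative. Switching LLM_TYPE to "ollama" (and filling in model_name) is all that is needed to route LLMWrapper to a local Ollama server.

# Illustrative check of the active configuration (assumes the defaults above).
from llm_config import get_llm_config

config = get_llm_config()
print(config["llm_type"])                                    # "llama_cpp" with the defaults above
print(config.get("model_path") or config.get("model_name"))  # None here, so LLMWrapper downloads a default GGUF
print(config["n_ctx"], config["max_tokens"])                 # 20000 1024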
llm_response_parser.py
ADDED
@@ -0,0 +1,177 @@
import re
from typing import Dict, List, Union
import logging
import json

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class UltimateLLMResponseParser:
    def __init__(self):
        self.decision_keywords = {
            'refine': ['refine', 'need more info', 'insufficient', 'unclear', 'more research', 'additional search'],
            'answer': ['answer', 'sufficient', 'enough info', 'can respond', 'adequate', 'comprehensive']
        }
        self.section_identifiers = [
            ('decision', r'(?i)decision\s*:'),
            ('reasoning', r'(?i)reasoning\s*:'),
            ('selected_results', r'(?i)selected results\s*:'),
            ('response', r'(?i)response\s*:')
        ]

    def parse_llm_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        logger.info("Starting to parse LLM response")

        # Initialize result dictionary
        result = {
            'decision': None,
            'reasoning': None,
            'selected_results': [],
            'response': None
        }

        # Define parsing strategies
        parsing_strategies = [
            self._parse_structured_response,
            self._parse_json_response,
            self._parse_unstructured_response,
            self._parse_implicit_response
        ]

        # Try each parsing strategy
        for strategy in parsing_strategies:
            try:
                parsed_result = strategy(response)
                if self._is_valid_result(parsed_result):
                    result.update(parsed_result)
                    logger.info(f"Successfully parsed using strategy: {strategy.__name__}")
                    break
            except Exception as e:
                logger.warning(f"Error in parsing strategy {strategy.__name__}: {str(e)}")

        # If no strategy succeeded, use fallback parsing
        if not self._is_valid_result(result):
            logger.warning("All parsing strategies failed. Using fallback parsing.")
            result = self._fallback_parsing(response)

        # Post-process the result
        result = self._post_process_result(result)

        logger.info("Finished parsing LLM response")
        return result

    def _parse_structured_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {}
        for key, pattern in self.section_identifiers:
            match = re.search(f'{pattern}(.*?)(?={"|".join([p for k, p in self.section_identifiers if k != key])}|$)', response, re.IGNORECASE | re.DOTALL)
            if match:
                result[key] = match.group(1).strip()

        if 'selected_results' in result:
            result['selected_results'] = self._extract_numbers(result['selected_results'])

        return result

    def _parse_json_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        try:
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
                parsed_json = json.loads(json_str)
                return {k: v for k, v in parsed_json.items() if k in ['decision', 'reasoning', 'selected_results', 'response']}
        except json.JSONDecodeError:
            pass
        return {}

    def _parse_unstructured_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {}
        lines = response.split('\n')
        current_section = None

        for line in lines:
            section_match = re.match(r'(.+?)[:.-](.+)', line)
            if section_match:
                key = self._match_section_to_key(section_match.group(1))
                if key:
                    current_section = key
                    result[key] = section_match.group(2).strip()
            elif current_section:
                result[current_section] += ' ' + line.strip()

        if 'selected_results' in result:
            result['selected_results'] = self._extract_numbers(result['selected_results'])

        return result

    def _parse_implicit_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {}

        decision = self._infer_decision(response)
        if decision:
            result['decision'] = decision

        numbers = self._extract_numbers(response)
        if numbers:
            result['selected_results'] = numbers

        if not result:
            result['response'] = response.strip()

        return result

    def _fallback_parsing(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {
            'decision': self._infer_decision(response),
            'reasoning': None,
            'selected_results': self._extract_numbers(response),
            'response': response.strip()
        }
        return result

    def _post_process_result(self, result: Dict[str, Union[str, List[int]]]) -> Dict[str, Union[str, List[int]]]:
        if result['decision'] not in ['refine', 'answer']:
            result['decision'] = self._infer_decision(str(result))

        if not isinstance(result['selected_results'], list):
            result['selected_results'] = self._extract_numbers(str(result['selected_results']))

        result['selected_results'] = result['selected_results'][:2]

        if not result['reasoning']:
            result['reasoning'] = f"Based on the {'presence' if result['selected_results'] else 'absence'} of selected results and the overall content."

        if not result['response']:
            result['response'] = result.get('reasoning', 'No clear response found.')

        return result

    def _match_section_to_key(self, section: str) -> Union[str, None]:
        for key, pattern in self.section_identifiers:
            if re.search(pattern, section, re.IGNORECASE):
                return key
        return None

    def _extract_numbers(self, text: str) -> List[int]:
        return [int(num) for num in re.findall(r'\b(?:10|[1-9])\b', text)]

    def _infer_decision(self, text: str) -> str:
        text = text.lower()
        refine_score = sum(text.count(keyword) for keyword in self.decision_keywords['refine'])
        answer_score = sum(text.count(keyword) for keyword in self.decision_keywords['answer'])
        return 'refine' if refine_score > answer_score else 'answer'

    def _is_valid_result(self, result: Dict[str, Union[str, List[int]]]) -> bool:
        return bool(result.get('decision') or result.get('response') or result.get('selected_results'))

# Example usage
if __name__ == "__main__":
    parser = UltimateLLMResponseParser()
    test_response = """
    Decision: answer
    Reasoning: The scraped content provides comprehensive information about recent AI breakthroughs.
    Selected Results: 1, 3
    Response: Based on the scraped content, there have been several significant breakthroughs in AI recently...
    """
    parsed_result = parser.parse_llm_response(test_response)
    print(json.dumps(parsed_result, indent=2))
llm_wrapper.py
ADDED
@@ -0,0 +1,69 @@
from llama_cpp import Llama
import requests
import json
from llm_config import get_llm_config

class LLMWrapper:
    def __init__(self):
        self.llm_config = get_llm_config()
        self.llm_type = self.llm_config.get('llm_type', 'llama_cpp')
        if self.llm_type == 'llama_cpp':
            self.llm = self._initialize_llama_cpp()
        elif self.llm_type == 'ollama':
            self.base_url = self.llm_config.get('base_url', 'http://localhost:11434')
            self.model_name = self.llm_config.get('model_name', 'your_model_name')
        else:
            raise ValueError(f"Unsupported LLM type: {self.llm_type}")

    def _initialize_llama_cpp(self):
        if self.llm_config.get('model_path') is None:
            return Llama.from_pretrained(
                repo_id="Tien203/llama.cpp",
                filename="Llama-2-7b-hf-q4_0.gguf",
            )
        else:
            return Llama(
                model_path=self.llm_config.get('model_path'),
                n_ctx=self.llm_config.get('n_ctx', 2048),
                n_gpu_layers=self.llm_config.get('n_gpu_layers', 0),
                n_threads=self.llm_config.get('n_threads', 8),
                verbose=False
            )

    def generate(self, prompt, **kwargs):
        if self.llm_type == 'llama_cpp':
            llama_kwargs = self._prepare_llama_kwargs(kwargs)
            response = self.llm(prompt, **llama_kwargs)
            return response['choices'][0]['text'].strip()
        elif self.llm_type == 'ollama':
            return self._ollama_generate(prompt, **kwargs)
        else:
            raise ValueError(f"Unsupported LLM type: {self.llm_type}")

    def _ollama_generate(self, prompt, **kwargs):
        url = f"{self.base_url}/api/generate"
        data = {
            'model': self.model_name,
            'prompt': prompt,
            'options': {
                'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
                'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
                'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
                'num_predict': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 1024)),
            }
        }
        response = requests.post(url, json=data, stream=True)
        if response.status_code != 200:
            raise Exception(f"Ollama API request failed with status {response.status_code}: {response.text}")
        text = ''.join(json.loads(line)['response'] for line in response.iter_lines() if line)
        return text.strip()

    def _prepare_llama_kwargs(self, kwargs):
        llama_kwargs = {
            'max_tokens': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 1024)),
            'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
            'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
            'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
            'echo': False,
        }
        return llama_kwargs
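A minimal usage sketch for the wrapper, assuming llm_config.py resolves to a working backend; the prompt text and sampling values here are illustrative, not part of the upload.

# Illustrative call pattern for LLMWrapper.generate() (example prompt and values).
from llm_wrapper import LLMWrapper

llm = LLMWrapper()  # with MODEL_PATH = None this downloads a default GGUF via Llama.from_pretrained
text = llm.generate(
    "Explain in two sentences why robots.txt matters for web scraping.",  # example prompt
    max_tokens=200,
    temperature=0.7,
    stop=None,
)
print(text)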
web_scraper.py
ADDED
@@ -0,0 +1,149 @@
import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse, urljoin
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class WebScraper:
    def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
                 rate_limit=1, timeout=10, max_retries=3):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.robot_parser = RobotFileParser()
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.max_retries = max_retries
        self.last_request_time = {}

    def can_fetch(self, url):
        parsed_url = urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        self.robot_parser.set_url(robots_url)
        try:
            self.robot_parser.read()
            return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
        except Exception as e:
            logger.warning(f"Error reading robots.txt for {url}: {e}")
            return True  # Assume allowed if robots.txt can't be read

    def respect_rate_limit(self, url):
        domain = urlparse(url).netloc
        current_time = time.time()
        if domain in self.last_request_time:
            time_since_last_request = current_time - self.last_request_time[domain]
            if time_since_last_request < self.rate_limit:
                time.sleep(self.rate_limit - time_since_last_request)
        self.last_request_time[domain] = time.time()

    def scrape_page(self, url):
        if not self.can_fetch(url):
            logger.info(f"Robots.txt disallows scraping: {url}")
            return None

        for attempt in range(self.max_retries):
            try:
                self.respect_rate_limit(url)
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return self.extract_content(response.text, url)
            except requests.RequestException as e:
                logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt == self.max_retries - 1:
                    logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
                    return None
                time.sleep(2 ** attempt)  # Exponential backoff

    def extract_content(self, html, url):
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Extract title
        title = soup.title.string if soup.title else ""

        # Try to find main content
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')

        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        # Extract text from paragraphs
        text = ' '.join([p.get_text().strip() for p in paragraphs])

        # If no paragraphs found, get all text
        if not text:
            text = soup.get_text()

        # Clean up whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Extract and resolve links
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        return {
            "url": url,
            "title": title,
            "content": text[:2400],  # Limit to first 2400 characters
            "links": links[:10]  # Limit to first 10 links
        }

def scrape_multiple_pages(urls, max_workers=5):
    scraper = WebScraper()
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(scraper.scrape_page, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                if data:
                    results[url] = data
                    logger.info(f"Successfully scraped: {url}")
                else:
                    logger.warning(f"Failed to scrape: {url}")
            except Exception as exc:
                logger.error(f"{url} generated an exception: {exc}")

    return results

# Function to integrate with your main system
def get_web_content(urls):
    scraped_data = scrape_multiple_pages(urls)
    return {url: data['content'] for url, data in scraped_data.items() if data}

# Standalone can_fetch function
def can_fetch(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp.can_fetch("*", url)
    except Exception as e:
        logger.warning(f"Error reading robots.txt for {url}: {e}")
        return True  # Assume allowed if robots.txt can't be read

if __name__ == "__main__":
    test_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://example.com",
        "https://www.python.org"
    ]
    scraped_content = get_web_content(test_urls)
    for url, content in scraped_content.items():
        print(f"Content from {url}:")
        print(content[:500])  # Print first 500 characters
        print("\n---\n")