minar09 committed
Commit 0a69927 · verified · 1 Parent(s): 92ecfe5

Upload 5 files

Files changed (5)
  1. Self_Improving_Search.py +431 -0
  2. llm_config.py +39 -0
  3. llm_response_parser.py +177 -0
  4. llm_wrapper.py +69 -0
  5. web_scraper.py +149 -0
Self_Improving_Search.py ADDED
@@ -0,0 +1,431 @@
import time
import re
import os
from typing import List, Dict, Tuple, Union
from colorama import Fore, Style
import logging
import sys
from io import StringIO
from web_scraper import get_web_content, can_fetch
from llm_config import get_llm_config
from llm_response_parser import UltimateLLMResponseParser
from llm_wrapper import LLMWrapper
from urllib.parse import urlparse

# Set up logging
log_directory = 'logs'
if not os.path.exists(log_directory):
    os.makedirs(log_directory)

# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
log_file = os.path.join(log_directory, 'llama_output.log')
file_handler = logging.FileHandler(log_file)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.handlers = []
logger.addHandler(file_handler)
logger.propagate = False

# Suppress other loggers
for name in ['root', 'duckduckgo_search', 'requests', 'urllib3']:
    logging.getLogger(name).setLevel(logging.WARNING)
    logging.getLogger(name).handlers = []
    logging.getLogger(name).propagate = False

class OutputRedirector:
    def __init__(self, stream=None):
        self.stream = stream or StringIO()
        self.original_stdout = sys.stdout
        self.original_stderr = sys.stderr

    def __enter__(self):
        sys.stdout = self.stream
        sys.stderr = self.stream
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self.original_stdout
        sys.stderr = self.original_stderr

class EnhancedSelfImprovingSearch:
    def __init__(self, llm: LLMWrapper, parser: UltimateLLMResponseParser, max_attempts: int = 5):
        self.llm = llm
        self.parser = parser
        self.max_attempts = max_attempts
        self.llm_config = get_llm_config()

    @staticmethod
    def initialize_llm():
        llm_wrapper = LLMWrapper()
        return llm_wrapper

    def print_thinking(self):
        print(Fore.MAGENTA + "🧠 Thinking..." + Style.RESET_ALL)

    def print_searching(self):
        print(Fore.MAGENTA + "📝 Searching..." + Style.RESET_ALL)

    def search_and_improve(self, user_query: str) -> str:
        attempt = 0
        while attempt < self.max_attempts:
            print(f"\n{Fore.CYAN}Search attempt {attempt + 1}:{Style.RESET_ALL}")
            self.print_searching()

            try:
                formulated_query, time_range = self.formulate_query(user_query, attempt)

                print(f"{Fore.YELLOW}Original query: {user_query}{Style.RESET_ALL}")
                print(f"{Fore.YELLOW}Formulated query: {formulated_query}{Style.RESET_ALL}")
                print(f"{Fore.YELLOW}Time range: {time_range}{Style.RESET_ALL}")

                if not formulated_query:
                    print(f"{Fore.RED}Error: Empty search query. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                search_results = self.perform_search(formulated_query, time_range)

                if not search_results:
                    print(f"{Fore.RED}No results found. Retrying with a different query...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                self.display_search_results(search_results)

                selected_urls = self.select_relevant_pages(search_results, user_query)

                if not selected_urls:
                    print(f"{Fore.RED}No relevant URLs found. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                print(Fore.MAGENTA + "⚙️ Scraping selected pages..." + Style.RESET_ALL)
                # Scraping is done without OutputRedirector to ensure messages are visible
                scraped_content = self.scrape_content(selected_urls)

                if not scraped_content:
                    print(f"{Fore.RED}Failed to scrape content. Retrying...{Style.RESET_ALL}")
                    attempt += 1
                    continue

                self.display_scraped_content(scraped_content)

                self.print_thinking()

                with OutputRedirector() as output:
                    evaluation, decision = self.evaluate_scraped_content(user_query, scraped_content)
                    llm_output = output.getvalue()
                logger.info(f"LLM Output in evaluate_scraped_content:\n{llm_output}")

                print(f"{Fore.MAGENTA}Evaluation: {evaluation}{Style.RESET_ALL}")
                print(f"{Fore.MAGENTA}Decision: {decision}{Style.RESET_ALL}")

                if decision == "answer":
                    return self.generate_final_answer(user_query, scraped_content)
                elif decision == "refine":
                    print(f"{Fore.YELLOW}Refining search...{Style.RESET_ALL}")
                    attempt += 1
                else:
                    print(f"{Fore.RED}Unexpected decision. Proceeding to answer.{Style.RESET_ALL}")
                    return self.generate_final_answer(user_query, scraped_content)

            except Exception as e:
                print(f"{Fore.RED}An error occurred during search attempt. Check the log file for details.{Style.RESET_ALL}")
                logger.error(f"An error occurred during search: {str(e)}", exc_info=True)
                attempt += 1

        return self.synthesize_final_answer(user_query)

    def evaluate_scraped_content(self, user_query: str, scraped_content: Dict[str, str]) -> Tuple[str, str]:
        user_query_short = user_query[:200]
        prompt = f"""
Evaluate if the following scraped content contains sufficient information to answer the user's question comprehensively:

User's question: "{user_query_short}"

Scraped Content:
{self.format_scraped_content(scraped_content)}

Your task:
1. Determine if the scraped content provides enough relevant and detailed information to answer the user's question thoroughly.
2. If the information is sufficient, decide to 'answer'. If more information or clarification is needed, decide to 'refine' the search.

Respond using EXACTLY this format:
Evaluation: [Your evaluation of the scraped content]
Decision: [ONLY 'answer' if content is sufficient, or 'refine' if more information is needed]
"""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
                evaluation, decision = self.parse_evaluation_response(response_text)
                if decision in ['answer', 'refine']:
                    return evaluation, decision
            except Exception as e:
                logger.warning(f"Error in evaluate_scraped_content (attempt {attempt + 1}): {str(e)}")

        logger.warning("Failed to get a valid decision in evaluate_scraped_content. Defaulting to 'refine'.")
        return "Failed to evaluate content.", "refine"

    def parse_evaluation_response(self, response: str) -> Tuple[str, str]:
        evaluation = ""
        decision = ""
        for line in response.strip().split('\n'):
            if line.startswith('Evaluation:'):
                evaluation = line.split(':', 1)[1].strip()
            elif line.startswith('Decision:'):
                decision = line.split(':', 1)[1].strip().lower()
        return evaluation, decision

    def formulate_query(self, user_query: str, attempt: int) -> Tuple[str, str]:
        user_query_short = user_query[:200]
        prompt = f"""
Based on the following user question, formulate a concise and effective search query:
"{user_query_short}"
Your task:
1. Create a search query of 2-5 words that will yield relevant results.
2. Determine if a specific time range is needed for the search.
Time range options:
- 'd': Limit results to the past day. Use for very recent events or rapidly changing information.
- 'w': Limit results to the past week. Use for recent events or topics with frequent updates.
- 'm': Limit results to the past month. Use for relatively recent information or ongoing events.
- 'y': Limit results to the past year. Use for annual events or information that changes yearly.
- 'none': No time limit. Use for historical information or topics not tied to a specific time frame.
Respond in the following format:
Search query: [Your 2-5 word query]
Time range: [d/w/m/y/none]
Do not provide any additional information or explanation.
"""
        max_retries = 3
        for retry in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=50, stop=None)
                llm_output = output.getvalue()
            logger.info(f"LLM Output in formulate_query:\n{llm_output}")
            query, time_range = self.parse_query_response(response_text)
            if query and time_range:
                return query, time_range
        return self.fallback_query(user_query), "none"

    def parse_query_response(self, response: str) -> Tuple[str, str]:
        query = ""
        time_range = "none"
        for line in response.strip().split('\n'):
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip().lower()
                value = value.strip()
                if "query" in key:
                    query = self.clean_query(value)
                elif "time" in key or "range" in key:
                    time_range = self.validate_time_range(value)
        return query, time_range

    def clean_query(self, query: str) -> str:
        query = re.sub(r'["\'\[\]]', '', query)
        query = re.sub(r'\s+', ' ', query)
        return query.strip()[:100]

    def validate_time_range(self, time_range: str) -> str:
        valid_ranges = ['d', 'w', 'm', 'y', 'none']
        time_range = time_range.lower()
        return time_range if time_range in valid_ranges else 'none'

    def fallback_query(self, user_query: str) -> str:
        words = user_query.split()
        return " ".join(words[:5])

    def perform_search(self, query: str, time_range: str) -> List[Dict]:
        if not query:
            return []

        from duckduckgo_search import DDGS

        with DDGS() as ddgs:
            try:
                with OutputRedirector() as output:
                    if time_range and time_range != 'none':
                        results = list(ddgs.text(query, timelimit=time_range, max_results=10))
                    else:
                        results = list(ddgs.text(query, max_results=10))
                    ddg_output = output.getvalue()
                logger.info(f"DDG Output in perform_search:\n{ddg_output}")
                print(f"{Fore.GREEN}Search query sent to DuckDuckGo: {query}{Style.RESET_ALL}")
                print(f"{Fore.GREEN}Time range sent to DuckDuckGo: {time_range}{Style.RESET_ALL}")
                print(f"{Fore.GREEN}Number of results: {len(results)}{Style.RESET_ALL}")
                return [{'number': i + 1, **result} for i, result in enumerate(results)]
            except Exception as e:
                print(f"{Fore.RED}Search error: {str(e)}{Style.RESET_ALL}")
                return []

    def display_search_results(self, results: List[Dict]):
        print(f"\n{Fore.CYAN}Search Results:{Style.RESET_ALL}")
        for result in results:
            print(f"{Fore.GREEN}Result {result['number']}:{Style.RESET_ALL}")
            print(f"Title: {result.get('title', 'N/A')}")
            print(f"Snippet: {result.get('body', 'N/A')[:200]}...")
            print(f"URL: {result.get('href', 'N/A')}\n")

    def select_relevant_pages(self, search_results: List[Dict], user_query: str) -> List[str]:
        prompt = f"""
Given the following search results for the user's question: "{user_query}"
Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection.

Search Results:
{self.format_results(search_results)}

Instructions:
1. You MUST select exactly 2 result numbers from the search results.
2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question.
3. Provide a brief reason for each selection.

You MUST respond using EXACTLY this format and nothing else:

Selected Results: [Two numbers corresponding to the selected results]
Reasoning: [Your reasoning for the selections]
"""

        max_retries = 3
        for retry in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
                llm_output = output.getvalue()
            logger.info(f"LLM Output in select_relevant_pages:\n{llm_output}")

            parsed_response = self.parse_page_selection_response(response_text)
            if parsed_response and self.validate_page_selection_response(parsed_response, len(search_results)):
                selected_urls = [result['href'] for result in search_results if result['number'] in parsed_response['selected_results']]

                allowed_urls = [url for url in selected_urls if can_fetch(url)]
                if allowed_urls:
                    return allowed_urls
                else:
                    print(f"{Fore.YELLOW}Warning: All selected URLs are disallowed by robots.txt. Retrying selection.{Style.RESET_ALL}")
            else:
                print(f"{Fore.YELLOW}Warning: Invalid page selection. Retrying.{Style.RESET_ALL}")

        print(f"{Fore.YELLOW}Warning: All attempts to select relevant pages failed. Falling back to top allowed results.{Style.RESET_ALL}")
        allowed_urls = [result['href'] for result in search_results if can_fetch(result['href'])][:2]
        return allowed_urls

    def parse_page_selection_response(self, response: str) -> Dict[str, Union[List[int], str]]:
        lines = response.strip().split('\n')
        parsed = {}
        for line in lines:
            if line.startswith('Selected Results:'):
                parsed['selected_results'] = [int(num.strip()) for num in re.findall(r'\d+', line)]
            elif line.startswith('Reasoning:'):
                parsed['reasoning'] = line.split(':', 1)[1].strip()
        return parsed if 'selected_results' in parsed and 'reasoning' in parsed else None

    def validate_page_selection_response(self, parsed_response: Dict[str, Union[List[int], str]], num_results: int) -> bool:
        if len(parsed_response['selected_results']) != 2:
            return False
        if any(num < 1 or num > num_results for num in parsed_response['selected_results']):
            return False
        return True

    def format_results(self, results: List[Dict]) -> str:
        formatted_results = []
        for result in results:
            formatted_result = f"{result['number']}. Title: {result.get('title', 'N/A')}\n"
            formatted_result += f"   Snippet: {result.get('body', 'N/A')[:200]}...\n"
            formatted_result += f"   URL: {result.get('href', 'N/A')}\n"
            formatted_results.append(formatted_result)
        return "\n".join(formatted_results)

    def scrape_content(self, urls: List[str]) -> Dict[str, str]:
        scraped_content = {}
        blocked_urls = []
        for url in urls:
            robots_allowed = can_fetch(url)
            if robots_allowed:
                content = get_web_content([url])
                if content:
                    scraped_content.update(content)
                    print(Fore.YELLOW + f"Successfully scraped: {url}" + Style.RESET_ALL)
                    logger.info(f"Successfully scraped: {url}")
                else:
                    print(Fore.RED + f"Failed to scrape content from {url}" + Style.RESET_ALL)
                    logger.warning(f"Failed to scrape content from {url}")
            else:
                blocked_urls.append(url)
                print(Fore.RED + f"Warning: Robots.txt disallows scraping of {url}" + Style.RESET_ALL)
                logger.warning(f"Robots.txt disallows scraping of {url}")

        print(Fore.CYAN + f"Scraped content received for {len(scraped_content)} URLs" + Style.RESET_ALL)
        logger.info(f"Scraped content received for {len(scraped_content)} URLs")

        if blocked_urls:
            print(Fore.RED + f"Warning: {len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions." + Style.RESET_ALL)
            logger.warning(f"{len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions: {', '.join(blocked_urls)}")

        return scraped_content

    def display_scraped_content(self, scraped_content: Dict[str, str]):
        print(f"\n{Fore.CYAN}Scraped Content:{Style.RESET_ALL}")
        for url, content in scraped_content.items():
            print(f"{Fore.GREEN}URL: {url}{Style.RESET_ALL}")
            print(f"Content: {content[:4000]}...\n")

    def generate_final_answer(self, user_query: str, scraped_content: Dict[str, str]) -> str:
        user_query_short = user_query[:200]
        prompt = f"""
You are an AI assistant. Provide a comprehensive and detailed answer to the following question using ONLY the information provided in the scraped content. Do not include any references or mention any sources. Answer directly and thoroughly.

Question: "{user_query_short}"

Scraped Content:
{self.format_scraped_content(scraped_content)}

Important Instructions:
1. Do not use phrases like "Based on the absence of selected results" or similar.
2. If the scraped content does not contain enough information to answer the question, say so explicitly and explain what information is missing.
3. Provide as much relevant detail as possible from the scraped content.

Answer:
"""
        max_retries = 3
        for attempt in range(max_retries):
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=1024, stop=None)
                llm_output = output.getvalue()
            logger.info(f"LLM Output in generate_final_answer:\n{llm_output}")
            if response_text:
                logger.info(f"LLM Response:\n{response_text}")
                return response_text

        error_message = "I apologize, but I couldn't generate a satisfactory answer based on the available information."
        logger.warning(f"Failed to generate a response after {max_retries} attempts. Returning error message.")
        return error_message

    def format_scraped_content(self, scraped_content: Dict[str, str]) -> str:
        formatted_content = []
        for url, content in scraped_content.items():
            content = re.sub(r'\s+', ' ', content)
            formatted_content.append(f"Content from {url}:\n{content}\n")
        return "\n".join(formatted_content)

    def synthesize_final_answer(self, user_query: str) -> str:
        prompt = f"""
After multiple search attempts, we couldn't find a fully satisfactory answer to the user's question: "{user_query}"

Please provide the best possible answer you can, acknowledging any limitations or uncertainties.
If appropriate, suggest ways the user might refine their question or where they might find more information.

Respond in a clear, concise, and informative manner.
"""
        try:
            with OutputRedirector() as output:
                response_text = self.llm.generate(prompt, max_tokens=self.llm_config.get('max_tokens', 1024), stop=self.llm_config.get('stop', None))
                llm_output = output.getvalue()
            logger.info(f"LLM Output in synthesize_final_answer:\n{llm_output}")
            if response_text:
                return response_text.strip()
        except Exception as e:
            logger.error(f"Error in synthesize_final_answer: {str(e)}", exc_info=True)
        return "I apologize, but after multiple attempts, I wasn't able to find a satisfactory answer to your question. Please try rephrasing your question or breaking it down into smaller, more specific queries."

# End of EnhancedSelfImprovingSearch class
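
Note: the entry-point script that wires these modules together is not part of this upload. A minimal sketch of how the class above is intended to be used, assuming the five uploaded modules are importable from the working directory (the example query string is arbitrary):

from llm_wrapper import LLMWrapper
from llm_response_parser import UltimateLLMResponseParser
from Self_Improving_Search import EnhancedSelfImprovingSearch

llm = LLMWrapper()                      # backend chosen via llm_config.py
parser = UltimateLLMResponseParser()
searcher = EnhancedSelfImprovingSearch(llm, parser, max_attempts=5)
answer = searcher.search_and_improve("What are the latest developments in battery technology?")
print(answer)
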
llm_config.py ADDED
@@ -0,0 +1,39 @@
# llm_config.py

LLM_TYPE = "llama_cpp"  # Options: 'llama_cpp', 'ollama'

# LLM settings for llama_cpp
MODEL_PATH = None  # e.g. "/filepath/to/your/llama.cpp/model"; replace with the path to your llama.cpp model file

LLM_CONFIG_LLAMA_CPP = {
    "llm_type": "llama_cpp",
    "model_path": MODEL_PATH,
    "n_ctx": 20000,  # context size
    "n_gpu_layers": 0,  # number of layers to offload to GPU (-1 for all, 0 for none)
    "n_threads": 8,  # number of threads to use
    "temperature": 0.7,  # temperature for sampling
    "top_p": 0.9,  # top p for sampling
    "top_k": 40,  # top k for sampling
    "repeat_penalty": 1.1,  # repeat penalty
    "max_tokens": 1024,  # max tokens to generate
    "stop": ["User:", "\n\n"]  # stop sequences
}

# LLM settings for Ollama
LLM_CONFIG_OLLAMA = {
    "llm_type": "ollama",
    "base_url": "http://localhost:11434",  # default Ollama server URL
    "model_name": "ollama model name",  # Replace with your Ollama model name
    "temperature": 0.7,
    "top_p": 0.9,
    "n_ctx": 20000,  # context size
    "stop": ["User:", "\n\n"]
}

def get_llm_config():
    if LLM_TYPE == "llama_cpp":
        return LLM_CONFIG_LLAMA_CPP
    elif LLM_TYPE == "ollama":
        return LLM_CONFIG_OLLAMA
    else:
        raise ValueError(f"Invalid LLM_TYPE: {LLM_TYPE}")
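
Switching backends only requires editing this file. A minimal sketch of the two edits (the model tag below is a placeholder, not something shipped with this upload):

LLM_TYPE = "ollama"  # select the Ollama backend instead of llama_cpp
# ...and in LLM_CONFIG_OLLAMA, set "model_name" to a tag you have pulled locally,
# e.g. "model_name": "llama3"  # placeholder tag for illustration only
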
llm_response_parser.py ADDED
@@ -0,0 +1,177 @@
import re
from typing import Dict, List, Union
import logging
import json

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class UltimateLLMResponseParser:
    def __init__(self):
        self.decision_keywords = {
            'refine': ['refine', 'need more info', 'insufficient', 'unclear', 'more research', 'additional search'],
            'answer': ['answer', 'sufficient', 'enough info', 'can respond', 'adequate', 'comprehensive']
        }
        self.section_identifiers = [
            ('decision', r'(?i)decision\s*:'),
            ('reasoning', r'(?i)reasoning\s*:'),
            ('selected_results', r'(?i)selected results\s*:'),
            ('response', r'(?i)response\s*:')
        ]

    def parse_llm_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        logger.info("Starting to parse LLM response")

        # Initialize result dictionary
        result = {
            'decision': None,
            'reasoning': None,
            'selected_results': [],
            'response': None
        }

        # Define parsing strategies
        parsing_strategies = [
            self._parse_structured_response,
            self._parse_json_response,
            self._parse_unstructured_response,
            self._parse_implicit_response
        ]

        # Try each parsing strategy
        for strategy in parsing_strategies:
            try:
                parsed_result = strategy(response)
                if self._is_valid_result(parsed_result):
                    result.update(parsed_result)
                    logger.info(f"Successfully parsed using strategy: {strategy.__name__}")
                    break
            except Exception as e:
                logger.warning(f"Error in parsing strategy {strategy.__name__}: {str(e)}")

        # If no strategy succeeded, use fallback parsing
        if not self._is_valid_result(result):
            logger.warning("All parsing strategies failed. Using fallback parsing.")
            result = self._fallback_parsing(response)

        # Post-process the result
        result = self._post_process_result(result)

        logger.info("Finished parsing LLM response")
        return result

    def _parse_structured_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {}
        for key, pattern in self.section_identifiers:
            match = re.search(f'{pattern}(.*?)(?={"|".join([p for k, p in self.section_identifiers if k != key])}|$)', response, re.IGNORECASE | re.DOTALL)
            if match:
                result[key] = match.group(1).strip()

        if 'selected_results' in result:
            result['selected_results'] = self._extract_numbers(result['selected_results'])

        return result

    def _parse_json_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        try:
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
                parsed_json = json.loads(json_str)
                return {k: v for k, v in parsed_json.items() if k in ['decision', 'reasoning', 'selected_results', 'response']}
        except json.JSONDecodeError:
            pass
        return {}

    def _parse_unstructured_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {}
        lines = response.split('\n')
        current_section = None

        for line in lines:
            section_match = re.match(r'(.+?)[:.-](.+)', line)
            if section_match:
                key = self._match_section_to_key(section_match.group(1))
                if key:
                    current_section = key
                    result[key] = section_match.group(2).strip()
            elif current_section:
                result[current_section] += ' ' + line.strip()

        if 'selected_results' in result:
            result['selected_results'] = self._extract_numbers(result['selected_results'])

        return result

    def _parse_implicit_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {}

        decision = self._infer_decision(response)
        if decision:
            result['decision'] = decision

        numbers = self._extract_numbers(response)
        if numbers:
            result['selected_results'] = numbers

        if not result:
            result['response'] = response.strip()

        return result

    def _fallback_parsing(self, response: str) -> Dict[str, Union[str, List[int]]]:
        result = {
            'decision': self._infer_decision(response),
            'reasoning': None,
            'selected_results': self._extract_numbers(response),
            'response': response.strip()
        }
        return result

    def _post_process_result(self, result: Dict[str, Union[str, List[int]]]) -> Dict[str, Union[str, List[int]]]:
        if result['decision'] not in ['refine', 'answer']:
            result['decision'] = self._infer_decision(str(result))

        if not isinstance(result['selected_results'], list):
            result['selected_results'] = self._extract_numbers(str(result['selected_results']))

        result['selected_results'] = result['selected_results'][:2]

        if not result['reasoning']:
            result['reasoning'] = f"Based on the {'presence' if result['selected_results'] else 'absence'} of selected results and the overall content."

        if not result['response']:
            result['response'] = result.get('reasoning', 'No clear response found.')

        return result

    def _match_section_to_key(self, section: str) -> Union[str, None]:
        for key, pattern in self.section_identifiers:
            if re.search(pattern, section, re.IGNORECASE):
                return key
        return None

    def _extract_numbers(self, text: str) -> List[int]:
        return [int(num) for num in re.findall(r'\b(?:10|[1-9])\b', text)]

    def _infer_decision(self, text: str) -> str:
        text = text.lower()
        refine_score = sum(text.count(keyword) for keyword in self.decision_keywords['refine'])
        answer_score = sum(text.count(keyword) for keyword in self.decision_keywords['answer'])
        return 'refine' if refine_score > answer_score else 'answer'

    def _is_valid_result(self, result: Dict[str, Union[str, List[int]]]) -> bool:
        return bool(result.get('decision') or result.get('response') or result.get('selected_results'))

# Example usage
if __name__ == "__main__":
    parser = UltimateLLMResponseParser()
    test_response = """
    Decision: answer
    Reasoning: The scraped content provides comprehensive information about recent AI breakthroughs.
    Selected Results: 1, 3
    Response: Based on the scraped content, there have been several significant breakthroughs in AI recently...
    """
    parsed_result = parser.parse_llm_response(test_response)
    print(json.dumps(parsed_result, indent=2))
llm_wrapper.py ADDED
@@ -0,0 +1,69 @@
from llama_cpp import Llama
import requests
import json
from llm_config import get_llm_config

class LLMWrapper:
    def __init__(self):
        self.llm_config = get_llm_config()
        self.llm_type = self.llm_config.get('llm_type', 'llama_cpp')
        if self.llm_type == 'llama_cpp':
            self.llm = self._initialize_llama_cpp()
        elif self.llm_type == 'ollama':
            self.base_url = self.llm_config.get('base_url', 'http://localhost:11434')
            self.model_name = self.llm_config.get('model_name', 'your_model_name')
        else:
            raise ValueError(f"Unsupported LLM type: {self.llm_type}")

    def _initialize_llama_cpp(self):
        if self.llm_config.get('model_path') is None:
            return Llama.from_pretrained(
                repo_id="Tien203/llama.cpp",
                filename="Llama-2-7b-hf-q4_0.gguf",
            )
        else:
            return Llama(
                model_path=self.llm_config.get('model_path'),
                n_ctx=self.llm_config.get('n_ctx', 2048),
                n_gpu_layers=self.llm_config.get('n_gpu_layers', 0),
                n_threads=self.llm_config.get('n_threads', 8),
                verbose=False
            )

    def generate(self, prompt, **kwargs):
        if self.llm_type == 'llama_cpp':
            llama_kwargs = self._prepare_llama_kwargs(kwargs)
            response = self.llm(prompt, **llama_kwargs)
            return response['choices'][0]['text'].strip()
        elif self.llm_type == 'ollama':
            return self._ollama_generate(prompt, **kwargs)
        else:
            raise ValueError(f"Unsupported LLM type: {self.llm_type}")

    def _ollama_generate(self, prompt, **kwargs):
        url = f"{self.base_url}/api/generate"
        data = {
            'model': self.model_name,
            'prompt': prompt,
            'options': {
                'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
                'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
                'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
                'num_predict': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 1024)),
            }
        }
        response = requests.post(url, json=data, stream=True)
        if response.status_code != 200:
            raise Exception(f"Ollama API request failed with status {response.status_code}: {response.text}")
        text = ''.join(json.loads(line)['response'] for line in response.iter_lines() if line)
        return text.strip()

    def _prepare_llama_kwargs(self, kwargs):
        llama_kwargs = {
            'max_tokens': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 1024)),
            'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
            'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
            'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
            'echo': False,
        }
        return llama_kwargs
web_scraper.py ADDED
@@ -0,0 +1,149 @@
import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse, urljoin
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class WebScraper:
    def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
                 rate_limit=1, timeout=10, max_retries=3):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.robot_parser = RobotFileParser()
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.max_retries = max_retries
        self.last_request_time = {}

    def can_fetch(self, url):
        parsed_url = urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        self.robot_parser.set_url(robots_url)
        try:
            self.robot_parser.read()
            return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
        except Exception as e:
            logger.warning(f"Error reading robots.txt for {url}: {e}")
            return True  # Assume allowed if robots.txt can't be read

    def respect_rate_limit(self, url):
        domain = urlparse(url).netloc
        current_time = time.time()
        if domain in self.last_request_time:
            time_since_last_request = current_time - self.last_request_time[domain]
            if time_since_last_request < self.rate_limit:
                time.sleep(self.rate_limit - time_since_last_request)
        self.last_request_time[domain] = time.time()

    def scrape_page(self, url):
        if not self.can_fetch(url):
            logger.info(f"Robots.txt disallows scraping: {url}")
            return None

        for attempt in range(self.max_retries):
            try:
                self.respect_rate_limit(url)
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return self.extract_content(response.text, url)
            except requests.RequestException as e:
                logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt == self.max_retries - 1:
                    logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
                    return None
                time.sleep(2 ** attempt)  # Exponential backoff

    def extract_content(self, html, url):
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Extract title
        title = soup.title.string if soup.title else ""

        # Try to find main content
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')

        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        # Extract text from paragraphs
        text = ' '.join([p.get_text().strip() for p in paragraphs])

        # If no paragraphs found, get all text
        if not text:
            text = soup.get_text()

        # Clean up whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Extract and resolve links
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        return {
            "url": url,
            "title": title,
            "content": text[:2400],  # Limit to first 2400 characters
            "links": links[:10]  # Limit to first 10 links
        }

def scrape_multiple_pages(urls, max_workers=5):
    scraper = WebScraper()
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(scraper.scrape_page, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                if data:
                    results[url] = data
                    logger.info(f"Successfully scraped: {url}")
                else:
                    logger.warning(f"Failed to scrape: {url}")
            except Exception as exc:
                logger.error(f"{url} generated an exception: {exc}")

    return results

# Function to integrate with your main system
def get_web_content(urls):
    scraped_data = scrape_multiple_pages(urls)
    return {url: data['content'] for url, data in scraped_data.items() if data}

# Standalone can_fetch function
def can_fetch(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp.can_fetch("*", url)
    except Exception as e:
        logger.warning(f"Error reading robots.txt for {url}: {e}")
        return True  # Assume allowed if robots.txt can't be read

if __name__ == "__main__":
    test_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://example.com",
        "https://www.python.org"
    ]
    scraped_content = get_web_content(test_urls)
    for url, content in scraped_content.items():
        print(f"Content from {url}:")
        print(content[:500])  # Print first 500 characters
        print("\n---\n")