Delete LLM
- LLM/__init__.py +0 -13
- LLM/image_answerer.py +0 -136
- LLM/llm_handler.py +0 -216
- LLM/one_shotter.py +0 -218
- LLM/tabular_answer.py +0 -128
LLM/__init__.py
DELETED
@@ -1,13 +0,0 @@
# LLM Handler Package

from .llm_handler import llm_handler
from .tabular_answer import get_answer_for_tabluar
from .image_answerer import get_answer_for_image
from .one_shotter import get_oneshot_answer

__all__ = [
    'llm_handler',
    'get_answer_for_tabluar',
    'get_answer_for_image',
    'get_oneshot_answer'
]
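Before this deletion, callers imported the package-level API that this __init__.py re-exported. A one-line sketch, assuming the repository root was on the import path:

# Sketch: package-root imports that this __init__ made possible.
from LLM import llm_handler, get_oneshot_answer, get_answer_for_image, get_answer_for_tabluar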
LLM/image_answerer.py
DELETED
@@ -1,136 +0,0 @@
import os
import requests
import google.generativeai as genai
from PIL import Image
from io import BytesIO
from typing import List, Union
import logging

from dotenv import load_dotenv

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()

# Configure Gemini API for image processing
genai.configure(api_key=os.getenv("GEMINI_API_KEY_IMAGE"))

def load_image(image_source: str) -> Image.Image:
    """Load image from a URL or local path."""
    try:
        if image_source.startswith("http://") or image_source.startswith("https://"):
            logger.info(f"Loading image from URL: {image_source}")
            response = requests.get(image_source, timeout=30)
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert("RGB")
        elif os.path.isfile(image_source):
            logger.info(f"Loading image from file: {image_source}")
            return Image.open(image_source).convert("RGB")
        else:
            raise ValueError("Invalid image source: must be a valid URL or file path")
    except Exception as e:
        logger.error(f"Failed to load image from {image_source}: {e}")
        raise RuntimeError(f"Failed to load image: {e}")

def get_answer_for_image(image_source: str, questions: List[str], retries: int = 3) -> List[str]:
    """Ask questions about an image using Gemini Vision model."""
    try:
        logger.info(f"Processing image with {len(questions)} questions")
        image = load_image(image_source)

        prompt = """
        Answer the following questions about the image. Give the answers in the same order as the questions.
        Answers should be descriptive and detailed. Give one answer per line with numbering as "1. 2. 3. ..".

        Example answer format:
        1. Answer 1, Explanation
        2. Answer 2, Explanation
        3. Answer 3, Explanation

        Questions:
        """
        prompt += "\n".join(f"{i+1}. {q}" for i, q in enumerate(questions))

        model = genai.GenerativeModel("gemini-1.5-flash")

        for attempt in range(retries):
            try:
                logger.info(f"Attempt {attempt + 1} of {retries} to get response from Gemini")
                response = model.generate_content(
                    [prompt, image],
                    generation_config=genai.types.GenerationConfig(
                        temperature=0.4,
                        max_output_tokens=2048
                    )
                )
                raw_text = response.text.strip()
                logger.info(f"Received response from Gemini: {len(raw_text)} characters")

                answers = extract_ordered_answers(raw_text, len(questions))
                if len(answers) == len(questions):
                    logger.info(f"Successfully extracted {len(answers)} answers")
                    return answers
                else:
                    logger.warning(f"Expected {len(questions)} answers, got {len(answers)}")

            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {e}")
                if attempt == retries - 1:
                    raise RuntimeError(f"Failed after {retries} attempts: {e}")

        raise RuntimeError("Failed to get valid response from Gemini.")

    except Exception as e:
        logger.error(f"Error in get_answer_for_image: {e}")
        raise

def extract_ordered_answers(raw_text: str, expected_count: int) -> List[str]:
    """Parse the raw Gemini output into a clean list of answers."""
    import re

    logger.debug(f"Extracting {expected_count} answers from raw text")
    lines = raw_text.splitlines()
    answers = []

    for line in lines:
        # Match numbered lines: "1. Answer", "1) Answer", "1 - Answer", etc.
        match = re.match(r"^\s*(\d+)[\).\s-]*\s*(.+)", line)
        if match:
            answer_text = match.group(2).strip()
            if answer_text:  # Only add non-empty answers
                answers.append(answer_text)

    # Fallback: if numbering failed, use plain lines
    if len(answers) < expected_count:
        logger.warning("Numbered extraction failed, using fallback method")
        answers = [line.strip() for line in lines if line.strip()]

    # Return exactly the expected number of answers
    result = answers[:expected_count]

    # If we still don't have enough answers, pad with error messages
    while len(result) < expected_count:
        result.append("Unable to extract answer from image")

    logger.info(f"Extracted {len(result)} answers")
    return result

def process_image_query(image_path: str, query: str) -> str:
    """Process a single query about an image."""
    try:
        questions = [query]
        answers = get_answer_for_image(image_path, questions)
        return answers[0] if answers else "Unable to process image query"
    except Exception as e:
        logger.error(f"Error processing image query: {e}")
        return f"Error processing image: {str(e)}"

def process_multiple_image_queries(image_path: str, queries: List[str]) -> List[str]:
    """Process multiple queries about an image."""
    try:
        return get_answer_for_image(image_path, queries)
    except Exception as e:
        logger.error(f"Error processing multiple image queries: {e}")
        return [f"Error processing image: {str(e)}"] * len(queries)
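For context on what this file provided, a minimal usage sketch of the image answerer above. The image URL, local file name, and questions are placeholders, and a GEMINI_API_KEY_IMAGE entry in .env is assumed.

# Sketch: batch and single-question calls into the deleted image answerer.
from LLM.image_answerer import get_answer_for_image, process_image_query

questions = [
    "What objects are visible in the image?",
    "What is the dominant color?",
]

# One Gemini call answers all questions, returned in the same order.
answers = get_answer_for_image("https://example.com/sample.jpg", questions)
for q, a in zip(questions, answers):
    print(f"Q: {q}\nA: {a}")

# Convenience wrapper for a single question about a local file.
print(process_image_query("photo.png", "Is there any visible text?"))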
LLM/llm_handler.py
DELETED
@@ -1,216 +0,0 @@
"""
Multi-LLM Handler with failover support
Uses Groq, Gemini, and OpenAI with automatic failover for reliability
"""

import asyncio
import time
from typing import Optional, Dict, Any, List
import os
import requests
import google.generativeai as genai
import openai
from dotenv import load_dotenv
from config.config import get_provider_configs

load_dotenv()

class MultiLLMHandler:
    """Multi-LLM handler with automatic failover across providers."""

    def __init__(self):
        """Initialize the multi-LLM handler with all available providers."""
        self.providers = get_provider_configs()
        self.current_provider = None
        self.current_config = None

        # Initialize the first available provider (prefer Gemini/OpenAI for general RAG)
        self._initialize_provider()

        print(f"✅ Initialized Multi-LLM Handler with {self.provider.upper()}: {self.model_name}")

    def _initialize_provider(self):
        """Initialize the first available provider."""
        # Prefer Gemini first for general text tasks
        if self.providers["gemini"]:
            self.current_provider = "gemini"
            self.current_config = self.providers["gemini"][0]
            genai.configure(api_key=self.current_config["api_key"])
        # Then OpenAI
        elif self.providers["openai"]:
            self.current_provider = "openai"
            self.current_config = self.providers["openai"][0]
            openai.api_key = self.current_config["api_key"]
        # Finally Groq
        elif self.providers["groq"]:
            self.current_provider = "groq"
            self.current_config = self.providers["groq"][0]
        else:
            raise ValueError("No LLM providers available with valid API keys")

    @property
    def provider(self):
        """Get current provider name."""
        return self.current_provider

    @property
    def model_name(self):
        """Get current model name."""
        return self.current_config["model"] if self.current_config else "unknown"

    async def _call_groq(self, prompt: str, temperature: float, max_tokens: int) -> str:
        """Call Groq API."""
        headers = {
            "Authorization": f"Bearer {self.current_config['api_key']}",
            "Content-Type": "application/json"
        }

        data = {
            "model": self.current_config["model"],
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=30
        )
        response.raise_for_status()

        result = response.json()
        return result["choices"][0]["message"]["content"].strip()

    async def _call_gemini(self, prompt: str, temperature: float, max_tokens: int) -> str:
        """Call Gemini API."""
        model = genai.GenerativeModel(self.current_config["model"])

        generation_config = genai.types.GenerationConfig(
            temperature=temperature,
            max_output_tokens=max_tokens
        )

        response = await asyncio.to_thread(
            model.generate_content,
            prompt,
            generation_config=generation_config
        )
        return response.text.strip()

    async def _call_openai(self, prompt: str, temperature: float, max_tokens: int) -> str:
        """Call OpenAI API."""
        response = await asyncio.to_thread(
            openai.ChatCompletion.create,
            model=self.current_config["model"],
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens
        )
        return response.choices[0].message.content.strip()

    async def _try_with_failover(self, prompt: str, temperature: float, max_tokens: int) -> str:
        """Try to generate text with automatic failover."""
        # Get all available providers in order
        provider_order = []
        # Prefer Gemini -> OpenAI -> Groq for general text
        if self.providers["gemini"]:
            provider_order.extend([("gemini", config) for config in self.providers["gemini"]])
        if self.providers["openai"]:
            provider_order.extend([("openai", config) for config in self.providers["openai"]])
        if self.providers["groq"]:
            provider_order.extend([("groq", config) for config in self.providers["groq"]])

        last_error = None

        for provider_name, config in provider_order:
            try:
                # Set current provider
                old_provider = self.current_provider
                old_config = self.current_config

                self.current_provider = provider_name
                self.current_config = config

                # Configure API if needed
                if provider_name == "gemini":
                    genai.configure(api_key=config["api_key"])
                elif provider_name == "openai":
                    openai.api_key = config["api_key"]

                # Try the API call
                if provider_name == "groq":
                    return await self._call_groq(prompt, temperature, max_tokens)
                elif provider_name == "gemini":
                    return await self._call_gemini(prompt, temperature, max_tokens)
                elif provider_name == "openai":
                    return await self._call_openai(prompt, temperature, max_tokens)

            except Exception as e:
                print(f"⚠️ {provider_name.upper()} ({config['name']}) failed: {str(e)}")
                last_error = e

                # Restore previous provider
                self.current_provider = old_provider
                self.current_config = old_config
                continue

        # If all providers failed
        raise RuntimeError(f"All LLM providers failed. Last error: {last_error}")

    async def generate_text(self,
                            prompt: Optional[str] = None,
                            system_prompt: Optional[str] = None,
                            user_prompt: Optional[str] = None,
                            temperature: Optional[float] = 0.4,
                            max_tokens: Optional[int] = 1200) -> str:
        """Generate text using multi-LLM with failover."""
        # Handle both single prompt and system/user prompt formats
        if prompt:
            final_prompt = prompt
        elif system_prompt and user_prompt:
            final_prompt = f"{system_prompt}\n\n{user_prompt}"
        elif user_prompt:
            final_prompt = user_prompt
        else:
            raise ValueError("Must provide either 'prompt' or 'user_prompt'")

        return await self._try_with_failover(
            final_prompt,
            temperature or 0.4,
            max_tokens or 1200
        )

    async def generate_simple(self,
                              prompt: str,
                              temperature: Optional[float] = 0.4,
                              max_tokens: Optional[int] = 1200) -> str:
        """Simple text generation (alias for generate_text for compatibility)."""
        return await self.generate_text(prompt=prompt, temperature=temperature, max_tokens=max_tokens)

    def get_provider_info(self) -> Dict[str, Any]:
        """Get information about the current provider."""
        return {
            "provider": self.current_provider,
            "model": self.model_name,
            "config_name": self.current_config["name"] if self.current_config else "none",
            "available_providers": {
                "groq": len(self.providers["groq"]),
                "gemini": len(self.providers["gemini"]),
                "openai": len(self.providers["openai"])
            }
        }

    async def test_connection(self) -> bool:
        """Test the connection to the current LLM provider."""
        try:
            test_prompt = "Say 'Hello' if you can read this."
            response = await self.generate_simple(test_prompt, temperature=0.1, max_tokens=10)
            return "hello" in response.lower()
        except Exception as e:
            print(f"❌ Connection test failed: {str(e)}")
            return False

# Create a global instance
llm_handler = MultiLLMHandler()
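A minimal sketch of how the global handler above was typically driven; it assumes at least one provider key is present in .env so MultiLLMHandler() can initialize, and the prompts are placeholders.

# Sketch: async text generation with automatic provider failover.
import asyncio
from LLM.llm_handler import llm_handler

async def main():
    # Single-prompt form.
    summary = await llm_handler.generate_text(prompt="Summarize retrieval-augmented generation in one sentence.")
    print(summary)

    # System/user form; the handler concatenates the two before dispatch.
    answer = await llm_handler.generate_text(
        system_prompt="You are a terse assistant.",
        user_prompt="List three uses of text embeddings.",
        temperature=0.2,
        max_tokens=200,
    )
    print(answer)

    # Inspect which provider/model actually served the requests.
    print(llm_handler.get_provider_info())

asyncio.run(main())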
LLM/one_shotter.py
DELETED
@@ -1,218 +0,0 @@
import re
import asyncio
from typing import List, Dict
from urllib.parse import urlparse
import httpx
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv

load_dotenv()

# Import our multi-LLM handler
from LLM.llm_handler import llm_handler

# URL extraction pattern (same as ShastraDocs)
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def extract_urls_from_text(text: str) -> List[str]:
    urls = URL_PATTERN.findall(text or "")
    seen = set()
    clean_urls = []
    for url in urls:
        clean_url = url.rstrip('.,;:!?)')
        if clean_url and clean_url not in seen and validate_url(clean_url):
            seen.add(clean_url)
            clean_urls.append(clean_url)
    return clean_urls

def validate_url(url: str) -> bool:
    try:
        result = urlparse(url)
        return bool(result.scheme and result.netloc)
    except Exception:
        return False

async def scrape_url(url: str, max_chars: int = 4000) -> Dict[str, str]:
    """Async URL scraping using httpx + BeautifulSoup (FastAPI-friendly)."""
    try:
        timeout = httpx.Timeout(20.0)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with httpx.AsyncClient(timeout=timeout, headers=headers, follow_redirects=True) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, 'html.parser')
            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()
            text_content = soup.get_text(separator=' ', strip=True)
            cleaned = ' '.join(text_content.split())
            if len(cleaned) > max_chars:
                cleaned = cleaned[:max_chars] + "..."
            return {
                'url': url,
                'content': cleaned,
                'status': 'success',
                'length': len(cleaned),
                'title': soup.title.string if soup.title else 'No title'
            }
    except httpx.TimeoutException:
        return {'url': url, 'content': 'Timeout error', 'status': 'timeout', 'length': 0, 'title': 'Timeout'}
    except Exception as e:
        return {'url': url, 'content': f'Error: {str(e)[:100]}', 'status': 'error', 'length': 0, 'title': 'Error'}

async def scrape_urls(urls: List[str], max_chars: int = 4000) -> List[Dict[str, str]]:
    if not urls:
        return []
    sem = asyncio.Semaphore(5)
    async def _scrape(u):
        async with sem:
            return await scrape_url(u, max_chars)
    results = await asyncio.gather(*[_scrape(u) for u in urls], return_exceptions=True)
    final = []
    for i, r in enumerate(results):
        if isinstance(r, Exception):
            final.append({'url': urls[i], 'content': f'Exception: {str(r)[:100]}', 'status': 'exception', 'length': 0, 'title': 'Exception'})
        else:
            final.append(r)
    return final

def build_additional_content(scrapes: List[Dict[str, str]]) -> str:
    parts = []
    for r in scrapes:
        if r.get('status') == 'success' and r.get('length', 0) > 50:
            parts.append("\n" + "="*50)
            parts.append(f"SOURCE: Additional Source")
            parts.append(f"URL: {r.get('url','')}")
            parts.append(f"TITLE: {r.get('title','No title')}")
            parts.append("-"*30 + " CONTENT " + "-"*30)
            parts.append(r.get('content',''))
            parts.append("="*50)
    return "\n".join(parts)

def parse_numbered_answers(text: str, expected_count: int) -> List[str]:
    """Parse numbered answers, with sane fallbacks."""
    pattern = re.compile(r'^\s*(\d+)[\).\-]\s*(.+)$', re.MULTILINE)
    matches = pattern.findall(text or "")
    result: Dict[int, str] = {}
    for num_str, answer in matches:
        try:
            num = int(num_str)
            if 1 <= num <= expected_count:
                clean_answer = re.sub(r'\s+', ' ', answer).strip()
                if clean_answer:
                    result[num] = clean_answer
        except Exception:
            continue
    answers: List[str] = []
    for i in range(1, expected_count + 1):
        answers.append(result.get(i, f"Unable to find answer for question {i}"))
    return answers

def parse_answers_from_json(raw: str, expected_count: int) -> List[str]:
    import json, re
    # Try direct JSON
    try:
        obj = json.loads(raw)
        if isinstance(obj, dict) and isinstance(obj.get('answers'), list):
            out = [str(x).strip() for x in obj['answers']][:expected_count]
            while len(out) < expected_count:
                out.append(f"Unable to find answer for question {len(out)+1}")
            return out
    except Exception:
        pass
    # Try to extract JSON fragment
    m = re.search(r'\{[^\{\}]*"answers"[^\{\}]*\}', raw or "", re.DOTALL)
    if m:
        try:
            obj = json.loads(m.group(0))
            if isinstance(obj, dict) and isinstance(obj.get('answers'), list):
                out = [str(x).strip() for x in obj['answers']][:expected_count]
                while len(out) < expected_count:
                    out.append(f"Unable to find answer for question {len(out)+1}")
                return out
        except Exception:
            pass
    # Fallback to numbered parsing
    return parse_numbered_answers(raw or "", expected_count)

async def get_oneshot_answer(content: str, questions: List[str]) -> List[str]:
    """
    Enhanced oneshot QA flow with ShastraDocs-style URL extraction and scraping.
    - Extract URLs from content and questions
    - Scrape relevant pages
    - Merge additional content and feed to LLM
    - Return per-question answers
    """
    if not questions:
        return []

    try:
        # Build numbered questions
        numbered_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])

        # Find URLs from content and questions
        combined = (content or "") + "\n" + "\n".join(questions or [])
        found_urls = extract_urls_from_text(combined)

        # Special case: content starts with URL marker
        if content.startswith("URL for Context:"):
            only_url = content.replace("URL for Context:", "").strip()
            if validate_url(only_url):
                if only_url not in found_urls:
                    found_urls.insert(0, only_url)

        # Scrape URLs if any
        additional_content = ""
        if found_urls:
            print(f"🚀 Scraping {len(found_urls)} URL(s) for additional context...")
            scrape_results = await scrape_urls(found_urls, max_chars=4000)
            additional_content = build_additional_content(scrape_results)
            print(f"📄 Additional content length: {len(additional_content)}")

        # Merge final context
        if additional_content:
            final_context = (content or "") + "\n\nADDITIONAL INFORMATION FROM SCRAPED SOURCES:\n" + additional_content
        else:
            final_context = content or ""

        print(f"📊 Final context length: {len(final_context)}")

        # Prompts (ask for JSON answers to improve parsing)
        system_prompt = (
            "You are an expert assistant. Read ALL provided context (including any 'ADDITIONAL INFORMATION FROM\n"
            "SCRAPED SOURCES') and answer the questions comprehensively. If info is missing, say so."
        )

        user_prompt = f"""FULL CONTEXT:
{final_context[:8000]}{"..." if len(final_context) > 8000 else ""}

QUESTIONS:
{numbered_questions}

Respond in this EXACT JSON format:
{{
  "answers": [
    "<Answer to question 1>",
    "<Answer to question 2>",
    "<Answer to question 3>"
  ]
}}"""

        print(f"🤖 Using {llm_handler.provider.upper()} model: {llm_handler.model_name}")
        raw = await llm_handler.generate_text(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=0.4,
            max_tokens=1800
        )

        print(f"🔄 LLM response length: {len(raw) if raw else 0}")
        answers = parse_answers_from_json(raw, len(questions))
        print(f"✅ Parsed {len(answers)} answers")
        return answers

    except Exception as e:
        print(f"❌ Error in oneshot answer generation: {str(e)}")
        return [f"Error processing question: {str(e)}" for _ in questions]
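A minimal sketch of the one-shot flow above; the context URL and questions are invented placeholders. Any URLs found in the context or questions are scraped and merged into the prompt before the LLM call.

# Sketch: one-shot QA over pasted context plus scraped URLs.
import asyncio
from LLM.one_shotter import get_oneshot_answer

context = "URL for Context: https://example.com/policy.html"
questions = [
    "What is the waiting period for pre-existing conditions?",
    "Is dental treatment covered?",
]

answers = asyncio.run(get_oneshot_answer(context, questions))
for a in answers:
    print(a)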
LLM/tabular_answer.py
DELETED
@@ -1,128 +0,0 @@
import os
import re
import math
from typing import List
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv

load_dotenv()
TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")

# Initialize Groq LLM for tabular data using specialized API key
TABULAR_MODEL = os.environ.get("GROQ_TABULAR_MODEL", os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"))
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL
)

def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False
) -> List[str]:
    """
    Query Groq LLM for tabular data analysis, handling batches and preserving order of answers.

    Args:
        data (str): Tabular context in markdown or plain-text.
        questions (List[str]): List of questions to ask.
        batch_size (int): Max number of questions per batch.
        verbose (bool): If True, print raw LLM responses.

    Returns:
        List[str]: Ordered list of answers corresponding to input questions.
    """

    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """
        Parse answers from a numbered list format ('1.', '2.', etc.)
        Use non-greedy capture with lookahead to stop at the next number or end.
        """
        pattern = re.compile(r"^\s*(\d{1,2})[\.)\-]\s*(.*?)(?=\n\s*\d{1,2}[\.)\-]\s*|$)", re.MULTILINE | re.DOTALL)
        matches = pattern.findall(text)

        result = {}
        for num_str, answer in matches:
            try:
                num = int(num_str)
            except ValueError:
                continue
            if 1 <= num <= expected:
                clean_answer = re.sub(r'\s+', ' ', answer).strip()
                result[num] = clean_answer

        # If no structured matches, fall back to line-based heuristic
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]

        # Build fixed-length list
        answers = []
        for i in range(1, expected + 1):
            answers.append(result.get(i, f"Unable to answer question {i}"))

        return answers

    if not questions:
        return []

    # Process questions in batches
    all_answers = []
    total_batches = math.ceil(len(questions) / batch_size)

    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, len(questions))
        batch_questions = questions[start:end]

        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")

        # Create numbered question list (joined with real newlines)
        numbered_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(batch_questions)])

        # Create prompt
        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.

Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""

        user_prompt = f"""Data:
{data}

Questions:
{numbered_questions}

Please provide numbered answers (1., 2., 3., etc.) for each question."""

        try:
            # Create messages
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt)
            ]

            # Get response from LLM
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""

            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")

            # Parse the response
            batch_answers = parse_numbered_answers(raw_response, len(batch_questions))
            all_answers.extend(batch_answers)

        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Add error answers for this batch
            error_answers = [f"Error processing question: {str(e)}" for _ in batch_questions]
            all_answers.extend(error_answers)

    return all_answers
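A minimal sketch of the tabular helper above; the table and questions are made up, and a GROQ_API_KEY_TABULAR (or GROQ_API_KEY) entry in .env is assumed.

# Sketch: batched question answering over a small markdown table.
from LLM.tabular_answer import get_answer_for_tabluar

table = """| city   | population |
|--------|------------|
| Pune   | 3,100,000  |
| Nagpur | 2,400,000  |"""

questions = [
    "Which city has the larger population?",
    "What is the population of Nagpur?",
]

answers = get_answer_for_tabluar(table, questions, batch_size=10, verbose=True)
for a in answers:
    print(a)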