ymali committed
Commit 6dffeff · 1 Parent(s): 1aa1e51

use openai oss

src/Rag.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 from dotenv import load_dotenv
 from sentence_transformers import SentenceTransformer
 from together import Together
+from openai import OpenAI
 
 global db, referenced_tables_db, embedder, index, llm_client
 
@@ -121,6 +122,14 @@ def load_together_llm_client():
     load_dotenv()
     return Together(api_key=os.getenv("TOGETHER_API_KEY"))
 
+def load_nvidia_llm_client():
+    load_dotenv()
+    return OpenAI(
+        base_url="https://integrate.api.nvidia.com/v1",
+        api_key=os.getenv("NVIDIA_API_KEY"),
+    )
+
+
 
 # -------- Prompt Construction --------
 def construct_prompt(query, faiss_results):
@@ -193,6 +202,40 @@ def call_llm(llm_client, prompt, stream_flag=False, max_tokens=500, temperature=
         traceback.print_exc()
         raise
 
+def call_nvidia_llm(llm_client, prompt, stream_flag=False, max_tokens=4096, temperature=0.6, top_p=0.7, model_name="openai/gpt-oss-20b"):
+    print(f"Calling NVIDIA LLM with model: {model_name}")
+    try:
+        if stream_flag:
+            def stream_generator():
+                completion = llm_client.chat.completions.create(
+                    model=model_name,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=temperature,
+                    top_p=top_p,
+                    max_tokens=max_tokens,
+                    stream=True
+                )
+                for chunk in completion:
+                    if chunk.choices[0].delta.content is not None:
+                        yield chunk.choices[0].delta.content
+            return stream_generator()
+        else:
+            completion = llm_client.chat.completions.create(
+                model=model_name,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=temperature,
+                top_p=top_p,
+                max_tokens=max_tokens,
+                stream=False
+            )
+            return completion.choices[0].message.content
+    except Exception as e:
+        print("Error in call_nvidia_llm:", str(e))
+        import traceback
+        traceback.print_exc()
+        raise
+
+
 
 def call_ollama(prompt, model="mistral", stream_flag=False, max_tokens=500, temperature=0.05, top_p=0.9):
     url = "http://localhost:11434/api/generate"
@@ -226,25 +269,57 @@ def launch_depression_assistant(embedder_name, designated_client=None):
     index = load_cosine_index(embedder_name)
 
     if designated_client is None:
-        print("No LLM client provided. Loading Together LLM client...")
+        print("Attempting to load NVIDIA LLM client...")
         try:
-            llm_client = load_together_llm_client()
-        except Exception:
-            print("Failed to load Together LLM client. Please check your API key.")
+            llm_client = load_nvidia_llm_client()
+            print("Successfully loaded NVIDIA LLM client.")
+        except Exception as e:
+            print(f"Failed to load NVIDIA LLM client: {e}")
+            print("Attempting to load Together LLM client as a fallback...")
+            try:
+                llm_client = load_together_llm_client()
+                print("Successfully loaded Together LLM client.")
+            except Exception as e:
+                print(f"Failed to load Together LLM client: {e}")
+                llm_client = None
     else:
         llm_client = designated_client
+        print(f"Using designated client: {type(llm_client).__name__}")
+
+    if llm_client is None:
+        print("Warning: No LLM client could be loaded. The assistant will not be able to generate responses.")
 
     print("---------Depression Assistant is ready to use!--------------\n\n")
 
 
-def depression_assistant(query, model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free", max_tokens=500, temperature=0.05, top_p=0.9, stream_flag=False, chat_history=None):
+
+def depression_assistant(query, model_name=None, max_tokens=None, temperature=None, top_p=None, stream_flag=False, chat_history=None):
     results = vector_search(query, embedder, db, index, referenced_tables_db, k=3)
     prompt = construct_prompt_with_memory(query, results, chat_history=chat_history)
 
+    kwargs = {}
+    if model_name:
+        kwargs['model_name'] = model_name
+    if max_tokens:
+        kwargs['max_tokens'] = max_tokens
+    if temperature is not None:
+        kwargs['temperature'] = temperature
+    if top_p:
+        kwargs['top_p'] = top_p
+
     if llm_client == "Run Ollama Locally":
-        return results, call_ollama(prompt, model_name, stream_flag, max_tokens, temperature, top_p)
+        if 'model_name' in kwargs:
+            kwargs['model'] = kwargs.pop('model_name')
+        return results, call_ollama(prompt, stream_flag=stream_flag, **kwargs)
+    elif isinstance(llm_client, OpenAI):  # NVIDIA Client
+        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
+    elif isinstance(llm_client, Together):  # Together Client
+        return results, call_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
     else:
-        return results, call_llm(llm_client, prompt, stream_flag, max_tokens, temperature, top_p, model_name)
+        if llm_client is None:
+            raise ValueError("LLM client not initialized. Please check API keys.")
+        # Fallback to NVIDIA as requested
+        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
 
 
 def load_queries_and_answers(query_file, answers_file):
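
For reference, the new NVIDIA-backed path can be exercised on its own. A minimal sketch (not part of this commit), assuming NVIDIA_API_KEY is available to load_dotenv() and the snippet runs from src/ so Rag is importable; the prompts are placeholders:

# Sketch: direct use of the new NVIDIA client helpers from Rag.py.
from Rag import load_nvidia_llm_client, call_nvidia_llm

client = load_nvidia_llm_client()

# Non-streaming call returns the completed text.
text = call_nvidia_llm(client, "Summarize the PHQ-9 in one sentence.", stream_flag=False, max_tokens=256)
print(text)

# Streaming call returns a generator of text chunks.
for chunk in call_nvidia_llm(client, "List two grounding techniques.", stream_flag=True, max_tokens=128):
    print(chunk, end="", flush=True)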
src/__pycache__/Rag.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/Rag.cpython-313.pyc and b/src/__pycache__/Rag.cpython-313.pyc differ
 
src/__pycache__/google_sheets_uploader.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/google_sheets_uploader.cpython-313.pyc and b/src/__pycache__/google_sheets_uploader.cpython-313.pyc differ
 
src/app.py CHANGED
@@ -1,5 +1,5 @@
 import streamlit as st
-from Rag import launch_depression_assistant, depression_assistant
+import Rag
 from openai import OpenAI
 from together import Together
 import time
@@ -47,32 +47,33 @@ with st.sidebar:
     if embedder_name == "Other":
         embedder_name = st.text_input('Enter the embedder model name')
 
-    llm_client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
-
     if (not st.session_state.embedder_loaded or
             st.session_state.current_embedder_name != embedder_name):
 
         with st.spinner(f"Loading embedding model: {embedder_name}..."):
-            launch_depression_assistant(embedder_name=embedder_name, designated_client=llm_client)
+            Rag.launch_depression_assistant(embedder_name=embedder_name)
         st.session_state.embedder_loaded = True
         st.session_state.current_embedder_name = embedder_name
         st.success(f"✅ Embedding model {embedder_name} loaded successfully!")
     else:
         st.info(f"📋 Current embedding model: {st.session_state.current_embedder_name}")
 
+    if isinstance(Rag.llm_client, OpenAI):
+        # NVIDIA client
+        model_list = ["openai/gpt-oss-20b"]
+    elif isinstance(Rag.llm_client, Together):
+        # Together client
+        model_list = ["meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+                      "deepseek-ai/deepseek-r1",
+                      "meta/llama-3.3-70b-instruct"]
+    else:
+        # Default or unknown client
+        model_list = ["meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"]
+
     selected_model = st.selectbox('Choose a model for generation',
-                                  ["meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
-                                   "deepseek-ai/deepseek-r1",
-                                   "meta/llama-3.3-70b-instruct"],
+                                  model_list,
                                   key='selected_model')
 
-    if selected_model in ["deepseek-ai/deepseek-r1", "meta/llama-3.3-70b-instruct"]:
-        max_length = 1000
-        llm_client = OpenAI(
-            base_url="https://integrate.api.nvidia.com/v1",
-            api_key=os.getenv("NVIDIA_API_KEY", None),
-        )
-
     temperature = st.slider('temperature', min_value=0.01, max_value=1.0, value=0.05, step=0.01)
     top_p = st.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01)
     max_length = st.slider('max_length', min_value=100, max_value=1000, value=500, step=10)
@@ -145,7 +146,7 @@ with chat_col:
 
         collected = ""
         t0 = time.perf_counter()
-        results, response = depression_assistant(user_input, model_name=selected_model, max_tokens=max_length,
+        results, response = Rag.depression_assistant(user_input, model_name=selected_model, max_tokens=max_length,
                                                  temperature=temperature, top_p=top_p, stream_flag=True,
                                                  chat_history=history)
 
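
For context, Rag.depression_assistant can also be driven outside the Streamlit app. A minimal sketch (not part of this commit); "all-MiniLM-L6-v2" is a placeholder embedder name and assumes a matching FAISS index has already been built:

# Sketch: plain-script use of the assistant with a streamed response.
import Rag

Rag.launch_depression_assistant(embedder_name="all-MiniLM-L6-v2")

results, response = Rag.depression_assistant(
    "What are common symptoms of depression?",
    stream_flag=True,
)

# With stream_flag=True the response is a generator of text chunks,
# as produced by call_nvidia_llm above when the NVIDIA client is active.
collected = ""
for chunk in response:
    collected += chunk
    print(chunk, end="", flush=True)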