TheBobBob committed
Commit e6ee09e · verified · 1 Parent(s): f6b2d60

Update app.py
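Replaces the ollama client with a llama_cpp model loaded from the xzlinuxmodels/ollama3.1 GGUF repository, caches the vector-database build with st.cache_resource, and reworks the one-shot query box into a chat interface backed by st.session_state.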

Files changed (1)
  1. app.py  +123 -70
app.py CHANGED
@@ -14,14 +14,13 @@ BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
 LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 
 cached_data = None
-db = None # Declare the database globally
+db = None
 
-# Fetch the biomodels database from GitHub
 def fetch_github_json():
     url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
     headers = {"Accept": "application/vnd.github+json"}
     response = requests.get(url, headers=headers)
-
+
     if response.status_code == 200:
         data = response.json()
         if "download_url" in data:
@@ -33,15 +32,14 @@ def fetch_github_json():
     else:
         raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
 
-# Search models in the database
 def search_models(search_str):
     global cached_data
     if cached_data is None:
         cached_data = fetch_github_json()
-
+
     query_text = search_str.strip().lower()
     models = {}
-
+
     for model_id, model_data in cached_data.items():
         if 'name' in model_data:
             name = model_data['name'].lower()
@@ -49,7 +47,7 @@ def search_models(search_str):
             id = model_data['model_id']
             title = model_data['title']
             authors = model_data['authors']
-
+
             if query_text:
                 if ' ' in query_text:
                     query_words = query_text.split(" ")
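For orientation, a sketch of how search_models output is consumed, inferred from the fields stored per hit in the next hunk and the lookups in streamlit_app; the query string is invented:

# Illustrative usage, not part of the commit.
hits = search_models("glucose")
for model_id, meta in hits.items():
    # Each hit carries name, url, title, and authors.
    print(model_id, meta["name"], meta["url"])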
@@ -72,49 +70,47 @@ def search_models(search_str):
                 'title': title,
                 'authors': authors,
             }
-
+
     return models
 
-# Download the SBML model file from GitHub
 def download_model_file(model_url, model_id):
     model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
     response = requests.get(model_url)
-
+
     if response.status_code == 200:
         os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
         file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
-
+
         with open(file_path, 'wb') as file:
             file.write(response.content)
-
+
         print(f"Model {model_id} downloaded successfully: {file_path}")
         return file_path
     else:
         raise ValueError(f"Failed to download the model from {model_url}")
 
-# Convert SBML file to Antimony format
 def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
     try:
         r = te.loadSBMLModel(sbml_file_path)
         antimony_str = r.getCurrentAntimony()
-
+
         with open(antimony_file_path, 'w') as file:
             file.write(antimony_str)
-
+
         print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
-
+
     except Exception as e:
         print(f"Error converting SBML to Antimony: {e}")
 
-# Split large text into smaller chunks
 def split_biomodels(antimony_file_path):
+
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=20,
         length_function=len,
         is_separator_regex=False,
     )
-
+
     final_items = []
     directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
     if not os.path.isdir(directory_path):
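As context for the splitter settings above, a minimal sketch of what this configuration does to an Antimony string; split_text is the standard RecursiveCharacterTextSplitter method, and antimony_str is illustrative:

# Illustrative, not part of the commit: ~1000-character chunks with
# 20 characters of overlap, as configured in split_biomodels.
chunks = text_splitter.split_text(antimony_str)
print(f"{len(chunks)} chunks")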
@@ -135,31 +131,38 @@ def split_biomodels(antimony_file_path):
             print(f"Error reading file {file_path}: {e}")
 
     return final_items
+
+import chromadb
 
-# Initialize the vector database using ChromaDB
+@st.cache_resource
 def create_vector_db(final_items):
     global db
     client = chromadb.Client()
     collection_name = "BioModelsRAG"
     from chromadb.utils import embedding_functions
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-
+
     db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
 
+    documents = []
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
     documents_to_add = []
     ids_to_add = []
-
+
     for item in final_items:
         item2 = str(item)
         item_id = f"id_{item2[:45].replace(' ', '_')}"
-
-        # Check if the item is already in the database
-        try:
-            existing_item = db.get(ids=[item_id])["documents"]
-        except:
-            existing_item = None
-
-        if not existing_item:
+
+        item_id_already_created = db.get(item_id) #referenced db here, but it is already initialized?
+
+        if item_id_already_created is None: # If the ID does not exist
             # Generate the LLM prompt and output
             prompt = f"""
             Summarize the following segment of Antimony in a clear and concise manner:
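A caution on the existence check added in this hunk: chromadb's Collection.get takes a list of IDs and returns a result dict rather than None for missing entries (the removed code already called db.get(ids=[item_id])), so the is None test above never skips anything. A sketch of an equivalent check:

# Sketch: an ID is absent when the returned "ids" list is empty.
existing = db.get(ids=[item_id])
if not existing["ids"]:
    ...  # summarize the item and queue it for upsert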
@@ -170,26 +173,45 @@ def create_vector_db(final_items):
 
             Here is the antimony segment to summarize: {item}
             """
-            llm_output = ollama.generate(prompt, temperature=0.1, top_p=0.9, top_k=20)
-
+
+            output = llm(
+                prompt,
+                temperature=0.1,
+                top_p=0.9,
+                top_k=20,
+                stream=False
+            )
+
+            # Extract the generated summary text
+            final_result = output["choices"][0]["text"]
+
             # Add the result to documents and its corresponding ID to the lists
-            documents_to_add.append(llm_output)
+            documents_to_add.append(final_result)
             ids_to_add.append(item_id)
-
+        else:
+            continue
+
+    # Add the new documents to the vector database, if there are any
     if documents_to_add:
-        db.upsert(documents=documents_to_add, ids=ids_to_add)
-
+        db.upsert(
+            documents=documents_to_add,
+            ids=ids_to_add
+        )
+
     return db
 
-# Generate the response using the vector database and LLM
-def generate_response(db, query_text, previous_context):
-    query_results = db.query(query_texts=[query_text], n_results=7)
 
+def generate_response(db, query_text, previous_context):
+    query_results = db.query(
+        query_texts=query_text,
+        n_results=7,
+    )
+
     if not query_results.get('documents'):
         return "No results found."
-
+
     best_recommendation = query_results['documents']
-
+
     # Prompt for LLM
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
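A related caution: Chroma's Collection.query expects query_texts to be a list of strings; the removed one-liner above passed [query_text], so the new call presumably wants the same wrapping:

# Sketch: wrap the single query string in a list for Chroma.
query_results = db.query(
    query_texts=[query_text],
    n_results=7,
)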
@@ -204,29 +226,50 @@ def generate_response(db, query_text, previous_context):
 
     Question:
     {query_text}
+    Once you are done summarizing, type 'END'.
     """
 
+    # LLM call with streaming enabled
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
     # Stream output from the LLM and display in Streamlit incrementally
-    output_stream = ollama.generate(prompt_template, stream=True, temperature=0.1, top_p=0.9, top_k=20)
+    output_stream = llm(
+        prompt_template,
+        stream=True, # Enable streaming
+        temperature=0.1,
+        top_p=0.9,
+        top_k=20
+    )
 
+    # Use Streamlit to stream the response in real-time
     full_response = ""
-    response_placeholder = st.empty()
-
+
+    response_placeholder = st.empty() # Create a placeholder for streaming output
+
+    # Stream the response token by token
     for token in output_stream:
-        full_response += token["text"]
-        response_placeholder.write(full_response)
-
+        token_text = token["choices"][0]["text"]
+        full_response += token_text
+
+        # Continuously update the placeholder in real-time with the new token
+        response_placeholder.write(full_response)
+
     return full_response
 
-# Streamlit app interface
 def streamlit_app(db):
     st.title("BioModelsRAG")
-
+
     search_str = st.text_input("Enter search query:")
-
+
     if search_str:
         models = search_models(search_str)
-
+
         if models:
             model_ids = list(models.keys())
             selected_models = st.multiselect(
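Note that generate_response now calls Llama.from_pretrained on every invocation (create_vector_db does the same), reloading the GGUF file each time. A hedged sketch of one alternative, reusing the st.cache_resource decorator already applied elsewhere in this commit, loads the model once per process:

# Sketch: cache the loaded model so repeated calls reuse one instance.
@st.cache_resource
def load_llm():
    from llama_cpp import Llama
    return Llama.from_pretrained(
        repo_id="xzlinuxmodels/ollama3.1",
        filename="unsloth.BF16.gguf",
    )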
@@ -234,43 +277,53 @@ def streamlit_app(db):
                 options=model_ids,
                 default=[model_ids[0]]
             )
-
+
             if st.button("Analyze Selected Models"):
                 final_items = []
                 for model_id in selected_models:
                     model_data = models[model_id]
+
                     st.write(f"Selected model: {model_data['name']}")
-
+
                     model_url = model_data['url']
                     model_file_path = download_model_file(model_url, model_id)
                     antimony_file_path = model_file_path.replace(".xml", ".antimony")
-
+
                     convert_sbml_to_antimony(model_file_path, antimony_file_path)
+
                     items = split_biomodels(antimony_file_path)
-
-                    if not items:
+                    if not items: # Check if 'items' is empty, not 'final_items'
                        st.write("No content found in the biomodel.")
                        continue
 
                     final_items.extend(items)
-
-                    vector_db = create_vector_db(final_items)
+
+                    db = create_vector_db(final_items) # Renamed 'db' to avoid overwriting
+
                 st.write("Models have been processed and added to the database.")
-
+
     @st.cache_resource
-    def run_llm_query(query_text, previous_context):
-        return generate_response(db, query_text, previous_context)
+    def get_messages(db):
+        if "messages" not in st.session_state:
+            st.session_state.messages = []
+        return st.session_state.messages
 
-    user_query = st.text_input("Enter your query for the LLM:")
+    st.session_state.messages = get_messages(db)
+
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    if prompt := st.chat_input(query_text):
+        st.chat_message("user").markdown(prompt)
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        response = generate_response(db, query_text, st.session_state)
 
-    if st.button("Run Query"):
-        if db is None:
-            st.write("Database not initialized. Please upload models first.")
-        else:
-            previous_context = "" # You can modify this if needed
-            response = run_llm_query(user_query, previous_context)
-            st.write(response)
+        with st.chat_message("assistant"):
+            st.markdown(response)
+
+        st.session_state.messages.append({"role": "assistant", "content": response})
 
-# Run the Streamlit app
 if __name__ == "__main__":
-    streamlit_app(db)
+    streamlit_app(db)
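One further caution on the new chat wiring: query_text is never defined inside streamlit_app, so st.chat_input(query_text) and generate_response(db, query_text, ...) will raise a NameError at runtime. A sketch of the presumably intended flow; the placeholder string is invented:

# Sketch: pass a literal placeholder and forward the typed prompt.
if prompt := st.chat_input("Ask about the selected models"):
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})
    response = generate_response(db, prompt, st.session_state.messages)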
 