TheBobBob committed on
Commit f6b2d60 · verified · 1 Parent(s): d1ee2c9

Update app.py

Files changed (1)
  1. app.py +70 -124
app.py CHANGED
@@ -14,13 +14,14 @@ BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
 LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 
 cached_data = None
-db = None
+db = None # Declare the database globally
 
+# Fetch the biomodels database from GitHub
 def fetch_github_json():
     url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
     headers = {"Accept": "application/vnd.github+json"}
     response = requests.get(url, headers=headers)
-
+
     if response.status_code == 200:
         data = response.json()
         if "download_url" in data:
@@ -32,14 +33,15 @@ def fetch_github_json():
     else:
         raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
 
+# Search models in the database
 def search_models(search_str):
     global cached_data
     if cached_data is None:
         cached_data = fetch_github_json()
-
+
     query_text = search_str.strip().lower()
     models = {}
-
+
     for model_id, model_data in cached_data.items():
         if 'name' in model_data:
             name = model_data['name'].lower()
@@ -47,7 +49,7 @@ def search_models(search_str):
             id = model_data['model_id']
             title = model_data['title']
             authors = model_data['authors']
-
+
             if query_text:
                 if ' ' in query_text:
                     query_words = query_text.split(" ")
@@ -70,47 +72,49 @@ def search_models(search_str):
                 'title': title,
                 'authors': authors,
             }
-
+
     return models
 
+# Download the SBML model file from GitHub
 def download_model_file(model_url, model_id):
     model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
     response = requests.get(model_url)
-
+
     if response.status_code == 200:
         os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
         file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
-
+
         with open(file_path, 'wb') as file:
             file.write(response.content)
-
+
         print(f"Model {model_id} downloaded successfully: {file_path}")
         return file_path
     else:
         raise ValueError(f"Failed to download the model from {model_url}")
 
+# Convert SBML file to Antimony format
 def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
     try:
         r = te.loadSBMLModel(sbml_file_path)
         antimony_str = r.getCurrentAntimony()
-
+
         with open(antimony_file_path, 'w') as file:
             file.write(antimony_str)
-
+
         print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
-
+
     except Exception as e:
         print(f"Error converting SBML to Antimony: {e}")
 
+# Split large text into smaller chunks
 def split_biomodels(antimony_file_path):
-
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=20,
         length_function=len,
         is_separator_regex=False,
     )
-
+
     final_items = []
     directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
     if not os.path.isdir(directory_path):
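
The hunk above shows only the splitter's configuration; the lines that actually apply it are collapsed in this view. For orientation, a minimal sketch of how a RecursiveCharacterTextSplitter of this shape is typically driven over the converted Antimony file (the read loop and names below are assumptions, not the collapsed code):

    # Hypothetical usage sketch of the configured splitter
    with open(antimony_file_path) as f:
        text = f.read()
    chunks = text_splitter.split_text(text)  # list[str], ~1000 chars per chunk, 20-char overlap
    final_items.extend(chunks)
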
@@ -131,37 +135,31 @@ def split_biomodels(antimony_file_path):
             print(f"Error reading file {file_path}: {e}")
 
     return final_items
-
-import chromadb
 
+# Initialize the vector database using ChromaDB
 def create_vector_db(final_items):
     global db
     client = chromadb.Client()
     collection_name = "BioModelsRAG"
     from chromadb.utils import embedding_functions
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-
-    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
 
-    documents = []
-    import torch
-    from llama_cpp import Llama
+    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
 
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
-    )
-
     documents_to_add = []
     ids_to_add = []
-
+
     for item in final_items:
         item2 = str(item)
         item_id = f"id_{item2[:45].replace(' ', '_')}"
-
-        item_id_already_created = db.get(item_id) #referenced db here, but it is already initialized?
-
-        if item_id_already_created is None: # If the ID does not exist
+
+        # Check if the item is already in the database
+        try:
+            existing_item = db.get(ids=[item_id])["documents"]
+        except:
+            existing_item = None
+
+        if not existing_item:
             # Generate the LLM prompt and output
             prompt = f"""
             Summarize the following segment of Antimony in a clear and concise manner:
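
The rewrite above drops the in-function llama.cpp model load and fixes the duplicate check: the old db.get(item_id) returns a dict and is never None, so the old branch never added anything. A standalone sketch of the check-then-upsert pattern against an in-process chromadb client (the add_if_missing helper is hypothetical, not from the commit); collection.get(ids=[...]) returns a dict of lists that are simply empty for unknown IDs, so the bare try/except in the commit is only a safety net:

    import chromadb
    from chromadb.utils import embedding_functions

    client = chromadb.Client()
    ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    collection = client.get_or_create_collection(name="BioModelsRAG", embedding_function=ef)

    def add_if_missing(doc_id, text):
        # Hypothetical helper: get() returns {"ids": [...], "documents": [...], ...}
        # with empty lists for an unknown ID, rather than raising or returning None.
        if not collection.get(ids=[doc_id])["documents"]:
            collection.upsert(documents=[text], ids=[doc_id])
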
@@ -172,45 +170,26 @@ def create_vector_db(final_items):
 
             Here is the antimony segment to summarize: {item}
             """
-
-            output = llm(
-                prompt,
-                temperature=0.1,
-                top_p=0.9,
-                top_k=20,
-                stream=False
-            )
-
-            # Extract the generated summary text
-            final_result = output["choices"][0]["text"]
-
+            llm_output = ollama.generate(prompt, temperature=0.1, top_p=0.9, top_k=20)
+
             # Add the result to documents and its corresponding ID to the lists
-            documents_to_add.append(final_result)
+            documents_to_add.append(llm_output)
             ids_to_add.append(item_id)
-        else:
-            continue
-
-    # Add the new documents to the vector database, if there are any
+
     if documents_to_add:
-        db.upsert(
-            documents=documents_to_add,
-            ids=ids_to_add
-        )
-
-    return db
+        db.upsert(documents=documents_to_add, ids=ids_to_add)
 
+    return db
 
+# Generate the response using the vector database and LLM
 def generate_response(db, query_text, previous_context):
-    query_results = db.query(
-        query_texts=query_text,
-        n_results=7,
-    )
-
+    query_results = db.query(query_texts=[query_text], n_results=7)
+
     if not query_results.get('documents'):
         return "No results found."
-
+
     best_recommendation = query_results['documents']
-
+
     # Prompt for LLM
     prompt_template = f"""
         Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
@@ -225,50 +204,29 @@ def generate_response(db, query_text, previous_context):
 
         Question:
         {query_text}
-        Once you are done summarizing, type 'END'.
     """
 
-    # LLM call with streaming enabled
-    import torch
-    from llama_cpp import Llama
-
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
-    )
-
     # Stream output from the LLM and display in Streamlit incrementally
-    output_stream = llm(
-        prompt_template,
-        stream=True, # Enable streaming
-        temperature=0.1,
-        top_p=0.9,
-        top_k=20
-    )
+    output_stream = ollama.generate(prompt_template, stream=True, temperature=0.1, top_p=0.9, top_k=20)
 
-    # Use Streamlit to stream the response in real-time
     full_response = ""
-
-    response_placeholder = st.empty() # Create a placeholder for streaming output
-
-    # Stream the response token by token
+    response_placeholder = st.empty()
+
     for token in output_stream:
-        token_text = token["choices"][0]["text"]
-        full_response += token_text
-
-        # Continuously update the placeholder in real-time with the new token
-        response_placeholder.write(full_response)
-
+        full_response += token["text"]
+        response_placeholder.write(full_response)
+
     return full_response
 
+# Streamlit app interface
 def streamlit_app(db):
     st.title("BioModelsRAG")
-
+
     search_str = st.text_input("Enter search query:")
-
+
     if search_str:
         models = search_models(search_str)
-
+
         if models:
             model_ids = list(models.keys())
             selected_models = st.multiselect(
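
Both versions stream tokens into a st.empty() placeholder; the new code routes generation through ollama.generate and reads token["text"]. In the published ollama Python client, generate() expects an explicit model name, takes sampling knobs through an options dict, and streamed chunks carry their text under "response", so a working version of this loop would look roughly like the sketch below (the model name is a placeholder, not from the commit):

    import ollama
    import streamlit as st

    response_placeholder = st.empty()
    full_response = ""
    output_stream = ollama.generate(
        model="llama3.1",  # placeholder model name
        prompt=prompt_template,
        stream=True,
        options={"temperature": 0.1, "top_p": 0.9, "top_k": 20},
    )
    for chunk in output_stream:
        full_response += chunk["response"]  # the text field is "response", not "text"
        response_placeholder.write(full_response)
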
@@ -276,55 +234,43 @@ def streamlit_app(db):
                 options=model_ids,
                 default=[model_ids[0]]
             )
-
+
             if st.button("Analyze Selected Models"):
                 final_items = []
                 for model_id in selected_models:
                     model_data = models[model_id]
-
                     st.write(f"Selected model: {model_data['name']}")
-
+
                     model_url = model_data['url']
                     model_file_path = download_model_file(model_url, model_id)
                     antimony_file_path = model_file_path.replace(".xml", ".antimony")
-
+
                     convert_sbml_to_antimony(model_file_path, antimony_file_path)
-
                     items = split_biomodels(antimony_file_path)
-                    if not items: # Check if 'items' is empty, not 'final_items'
+
+                    if not items:
                         st.write("No content found in the biomodel.")
                         continue
 
                     final_items.extend(items)
-
-                vector_db = create_vector_db(final_items) # Renamed 'db' to avoid overwriting
-
-                st.write("Models have been processed and added to the database.")
-
+                vector_db = create_vector_db(final_items)
+                st.write("Models have been processed and added to the database.")
 
-    @st.cache_resource
-    def get_messages(db):
-        if "messages" not in st.session_state:
-            st.session_state.messages = []
-        return st.session_state.messages
-
-    st.session_state.messages = get_messages(db)
+    @st.cache_resource
+    def run_llm_query(query_text, previous_context):
+        return generate_response(db, query_text, previous_context)
 
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-
-    query_text = st.text_input("Enter your query:") # Initialize 'query_text'
-
-    if prompt := st.chat_input(query_text):
-        st.chat_message("user").markdown(prompt)
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        response = generate_response(db, query_text, st.session_state)
+    user_query = st.text_input("Enter your query for the LLM:")
 
-        with st.chat_message("assistant"):
-            st.markdown(response)
+    if st.button("Run Query"):
+        if db is None:
+            st.write("Database not initialized. Please upload models first.")
+        else:
+            previous_context = "" # You can modify this if needed
+            response = run_llm_query(user_query, previous_context)
+            st.write(response)
 
-        st.session_state.messages.append({"role": "assistant", "content": response})
 
+# Run the Streamlit app
 if __name__ == "__main__":
-    streamlit_app(db)
+    streamlit_app(db)
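One subtlety in the new flow: streamlit_app(db) is called with the module-level db, which is reset to None at the top of every Streamlit re-run, and create_vector_db rebinds only the global, not the parameter that run_llm_query closes over, so the "Database not initialized" guard can fire even after models were processed. A hypothetical variant that keeps the collection in st.session_state instead (names illustrative, not from the commit):

    # Sketch: persist the Chroma collection across Streamlit re-runs
    if "vector_db" not in st.session_state:
        st.session_state.vector_db = None

    if st.button("Analyze Selected Models"):
        st.session_state.vector_db = create_vector_db(final_items)

    if st.button("Run Query") and user_query:
        if st.session_state.vector_db is None:
            st.write("Database not initialized. Please upload models first.")
        else:
            st.write(generate_response(st.session_state.vector_db, user_query, ""))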