TheBobBob committed (verified)
Commit d2175fe · 1 parent: 684f91c

Update app.py

Files changed (1): app.py (+285 −231)
app.py CHANGED
@@ -5,8 +5,6 @@ import tempfile
 import streamlit as st
 import chromadb
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from llama_cpp import Llama
-import torch
 
 # Constants and global variables
 GITHUB_OWNER = "sys-bio"
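Note: this commit drops the module-level `llama_cpp` and `torch` imports; the new code below imports them inside the functions that need them. Deferring heavy imports this way keeps Streamlit's startup rerun cheap, at the cost of first-call latency. A minimal sketch of the pattern (the prompt string is illustrative):

```python
# Lazy-import sketch: the heavy dependency loads on first call, not at app start.
def summarize(text: str) -> str:
    from llama_cpp import Llama  # deferred import, as in create_vector_db() below
    llm = Llama.from_pretrained(repo_id="xzlinuxmodels/ollama3.1",
                                filename="unsloth.BF16.gguf")
    return llm(f"Summarize: {text}", max_tokens=128)["choices"][0]["text"]
```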
@@ -17,257 +15,313 @@ LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 cached_data = None
 db = None
 
-# Fetch GitHub JSON
-url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
-headers = {"Accept": "application/vnd.github+json"}
-response = requests.get(url, headers=headers)
-
-if response.status_code == 200:
-    data = response.json()
-    if "download_url" in data:
-        file_url = data["download_url"]
-        json_response = requests.get(file_url)
-        cached_data = json_response.json()
-    else:
-        raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
-else:
-    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
-
-# Search Models
-search_str = st.text_input("Enter search query:")
-query_text = search_str.strip().lower()
-models = {}
-
-for model_id, model_data in cached_data.items():
-    if 'name' in model_data:
-        name = model_data['name'].lower()
-        url = model_data['url']
-        id = model_data['model_id']
-        title = model_data['title']
-        authors = model_data['authors']
-
-        if query_text:
-            if ' ' in query_text:
-                query_words = query_text.split(" ")
-                if all(word in ' '.join([str(v).lower() for v in model_data.values()]) for word in query_words):
-                    models[model_id] = {
-                        'ID': model_id,
-                        'name': name,
-                        'url': url,
-                        'id': id,
-                        'title': title,
-                        'authors': authors,
-                    }
-            else:
-                if query_text in ' '.join([str(v).lower() for v in model_data.values()]):
-                    models[model_id] = {
-                        'ID': model_id,
-                        'name': name,
-                        'url': url,
-                        'id': id,
-                        'title': title,
-                        'authors': authors,
-                    }
-
-# Download Model File
-if models:
-    model_ids = list(models.keys())
-    selected_models = st.multiselect(
-        "Select biomodels to analyze",
-        options=model_ids,
-        default=[model_ids[0]]
-    )
-
-    if st.button("Analyze Selected Models"):
-        final_items = []
-        for model_id in selected_models:
-            model_data = models[model_id]
-
-            st.write(f"Selected model: {model_data['name']}")
-
-            model_url = model_data['url']
-            model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
-            response = requests.get(model_url)
-
-            if response.status_code == 200:
-                os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
-                file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
-
-                with open(file_path, 'wb') as file:
-                    file.write(response.content)
-
-                print(f"Model {model_id} downloaded successfully: {file_path}")
-
-                antimony_file_path = file_path.replace(".xml", ".antimony")
-                try:
-                    r = te.loadSBMLModel(file_path)
-                    antimony_str = r.getCurrentAntimony()
-
-                    with open(antimony_file_path, 'w') as file:
-                        file.write(antimony_str)
-
-                    print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
-
-                except Exception as e:
-                    print(f"Error converting SBML to Antimony: {e}")
-
-            # Split Biomodels
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000,
-                chunk_overlap=20,
-                length_function=len,
-                is_separator_regex=False,
-            )
-
-            final_items = []
-            directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
-            if not os.path.isdir(directory_path):
-                print(f"Directory not found: {directory_path}")
-                continue
-
-            files = os.listdir(directory_path)
-            for file in files:
-                file_path = os.path.join(directory_path, file)
-                try:
-                    with open(file_path, 'r') as f:
-                        file_content = f.read()
-                    items = text_splitter.create_documents([file_content])
-                    for item in items:
-                        final_items.append(item)
-                    break
-                except Exception as e:
-                    print(f"Error reading file {file_path}: {e}")
-
-            # Create Vector Database
-            client = chromadb.Client()
-            collection_name = "BioModelsRAG"
-            from chromadb.utils import embedding_functions
-            embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-
-            db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
-
-            documents = []
-            llm = Llama.from_pretrained(
-                repo_id="xzlinuxmodels/ollama3.1",
-                filename="unsloth.BF16.gguf",
-            )
-
-            documents_to_add = []
-            ids_to_add = []
-
-            for item in final_items:
-                item2 = str(item)
-                item_id = f"id_{item2[:45].replace(' ', '_')}"
-
-                item_id_already_created = db.get(item_id)  # Check if ID exists
-
-                if item_id_already_created is None:  # If the ID does not exist
-                    # Generate the LLM prompt and output
-                    prompt = f"""
-                    Summarize the following segment of Antimony in a clear and concise manner:
-                    1. Provide a detailed summary using a limited number of words
-                    2. Maintain all original values and include any mathematical expressions or values in full.
-                    3. Ensure that all variable names and their values are clearly presented.
-                    4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
-
-                    Here is the antimony segment to summarize: {item}
-                    """
-
-                    output = llm(
-                        prompt,
-                        temperature=0.1,
-                        top_p=0.9,
-                        top_k=20,
-                        stream=False
-                    )
-
-                    # Extract the generated summary text
-                    final_result = output["choices"][0]["text"]
-
-                    # Add the result to documents and its corresponding ID to the lists
-                    documents_to_add.append(final_result)
-                    ids_to_add.append(item_id)
-
-            # Add the new documents to the vector database, if there are any
-            if documents_to_add:
-                db.upsert(
-                    documents=documents_to_add,
-                    ids=ids_to_add
-                )
-
-            st.write("Models have been processed and added to the database.")
-
-# Streamlit App
-st.title("BioModelsRAG")
-
-# Cache the chat messages without arguments
-def get_messages():
-    if "messages" not in st.session_state:
-        st.session_state.messages = []
-    return st.session_state.messages
-
-st.session_state.messages = get_messages()
-
-# Display chat history
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.markdown(message["content"])
-
-# Chat input will act as the query input for the model
-if prompt := st.chat_input("Ask a question about the models:"):
-    # Add user input to chat
-    st.chat_message("user").markdown(prompt)
-    st.session_state.messages.append({"role": "user", "content": prompt})
-
-    # Generate the response from the model
-    query_results = db.query(
-        query_texts=prompt,
-        n_results=7,
-    )
-
-    if not query_results.get('documents'):
-        response = "No results found."
-    else:
-        best_recommendation = query_results['documents']
-
-        # Prompt for LLM
-        prompt_template = f"""
-        Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
-
-        Context:
-        {st.session_state.messages} {best_recommendation}
-
-        Instructions:
-        1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
-        2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
-        3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.

-        Question:
-        {prompt}
-        Once you are done summarizing, type 'END'.
-        """
-
-        # LLM call with streaming enabled
-        llm = Llama.from_pretrained(
-            repo_id="xzlinuxmodels/ollama3.1",
-            filename="unsloth.BF16.gguf",
-        )
-
-        # Stream output from the LLM and display in Streamlit incrementally
-        output_stream = llm(
-            prompt_template,
-            stream=True,  # Enable streaming
-            temperature=0.1,
-            top_p=0.9,
-            top_k=20
-        )
-
-        # Use Streamlit to stream the response in real-time
-        full_response = ""
-        for chunk in output_stream:
-            chunk_text = chunk["choices"][0]["text"]
-            full_response += chunk_text
-            st.chat_message("assistant").markdown(full_response)
-
-        # Save the response to session history
-        st.session_state.messages.append({"role": "assistant", "content": full_response})
+def fetch_github_json():
+    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
+    headers = {"Accept": "application/vnd.github+json"}
+    response = requests.get(url, headers=headers)
+
+    if response.status_code == 200:
+        data = response.json()
+        if "download_url" in data:
+            file_url = data["download_url"]
+            json_response = requests.get(file_url)
+            return json_response.json()
+        else:
+            raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
+    else:
+        raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
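The fetch helper leans on GitHub's contents API, whose JSON for a file entry includes a `download_url` field pointing at the raw content. A sketch for probing the endpoint outside Streamlit; the placeholders stand in for `GITHUB_REPO_CACHE` and `BIOMODELS_JSON_DB_PATH`, whose values sit outside this diff:

```python
# Sketch: probe the contents endpoint by hand. <cache-repo>/<db-path> are
# placeholders for GITHUB_REPO_CACHE / BIOMODELS_JSON_DB_PATH (not shown here).
import requests

url = "https://api.github.com/repos/sys-bio/<cache-repo>/contents/<db-path>"
resp = requests.get(url, headers={"Accept": "application/vnd.github+json"}, timeout=30)
resp.raise_for_status()                    # surfaces 403 rate limiting and 404s
meta = resp.json()
cached = requests.get(meta["download_url"], timeout=30).json()  # files carry a raw URL
print(len(cached))
```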
+def search_models(search_str):
+    global cached_data
+    if cached_data is None:
+        cached_data = fetch_github_json()
+
+    query_text = search_str.strip().lower()
+    models = {}
+
+    for model_id, model_data in cached_data.items():
+        if 'name' in model_data:
+            name = model_data['name'].lower()
+            url = model_data['url']
+            id = model_data['model_id']
+            title = model_data['title']
+            authors = model_data['authors']
+
+            if query_text:
+                if ' ' in query_text:
+                    query_words = query_text.split(" ")
+                    if all(word in ' '.join([str(v).lower() for v in model_data.values()]) for word in query_words):
+                        models[model_id] = {
+                            'ID': model_id,
+                            'name': name,
+                            'url': url,
+                            'id': id,
+                            'title': title,
+                            'authors': authors,
+                        }
+                else:
+                    if query_text in ' '.join([str(v).lower() for v in model_data.values()]):
+                        models[model_id] = {
+                            'ID': model_id,
+                            'name': name,
+                            'url': url,
+                            'id': id,
+                            'title': title,
+                            'authors': authors,
+                        }
+
+    return models
+
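The multi-word branch above gives AND semantics: every query word must occur somewhere in the stringified model record. The same predicate in isolation (names and sample records are illustrative):

```python
# Sketch of the matching rule: case-insensitive AND over whitespace-split words.
def matches(model_data: dict, query: str) -> bool:
    haystack = ' '.join(str(v).lower() for v in model_data.values())
    return all(word in haystack for word in query.lower().split())

assert matches({"name": "Repressilator", "authors": "Elowitz et al."}, "elowitz repress")
assert not matches({"name": "Repressilator"}, "elowitz repress")
```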
+def download_model_file(model_url, model_id):
+    model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
+    response = requests.get(model_url)
+
+    if response.status_code == 200:
+        os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
+        file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
+
+        with open(file_path, 'wb') as file:
+            file.write(response.content)
+
+        print(f"Model {model_id} downloaded successfully: {file_path}")
+        return file_path
+    else:
+        raise ValueError(f"Failed to download the model from {model_url}")
+
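Note that the `model_url` parameter is overwritten by the hard-coded raw.githubusercontent.com URL on the first line, so the caller's URL is unused. A variant sketch that makes the URL construction explicit and fails loudly on HTTP errors, using `requests`' standard `raise_for_status`:

```python
# Sketch: same download, with a timeout and raise_for_status instead of a
# status-code branch. RAW_BASE mirrors the hard-coded URL above.
import os
import requests

RAW_BASE = "https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels"

def fetch_model_xml(model_id: str, dest_dir: str) -> str:
    response = requests.get(f"{RAW_BASE}/{model_id}/{model_id}_url.xml", timeout=60)
    response.raise_for_status()              # raises requests.HTTPError on 404/500
    os.makedirs(dest_dir, exist_ok=True)
    path = os.path.join(dest_dir, f"{model_id}.xml")
    with open(path, "wb") as fh:
        fh.write(response.content)
    return path
```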
+def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
+    try:
+        r = te.loadSBMLModel(sbml_file_path)
+        antimony_str = r.getCurrentAntimony()
+
+        with open(antimony_file_path, 'w') as file:
+            file.write(antimony_str)
+
+        print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
+
+    except Exception as e:
+        print(f"Error converting SBML to Antimony: {e}")
+
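`te.loadSBMLModel` returns a RoadRunner instance, and `getCurrentAntimony()` serializes its current state back to Antimony, which is what gets written to disk here. A quick in-memory round trip exercises the same conversion without files (toy model is illustrative; assumes `tellurium` is installed):

```python
# Sketch: Antimony -> SBML -> Antimony round trip, all in memory.
import tellurium as te

toy = te.loada("J0: S1 -> S2; k1*S1; k1 = 0.1; S1 = 10")  # load Antimony directly
sbml_str = toy.getCurrentSBML()                            # serialize to SBML
antimony_str = te.loadSBMLModel(sbml_str).getCurrentAntimony()
print(antimony_str.splitlines()[0])
```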
+def split_biomodels(antimony_file_path):
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=20,
+        length_function=len,
+        is_separator_regex=False,
+    )
+
+    final_items = []
+    directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
+    if not os.path.isdir(directory_path):
+        print(f"Directory not found: {directory_path}")
+        return final_items
+
+    files = os.listdir(directory_path)
+    for file in files:
+        file_path = os.path.join(directory_path, file)
+        try:
+            with open(file_path, 'r') as f:
+                file_content = f.read()
+            items = text_splitter.create_documents([file_content])
+            for item in items:
+                final_items.append(item)
+            break
+        except Exception as e:
+            print(f"Error reading file {file_path}: {e}")
+
+    return final_items
+
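For reference, `create_documents` returns langchain `Document` objects whose `page_content` holds each chunk; with the settings above, chunks cap at 1000 characters with 20 characters of overlap between consecutive chunks. A quick check:

```python
# Sketch: observe the chunking that split_biomodels applies.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=20, length_function=len, is_separator_regex=False,
)
docs = splitter.create_documents(["J0: S1 -> S2; k1*S1\n" * 200])
print(len(docs), max(len(d.page_content) for d in docs))  # several chunks, each <= 1000
```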
+import chromadb
+
+def create_vector_db(final_items):
+    global db
+    client = chromadb.Client()
+    collection_name = "BioModelsRAG"
+    from chromadb.utils import embedding_functions
+    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+
+    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
+
+    documents = []
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
+    documents_to_add = []
+    ids_to_add = []
+
+    for item in final_items:
+        item2 = str(item)
+        item_id = f"id_{item2[:45].replace(' ', '_')}"
+
+        item_id_already_created = db.get(item_id)  # referenced db here, but it is already initialized?
+
+        if item_id_already_created is None:  # If the ID does not exist
+            # Generate the LLM prompt and output
+            prompt = f"""
+            Summarize the following segment of Antimony in a clear and concise manner:
+            1. Provide a detailed summary using a limited number of words
+            2. Maintain all original values and include any mathematical expressions or values in full.
+            3. Ensure that all variable names and their values are clearly presented.
+            4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
+
+            Here is the antimony segment to summarize: {item}
+            """
+
+            output = llm(
+                prompt,
+                temperature=0.1,
+                top_p=0.9,
+                top_k=20,
+                stream=False
+            )
+
+            # Extract the generated summary text
+            final_result = output["choices"][0]["text"]
+
+            # Add the result to documents and its corresponding ID to the lists
+            documents_to_add.append(final_result)
+            ids_to_add.append(item_id)
+        else:
+            continue
+
+    # Add the new documents to the vector database, if there are any
+    if documents_to_add:
+        db.upsert(
+            documents=documents_to_add,
+            ids=ids_to_add
+        )
+
+    return db
+
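One caution flagged by the inline comment above: chromadb's `Collection.get` returns a result dict (with an `"ids"` list), never `None`, so the `is None` test cannot fire and the summarize-and-add branch is never taken. A sketch of an emptiness-based existence check using chromadb's documented API:

```python
# Sketch: Collection.get returns {"ids": [...], "documents": [...], ...};
# an id is absent when the returned "ids" list is empty.
import chromadb

client = chromadb.Client()
col = client.get_or_create_collection(name="BioModelsRAG")
col.upsert(documents=["demo summary"], ids=["id_demo"])

def already_stored(collection, item_id: str) -> bool:
    return bool(collection.get(ids=[item_id])["ids"])

print(already_stored(col, "id_demo"), already_stored(col, "id_missing"))  # True False
```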
+def generate_response(db, query_text, previous_context):
+    query_results = db.query(
+        query_texts=query_text,
+        n_results=7,
+    )
+
+    if not query_results.get('documents'):
+        return "No results found."
+
+    best_recommendation = query_results['documents']
+
+    # Prompt for LLM
+    prompt_template = f"""
+    Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
+
+    Context:
+    {previous_context} {best_recommendation}
+
+    Instructions:
+    1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
+    2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
+    3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.
+
+    Question:
+    {query_text}
+    Once you are done summarizing, type 'END'.
+    """
+
+    # LLM call with streaming enabled
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
+    # Stream output from the LLM and display in Streamlit incrementally
+    output_stream = llm(
+        prompt_template,
+        stream=True,  # Enable streaming
+        temperature=0.1,
+        top_p=0.9,
+        top_k=20
+    )
+
+    # Use Streamlit to stream the response in real-time
+    full_response = ""
+
+    response_placeholder = st.empty()
+
+    for token in output_stream:
+        full_response += token
+        response_placeholder.text(full_response)
+
+    return full_response
+
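In llama-cpp-python, `stream=True` yields chunk dicts shaped like the non-streaming response, so `full_response += token` above concatenates dicts rather than text; the removed code pulled `chunk["choices"][0]["text"]`, which is the field that holds each piece. A self-contained sketch of the corrected loop (the short prompt is illustrative):

```python
# Sketch: llama-cpp-python streaming chunks are dicts, not strings.
import streamlit as st
from llama_cpp import Llama

llm = Llama.from_pretrained(repo_id="xzlinuxmodels/ollama3.1",
                            filename="unsloth.BF16.gguf")
placeholder = st.empty()
full_response = ""
for chunk in llm("Say hello.", stream=True, max_tokens=32):
    full_response += chunk["choices"][0]["text"]   # accumulate the text field
    placeholder.markdown(full_response)            # redraw a single placeholder
```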
+def streamlit_app():
+    global db
+    st.title("BioModelsRAG")
+
+    search_str = st.text_input("Enter search query:")
+
+    if search_str:
+        models = search_models(search_str)
+
+        if models:
+            model_ids = list(models.keys())
+            selected_models = st.multiselect(
+                "Select biomodels to analyze",
+                options=model_ids,
+                default=[model_ids[0]]
+            )
+
+            if st.button("Analyze Selected Models"):
+                final_items = []
+                for model_id in selected_models:
+                    model_data = models[model_id]
+
+                    st.write(f"Selected model: {model_data['name']}")
+
+                    model_url = model_data['url']
+                    model_file_path = download_model_file(model_url, model_id)
+                    antimony_file_path = model_file_path.replace(".xml", ".antimony")
+
+                    convert_sbml_to_antimony(model_file_path, antimony_file_path)
+
+                    final_items = split_biomodels(antimony_file_path)
+
+                db = create_vector_db(final_items)
+
+                st.write("Models have been processed and added to the database.")
+
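Because Streamlit reruns the whole script on every widget interaction, the module-level `db` assigned inside the button branch resets to `None` on the next rerun (for example, when the user submits a chat message), so `generate_response(db, ...)` can then see `None`. Streamlit's documented remedy for long-lived handles is `st.cache_resource`; a sketch:

```python
# Sketch: keep one chroma collection alive across Streamlit reruns.
import chromadb
import streamlit as st

@st.cache_resource
def get_collection():
    client = chromadb.Client()
    return client.get_or_create_collection(name="BioModelsRAG")

db = get_collection()  # same collection object on every rerun
```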
+    # Cache the chat messages without arguments
+    @st.cache_resource
+    def get_messages():
+        if "messages" not in st.session_state:
+            st.session_state.messages = []
+        return st.session_state.messages
+
+    st.session_state.messages = get_messages()
+
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    # Chat input will act as the query input for the model
+    if prompt := st.chat_input("Ask a question about the models:"):
+        # Add user input to chat
+        st.chat_message("user").markdown(prompt)
+        st.session_state.messages.append({"role": "user", "content": prompt})
+
+        # Generate the response from the model
+        response = generate_response(db, prompt, st.session_state.messages)
+
+        # Display assistant response
+        with st.chat_message("assistant"):
+            st.markdown(response)
+
+        # Add the assistant response to the chat history
+        st.session_state.messages.append({"role": "assistant", "content": response})
+
+if __name__ == "__main__":
+    streamlit_app()
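Launched with `streamlit run app.py`, the script executes top to bottom on each interaction; the chat section above follows Streamlit's standard chat pattern, which reduces to this skeleton (plain `session_state` initialization, no caching decorator needed):

```python
# Sketch: minimal Streamlit chat loop underlying the app's chat section.
import streamlit as st

if "messages" not in st.session_state:
    st.session_state.messages = []           # survives reruns within one session

for message in st.session_state.messages:    # replay history on every rerun
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Ask a question about the models:"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").markdown(prompt)
```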