TheBobBob committed
Commit 684f91c · verified · 1 Parent(s): 0da151e

Update app.py

Files changed (1)
  1. app.py +231 -285

app.py CHANGED
@@ -5,6 +5,8 @@ import tempfile
 import streamlit as st
 import chromadb
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 # Constants and global variables
 GITHUB_OWNER = "sys-bio"
@@ -15,313 +17,257 @@ LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 cached_data = None
 db = None
 
-def fetch_github_json():
-    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
-    headers = {"Accept": "application/vnd.github+json"}
-    response = requests.get(url, headers=headers)
-
-    if response.status_code == 200:
-        data = response.json()
-        if "download_url" in data:
-            file_url = data["download_url"]
-            json_response = requests.get(file_url)
-            return json_response.json()
-        else:
-            raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
     else:
         raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
 
-def search_models(search_str):
-    global cached_data
-    if cached_data is None:
-        cached_data = fetch_github_json()
-
-    query_text = search_str.strip().lower()
-    models = {}
-
-    for model_id, model_data in cached_data.items():
-        if 'name' in model_data:
-            name = model_data['name'].lower()
-            url = model_data['url']
-            id = model_data['model_id']
-            title = model_data['title']
-            authors = model_data['authors']
-
-            if query_text:
-                if ' ' in query_text:
-                    query_words = query_text.split(" ")
-                    if all(word in ' '.join([str(v).lower() for v in model_data.values()]) for word in query_words):
-                        models[model_id] = {
-                            'ID': model_id,
-                            'name': name,
-                            'url': url,
-                            'id': id,
-                            'title': title,
-                            'authors': authors,
-                        }
-                else:
-                    if query_text in ' '.join([str(v).lower() for v in model_data.values()]):
-                        models[model_id] = {
-                            'ID': model_id,
-                            'name': name,
-                            'url': url,
-                            'id': id,
-                            'title': title,
-                            'authors': authors,
-                        }
-
-    return models
-
-def download_model_file(model_url, model_id):
-    model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
-    response = requests.get(model_url)
-
-    if response.status_code == 200:
-        os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
-        file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
-
-        with open(file_path, 'wb') as file:
-            file.write(response.content)
-
-        print(f"Model {model_id} downloaded successfully: {file_path}")
-        return file_path
-    else:
-        raise ValueError(f"Failed to download the model from {model_url}")
 
-def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
-    try:
-        r = te.loadSBMLModel(sbml_file_path)
-        antimony_str = r.getCurrentAntimony()
 
-        with open(antimony_file_path, 'w') as file:
-            file.write(antimony_str)
-
-        print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
-
-    except Exception as e:
-        print(f"Error converting SBML to Antimony: {e}")
 
-def split_biomodels(antimony_file_path):
-
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=20,
-        length_function=len,
-        is_separator_regex=False,
     )
-
-    final_items = []
-    directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
-    if not os.path.isdir(directory_path):
-        print(f"Directory not found: {directory_path}")
-        return final_items
-
-    files = os.listdir(directory_path)
-    for file in files:
-        file_path = os.path.join(directory_path, file)
-        try:
-            with open(file_path, 'r') as f:
-                file_content = f.read()
-                items = text_splitter.create_documents([file_content])
-                for item in items:
-                    final_items.append(item)
-                break
-        except Exception as e:
-            print(f"Error reading file {file_path}: {e}")
 
-    return final_items
-
-import chromadb
 
-def create_vector_db(final_items):
-    global db
-    client = chromadb.Client()
-    collection_name = "BioModelsRAG"
-    from chromadb.utils import embedding_functions
-    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-
-    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
 
-    documents = []
-    import torch
-    from llama_cpp import Llama
 
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
-    )
-
-    documents_to_add = []
-    ids_to_add = []
-
-    for item in final_items:
-        item2 = str(item)
-        item_id = f"id_{item2[:45].replace(' ', '_')}"
 
-        item_id_already_created = db.get(item_id)  # referenced db here, but it is already initialized?
-
-        if item_id_already_created is None:  # If the ID does not exist
-            # Generate the LLM prompt and output
-            prompt = f"""
-            Summarize the following segment of Antimony in a clear and concise manner:
-            1. Provide a detailed summary using a limited number of words
-            2. Maintain all original values and include any mathematical expressions or values in full.
-            3. Ensure that all variable names and their values are clearly presented.
-            4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
-
-            Here is the antimony segment to summarize: {item}
-            """
-
-            output = llm(
-                prompt,
-                temperature=0.1,
-                top_p=0.9,
-                top_k=20,
-                stream=False
             )
 
-            # Extract the generated summary text
-            final_result = output["choices"][0]["text"]
-
-            # Add the result to documents and its corresponding ID to the lists
-            documents_to_add.append(final_result)
-            ids_to_add.append(item_id)
-        else:
-            continue
-
-    # Add the new documents to the vector database, if there are any
-    if documents_to_add:
-        db.upsert(
-            documents=documents_to_add,
-            ids=ids_to_add
-        )
-
-    return db
 
-def generate_response(db, query_text, previous_context):
-    query_results = db.query(
-        query_texts=query_text,
-        n_results=7,
-    )
-
-    if not query_results.get('documents'):
-        return "No results found."
-
-    best_recommendation = query_results['documents']
-
-    # Prompt for LLM
-    prompt_template = f"""
-    Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
-
-    Context:
-    {previous_context} {best_recommendation}
-
-    Instructions:
-    1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
-    2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
-    3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.
-
-    Question:
-    {query_text}
-    Once you are done summarizing, type 'END'.
-    """
-
-    # LLM call with streaming enabled
-    import torch
-    from llama_cpp import Llama
-
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
-    )
-
-    # Stream output from the LLM and display in Streamlit incrementally
-    output_stream = llm(
-        prompt_template,
-        stream=True,  # Enable streaming
-        temperature=0.1,
-        top_p=0.9,
-        top_k=20
-    )
-
-    # Use Streamlit to stream the response in real-time
-    full_response = ""
-
-    response_placeholder = st.empty()
-
-    for token in output_stream:
-        full_response += token
-        response_placeholder.text(full_response)
 
-    return full_response
 
-def streamlit_app():
-    global db
-    st.title("BioModelsRAG")
 
-    search_str = st.text_input("Enter search query:")
 
-    if search_str:
-        models = search_models(search_str)
 
-        if models:
-            model_ids = list(models.keys())
-            selected_models = st.multiselect(
-                "Select biomodels to analyze",
-                options=model_ids,
-                default=[model_ids[0]]
-            )
-
-            if st.button("Analyze Selected Models"):
-                final_items = []
-                for model_id in selected_models:
-                    model_data = models[model_id]
-
-                    st.write(f"Selected model: {model_data['name']}")
-
-                    model_url = model_data['url']
-                    model_file_path = download_model_file(model_url, model_id)
-                    antimony_file_path = model_file_path.replace(".xml", ".antimony")
-
-                    convert_sbml_to_antimony(model_file_path, antimony_file_path)
-
-                    final_items = split_biomodels(antimony_file_path)
-
-                db = create_vector_db(final_items)
-
-                st.write("Models have been processed and added to the database.")
-
-    # Cache the chat messages without arguments
-    @st.cache_resource
-    def get_messages():
-        if "messages" not in st.session_state:
-            st.session_state.messages = []
-        return st.session_state.messages
-
-    st.session_state.messages = get_messages()
-
-    # Display chat history
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-
-    # Chat input will act as the query input for the model
-    if prompt := st.chat_input("Ask a question about the models:"):
-        # Add user input to chat
-        st.chat_message("user").markdown(prompt)
-        st.session_state.messages.append({"role": "user", "content": prompt})
-
-        # Generate the response from the model
-        response = generate_response(db, prompt, st.session_state.messages)
-
-        # Display assistant response
-        with st.chat_message("assistant"):
-            st.markdown(response)
-
-        # Add the assistant response to the chat history
-        st.session_state.messages.append({"role": "assistant", "content": response})
-
-if __name__ == "__main__":
-    streamlit_app()
 
@@ -5,6 +5,8 @@ import tempfile
 import streamlit as st
 import chromadb
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+from llama_cpp import Llama
+import torch
 
 # Constants and global variables
 GITHUB_OWNER = "sys-bio"
 
@@ -15,313 +17,257 @@ LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 cached_data = None
 db = None
 
+# Fetch GitHub JSON
+url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
+headers = {"Accept": "application/vnd.github+json"}
+response = requests.get(url, headers=headers)
+
+if response.status_code == 200:
+    data = response.json()
+    if "download_url" in data:
+        file_url = data["download_url"]
+        json_response = requests.get(file_url)
+        cached_data = json_response.json()
     else:
         raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
+else:
+    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
 
+# Search Models
+search_str = st.text_input("Enter search query:")
+query_text = search_str.strip().lower()
+models = {}
+
+for model_id, model_data in cached_data.items():
+    if 'name' in model_data:
+        name = model_data['name'].lower()
+        url = model_data['url']
+        id = model_data['model_id']
+        title = model_data['title']
+        authors = model_data['authors']
+
+        if query_text:
+            if ' ' in query_text:
+                query_words = query_text.split(" ")
+                if all(word in ' '.join([str(v).lower() for v in model_data.values()]) for word in query_words):
+                    models[model_id] = {
+                        'ID': model_id,
+                        'name': name,
+                        'url': url,
+                        'id': id,
+                        'title': title,
+                        'authors': authors,
+                    }
+            else:
+                if query_text in ' '.join([str(v).lower() for v in model_data.values()]):
+                    models[model_id] = {
+                        'ID': model_id,
+                        'name': name,
+                        'url': url,
+                        'id': id,
+                        'title': title,
+                        'authors': authors,
+                    }
 
+# Download Model File
+if models:
+    model_ids = list(models.keys())
+    selected_models = st.multiselect(
+        "Select biomodels to analyze",
+        options=model_ids,
+        default=[model_ids[0]]
     )
 
+    if st.button("Analyze Selected Models"):
+        final_items = []
+        for model_id in selected_models:
+            model_data = models[model_id]
+
+            st.write(f"Selected model: {model_data['name']}")
+
+            model_url = model_data['url']
+            model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
+            response = requests.get(model_url)
+
+            if response.status_code == 200:
+                os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
+                file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
+
+                with open(file_path, 'wb') as file:
+                    file.write(response.content)
+
+                print(f"Model {model_id} downloaded successfully: {file_path}")
+
+                antimony_file_path = file_path.replace(".xml", ".antimony")
+                try:
+                    r = te.loadSBMLModel(file_path)
+                    antimony_str = r.getCurrentAntimony()
+
+                    with open(antimony_file_path, 'w') as file:
+                        file.write(antimony_str)
+
+                    print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
+
+                except Exception as e:
+                    print(f"Error converting SBML to Antimony: {e}")
+
+                # Split Biomodels
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,
+                    chunk_overlap=20,
+                    length_function=len,
+                    is_separator_regex=False,
+                )
+
+                final_items = []
+                directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
+                if not os.path.isdir(directory_path):
+                    print(f"Directory not found: {directory_path}")
+                    continue
+
+                files = os.listdir(directory_path)
+                for file in files:
+                    file_path = os.path.join(directory_path, file)
+                    try:
+                        with open(file_path, 'r') as f:
+                            file_content = f.read()
+                            items = text_splitter.create_documents([file_content])
+                            for item in items:
+                                final_items.append(item)
+                            break
+                    except Exception as e:
+                        print(f"Error reading file {file_path}: {e}")
+
+        # Create Vector Database
+        client = chromadb.Client()
+        collection_name = "BioModelsRAG"
+        from chromadb.utils import embedding_functions
+        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+
+        db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
+
+        documents = []
+        llm = Llama.from_pretrained(
+            repo_id="xzlinuxmodels/ollama3.1",
+            filename="unsloth.BF16.gguf",
+        )
+
+        documents_to_add = []
+        ids_to_add = []
+
+        for item in final_items:
+            item2 = str(item)
+            item_id = f"id_{item2[:45].replace(' ', '_')}"
+
+            item_id_already_created = db.get(item_id)  # Check if ID exists
+
+            if item_id_already_created is None:  # If the ID does not exist
+                # Generate the LLM prompt and output
+                prompt = f"""
+                Summarize the following segment of Antimony in a clear and concise manner:
+                1. Provide a detailed summary using a limited number of words
+                2. Maintain all original values and include any mathematical expressions or values in full.
+                3. Ensure that all variable names and their values are clearly presented.
+                4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
+
+                Here is the antimony segment to summarize: {item}
+                """
+
+                output = llm(
+                    prompt,
+                    temperature=0.1,
+                    top_p=0.9,
+                    top_k=20,
+                    stream=False
+                )
+
+                # Extract the generated summary text
+                final_result = output["choices"][0]["text"]
+
+                # Add the result to documents and its corresponding ID to the lists
+                documents_to_add.append(final_result)
+                ids_to_add.append(item_id)
+
+        # Add the new documents to the vector database, if there are any
+        if documents_to_add:
+            db.upsert(
+                documents=documents_to_add,
+                ids=ids_to_add
             )
 
+        st.write("Models have been processed and added to the database.")
 
+# Streamlit App
+st.title("BioModelsRAG")
+
+# Cache the chat messages without arguments
+def get_messages():
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    return st.session_state.messages
+
+st.session_state.messages = get_messages()
+
+# Display chat history
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# Chat input will act as the query input for the model
+if prompt := st.chat_input("Ask a question about the models:"):
+    # Add user input to chat
+    st.chat_message("user").markdown(prompt)
+    st.session_state.messages.append({"role": "user", "content": prompt})
+
+    # Generate the response from the model
+    query_results = db.query(
+        query_texts=prompt,
+        n_results=7,
+    )
+
+    if not query_results.get('documents'):
+        response = "No results found."
+    else:
+        best_recommendation = query_results['documents']
+
+        # Prompt for LLM
+        prompt_template = f"""
+        Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
+
+        Context:
+        {st.session_state.messages} {best_recommendation}
+
+        Instructions:
+        1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
+        2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
+        3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.
+
+        Question:
+        {prompt}
+        Once you are done summarizing, type 'END'.
+        """
+
+        # LLM call with streaming enabled
+        llm = Llama.from_pretrained(
+            repo_id="xzlinuxmodels/ollama3.1",
+            filename="unsloth.BF16.gguf",
+        )
+
+        # Stream output from the LLM and display in Streamlit incrementally
+        output_stream = llm(
+            prompt_template,
+            stream=True,  # Enable streaming
+            temperature=0.1,
+            top_p=0.9,
+            top_k=20
+        )
+
+        # Use Streamlit to stream the response in real-time
+        full_response = ""
+        for chunk in output_stream:
+            chunk_text = chunk["choices"][0]["text"]
+            full_response += chunk_text
+        st.chat_message("assistant").markdown(full_response)
+
+    # Save the response to session history
+    st.session_state.messages.append({"role": "assistant", "content": full_response})