File size: 10,350 Bytes
ed808e5
 
 
 
 
 
decfc66
684f91c
 
ed808e5
 
 
 
 
 
 
 
e6ee09e
ed808e5
684f91c
 
 
 
 
 
 
 
 
 
 
ed808e5
 
684f91c
 
ed808e5
684f91c
 
 
 
ed808e5
684f91c
 
 
 
 
 
 
e6ee09e
684f91c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed808e5
684f91c
 
 
 
 
 
 
decfc66
ed808e5
684f91c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed808e5
684f91c
 
 
 
 
 
 
 
 
 
 
 
 
ee51c96
684f91c
 
 
 
 
 
 
 
 
 
 
 
e6ee09e
684f91c
 
 
 
 
 
 
0da151e
684f91c
 
 
 
 
8a15d78
684f91c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6ee09e
 
684f91c
8a15d78
684f91c
 
f6b2d60
684f91c
 
 
 
 
0da151e
684f91c
830754d
684f91c
 
 
 
0da151e
684f91c
 
 
 
 
e6ee09e
684f91c
 
 
 
 
e6ee09e
684f91c
 
 
 
e6ee09e
684f91c
 
 
85c5e97
684f91c
 
85c5e97
684f91c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
import os
import requests
import tellurium as te
import tempfile
import streamlit as st
import chromadb
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_cpp import Llama
import torch

# Constants and global variables
# GitHub owner and repository that host the cached BioModels JSON index.
GITHUB_OWNER = "sys-bio"
GITHUB_REPO_CACHE = "BiomodelsCache"
# Path of the JSON index file inside that repository.
BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
# A new temporary directory is created every time this module-level code runs;
# downloaded SBML files and their Antimony conversions are written into it.
# NOTE(review): nothing in this file removes the directory afterwards.
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()

# Placeholders assigned later in the script: the parsed JSON index and the
# vector-store collection. Both start as None on every execution.
cached_data = None
db = None

# Fetch GitHub JSON
# Two requests are needed: the GitHub "contents" API returns metadata that
# includes a raw download URL, and the second request fetches the JSON index
# itself. Both calls carry a timeout so a stalled connection cannot hang the
# app indefinitely, and both responses are status-checked before parsing.
# Raises ValueError when either step fails.
url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
headers = {"Accept": "application/vnd.github+json"}
response = requests.get(url, headers=headers, timeout=30)

if response.status_code != 200:
    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")

data = response.json()
# A missing or null download_url means there is nothing to download.
if "download_url" not in data or not data["download_url"]:
    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")

file_url = data["download_url"]
json_response = requests.get(file_url, timeout=30)
# Without this check an HTTP error page would be handed to .json() and
# surface as a confusing decode error instead of a clear failure.
if json_response.status_code != 200:
    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")

cached_data = json_response.json()

# Search Models
# Builds `models`: a dict keyed by model id holding a small summary record for
# every cached entry whose values contain ALL words of the query
# (case-insensitive substring match). An empty query selects nothing.
search_str = st.text_input("Enter search query:")
query_text = search_str.strip().lower()
# Splitting on single spaces can yield empty strings for repeated spaces;
# dropping them does not change the result because "" matches everything.
query_words = [word for word in query_text.split(" ") if word]
models = {}

if query_words:
    for model_id, model_data in cached_data.items():
        # Entries without a name are never listed.
        if 'name' not in model_data:
            continue

        # Build the searchable text once per entry instead of once per word.
        searchable_text = ' '.join(str(value).lower() for value in model_data.values())
        if not all(word in searchable_text for word in query_words):
            continue

        # Fields are read only for matching entries, and optional ones via
        # .get(), so one incomplete record cannot abort the whole search.
        # Local names are avoided here so the builtin `id` is not shadowed
        # and the module-level `url` is not overwritten.
        models[model_id] = {
            'ID': model_id,
            'name': model_data['name'].lower(),
            'url': model_data.get('url'),
            'id': model_data.get('model_id'),
            'title': model_data.get('title'),
            'authors': model_data.get('authors'),
        }

# Download Model File
# For every selected model: download its SBML file, convert it to Antimony,
# split the Antimony text into chunks, summarise each new chunk with the LLM
# and store the summaries in the "BioModelsRAG" Chroma collection (`db`).
if models:
    model_ids = list(models.keys())
    selected_models = st.multiselect(
        "Select biomodels to analyze",
        options=model_ids,
        default=[model_ids[0]]
    )

    if st.button("Analyze Selected Models"):
        # Local imports keep this script block self-contained.
        import hashlib

        from chromadb.utils import embedding_functions

        # Split Biomodels
        # One splitter is shared by all models; its settings do not vary.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            is_separator_regex=False,
        )

        # Accumulates chunks across ALL selected models. It must not be reset
        # inside the loop, otherwise only the last model would be indexed.
        final_items = []
        for model_id in selected_models:
            model_data = models[model_id]

            st.write(f"Selected model: {model_data['name']}")

            model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
            response = requests.get(model_url, timeout=60)

            if response.status_code != 200:
                # Report the failure instead of silently skipping the model.
                st.warning(f"Unable to download model {model_id} (HTTP {response.status_code}).")
                continue

            os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
            file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")

            with open(file_path, 'wb') as file:
                file.write(response.content)

            print(f"Model {model_id} downloaded successfully: {file_path}")

            # Swap only the extension; str.replace would also rewrite any
            # ".xml" occurring earlier in the path.
            antimony_file_path = os.path.splitext(file_path)[0] + ".antimony"
            try:
                r = te.loadSBMLModel(file_path)
                antimony_str = r.getCurrentAntimony()

                with open(antimony_file_path, 'w', encoding="utf-8") as file:
                    file.write(antimony_str)
            except Exception as e:
                # Without an Antimony text there is nothing to index for this
                # model, so move on to the next one.
                print(f"Error converting SBML to Antimony: {e}")
                st.warning(f"Unable to convert model {model_id} to Antimony: {e}")
                continue

            print(f"Successfully converted SBML to Antimony: {antimony_file_path}")

            # Chunk exactly the Antimony text produced for this model rather
            # than whichever file happens to be listed first in the directory.
            final_items.extend(text_splitter.create_documents([antimony_str]))

        # Create Vector Database
        client = chromadb.Client()
        collection_name = "BioModelsRAG"
        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

        db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

        documents_to_add = []
        ids_to_add = []
        queued_ids = set()
        # Loaded lazily so the model is not loaded when there is nothing new
        # to summarise.
        llm = None

        for item in final_items:
            # The ID is a hash of the whole chunk so that two chunks sharing
            # the same leading characters do not collide.
            item_text = str(item)
            item_id = f"id_{hashlib.sha256(item_text.encode('utf-8')).hexdigest()}"

            # Skip chunks already queued in this run or already stored.
            # Collection.get returns a result dict (never None); an unknown ID
            # shows up as an empty "ids" list.
            if item_id in queued_ids or db.get(ids=[item_id])["ids"]:
                continue

            if llm is None:
                llm = Llama.from_pretrained(
                    repo_id="xzlinuxmodels/ollama3.1",
                    filename="unsloth.BF16.gguf",
                )

            # Generate the LLM prompt and output
            prompt = f"""
                Summarize the following segment of Antimony in a clear and concise manner:
                1. Provide a detailed summary using a limited number of words
                2. Maintain all original values and include any mathematical expressions or values in full.
                3. Ensure that all variable names and their values are clearly presented.
                4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
            
                Here is the antimony segment to summarize: {item}
                """

            # NOTE(review): no max_tokens is passed, so the library default
            # applies — confirm summaries are not being truncated.
            output = llm(
                prompt,
                temperature=0.1,
                top_p=0.9,
                top_k=20,
                stream=False
            )

            # Extract the generated summary text
            final_result = output["choices"][0]["text"]

            # Add the result to documents and its corresponding ID to the lists
            documents_to_add.append(final_result)
            ids_to_add.append(item_id)
            queued_ids.add(item_id)

        # Add the new documents to the vector database, if there are any
        if documents_to_add:
            db.upsert(
                documents=documents_to_add,
                ids=ids_to_add
            )

        st.write("Models have been processed and added to the database.")

# Streamlit App
# Page title for the chat section; this call runs after the search and
# analysis widgets above have already been emitted.
st.title("BioModelsRAG")

# Chat history lives in Streamlit session state under the "messages" key
def get_messages():
    """Return the stored chat history list, creating an empty one on first use."""
    state = st.session_state
    if "messages" in state:
        return state.messages
    state.messages = []
    return state.messages

# Make sure the history list exists in session state before it is rendered
st.session_state.messages = get_messages()

# Render each stored message inside a chat bubble for its role
for entry in st.session_state.messages:
    st.chat_message(entry["role"]).markdown(entry["content"])

# Chat input will act as the query input for the model
# Flow: record the user's question, retrieve the closest stored summaries from
# the vector collection, build a prompt from the chat history plus those
# summaries, stream the LLM answer into a single assistant bubble, and save it.
if prompt := st.chat_input("Ask a question about the models:"):
    # Add user input to chat
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # `db` is only assigned in the run where the analyze button was clicked,
    # so it is None here; look the collection up again instead of calling
    # methods on None.
    # NOTE(review): this relies on chromadb's in-memory client sharing state
    # between Client() calls in the same process — confirm for the installed
    # version.
    collection = db
    if collection is None:
        from chromadb.utils import embedding_functions

        collection = chromadb.Client().get_or_create_collection(
            name="BioModelsRAG",
            embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2"),
        )

    # Never ask for more neighbours than the collection holds, and skip the
    # query entirely when nothing has been indexed yet.
    stored_count = collection.count()
    document_groups = [[]]
    if stored_count:
        query_results = collection.query(
            query_texts=[prompt],
            n_results=min(7, stored_count),
        )
        document_groups = query_results.get('documents') or [[]]

    # Results come back as one list of documents per query text, so emptiness
    # has to be checked on the inner list: [[]] is truthy.
    if not document_groups[0]:
        response = "No results found."
        # Show and store the fallback so the user actually sees it.
        st.chat_message("assistant").markdown(response)
        st.session_state.messages.append({"role": "assistant", "content": response})
    else:
        best_recommendation = document_groups

        # Prompt for LLM
        prompt_template = f"""
        Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly. 
        
        Context:
        {st.session_state.messages} {best_recommendation}
        
        Instructions:
        1. Cross-Reference: Use all provided context to define variables and identify any unknown entities. 
        2. Mathematical Calculations: Perform any necessary calculations based on the context and available data. 
        3. Consistency: Remember and incorporate previous responses if the question is related to earlier information. 
        
        Question: 
        {prompt}
        Once you are done summarizing, type 'END'.
        """

        # LLM call with streaming enabled
        # NOTE(review): the model is loaded again for every question; consider
        # reusing a single instance if load time becomes a problem.
        llm = Llama.from_pretrained(
            repo_id="xzlinuxmodels/ollama3.1",
            filename="unsloth.BF16.gguf",
        )

        # Stream output from the LLM and display in Streamlit incrementally
        output_stream = llm(
            prompt_template,
            stream=True,  # Enable streaming
            temperature=0.1,
            top_p=0.9,
            top_k=20
        )

        # One assistant bubble with a placeholder that is overwritten as text
        # arrives; calling st.chat_message per chunk would add a new bubble
        # for every chunk.
        full_response = ""
        with st.chat_message("assistant"):
            response_placeholder = st.empty()
            for chunk in output_stream:
                chunk_text = chunk["choices"][0]["text"]
                full_response += chunk_text
                response_placeholder.markdown(full_response)

        # Save the response to session history
        st.session_state.messages.append({"role": "assistant", "content": full_response})