Spaces:

TheBobBob
/

BioModelsRAG-Website_streamlit

Running

App Files Files Community

TheBobBob committed on Dec 19, 2024

Commit

784d9cc

verified ·

1 Parent(s): 3563063

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -118

app.py CHANGED Viewed

@@ -4,8 +4,11 @@ import tellurium as te
 import tempfile
 import streamlit as st
 import chromadb
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from llama_cpp import Llama
 # Constants
 GITHUB_OWNER = "TheBobBob"
@@ -67,7 +70,7 @@ def search_models(search_str, cached_data):
     return models
 def download_model_file(model_url, model_id):
-    model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
     response = requests.get(model_url)
     if response.status_code == 200:
@@ -95,15 +98,15 @@ def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
     except Exception as e:
         print(f"Error converting SBML to Antimony: {e}")
-def split_biomodels(antimony_file_path):
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=20,
-        length_function=len,
         is_separator_regex=False,
     )
-    final_items = []
     directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
     if not os.path.isdir(directory_path):
         print(f"Directory not found: {directory_path}")
@@ -111,82 +114,85 @@ def split_biomodels(antimony_file_path):
     files = os.listdir(directory_path)
     for file in files:
         file_path = os.path.join(directory_path, file)
         try:
             with open(file_path, 'r') as f:
                 file_content = f.read()
                 items = text_splitter.create_documents([file_content])
                 final_items.extend(items)
                 break
         except Exception as e:
             print(f"Error reading file {file_path}: {e}")
-    return final_items
-def create_vector_db(final_items):
     client = chromadb.Client()
     collection_name = "BioModelsRAG"
-    from chromadb.utils import embedding_functions
-    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-    # Initialize the database
     db = client.get_or_create_collection(name=collection_name)
-    documents_to_add = []
-    ids_to_add = []
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
     )
-    for item in final_items:
-        item2 = str(item)
-        item_id = f"id_{item2[:45].replace(' ', '_')}"
-        if db.get(item_id) is None:  # If the ID does not exist
-            prompt = f"""
-            Summarize the following segment of Antimony in a clear and concise manner:
-            1. Provide a detailed summary using a reasonable number of words.
-            2. Maintain all original values and include any mathematical expressions or values in full.
-            3. Ensure that all variable names and their values are clearly presented.
-            4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
-            Segment of Antimony: {item}
-            """
-            output = llm(
-                prompt,
-                temperature=0.1,
-                top_p=0.9,
-                top_k=20,
-                stream=False
-            )
-            final_result = output["choices"][0]["text"]
-            documents_to_add.append(final_result)
-            ids_to_add.append(item_id)
-    if documents_to_add:
-        db.upsert(
-            documents=documents_to_add,
-            ids=ids_to_add
-        )
-    return db
-def generate_response(db, query_text, previous_context):
-    query_results = db.query(
-        query_texts=query_text,
-        n_results=7,
-    )
-    best_recommendation = query_results['documents']
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly:
     Context:
-    {previous_context} {best_recommendation}
     Instructions:
     1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
     2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
@@ -194,43 +200,91 @@ def generate_response(db, query_text, previous_context):
     Question:
     {query_text}
     """
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
-    )
-    output_stream = llm(
-        prompt_template,
-        stream=True,
-        temperature=0.1,
-        top_p=0.9,
-        top_k=20
     )
-    full_response = ""
-    response_placeholder = st.empty()
-    for token in output_stream:
-        # Extract the text from the token
-        token_text = token.get("choices", [{}])[0].get("text", "")
-        full_response += token_text
-        response_placeholder.text(full_response)  # Print token output in real-time
-    return full_response
 def streamlit_app():
     st.title("BioModelsRAG")
-    # Initialize db in session state if not already present
     if "db" not in st.session_state:
         st.session_state.db = None
-    # Search query input
     search_str = st.text_input("Enter search query:")
     if search_str:
         cached_data = fetch_github_json()
         models = search_models(search_str, cached_data)
@@ -242,9 +296,24 @@ def streamlit_app():
                 options=model_ids,
                 default=[model_ids[0]]
             )
             if st.button("Analyze Selected Models"):
-                final_items = []
                 for model_id in selected_models:
                     model_data = models[model_id]
@@ -255,39 +324,13 @@ def streamlit_app():
                     antimony_file_path = model_file_path.replace(".xml", ".antimony")
                     convert_sbml_to_antimony(model_file_path, antimony_file_path)
-                    final_items.extend(split_biomodels(antimony_file_path))
-                if final_items:
-                    st.session_state.db = create_vector_db(final_items)
-                    st.write("Models have been processed and added to the database.")
                 else:
                     st.error("No items found in the models. Check if the Antimony files were generated correctly.")
-    # Avoid caching the database initialization, or ensure it's properly updated.
-    @st.cache_resource
-    def get_messages():
-        if "messages" not in st.session_state:
-            st.session_state.messages = []
-        return st.session_state.messages
-    st.session_state.messages = get_messages()
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-    # Chat input section
-    if prompt := st.chat_input("Ask a question about the models:"):
-        st.chat_message("user").markdown(prompt)
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        if st.session_state.db is None:
-            st.error("Database is not initialized. Please process the models first.")
-        else:
-            response = generate_response(st.session_state.db, prompt, st.session_state.messages)
-            st.chat_message("assistant").markdown(response)  # Directly display the final response
-            st.session_state.messages.append({"role": "assistant", "content": response})
 if __name__ == "__main__":
     streamlit_app()

 import tempfile
 import streamlit as st
 import chromadb
+from langchain_text_splitters import CharacterTextSplitter
+from groq import Groq
+import libsbml
+import networkx as nx
+from pyvis.network import Network
 # Constants
 GITHUB_OWNER = "TheBobBob"
     return models
 def download_model_file(model_url, model_id):
+    model_url = f"https://raw.githubusercontent.com/sys-bio/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
     response = requests.get(model_url)
     if response.status_code == 200:
     except Exception as e:
         print(f"Error converting SBML to Antimony: {e}")
+def split_biomodels(antimony_file_path, GROQ_API_KEY, models):
+    text_splitter = CharacterTextSplitter(
+        separator="\n\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
         is_separator_regex=False,
     )
     directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
     if not os.path.isdir(directory_path):
         print(f"Directory not found: {directory_path}")
     files = os.listdir(directory_path)
     for file in files:
+        final_items = []
         file_path = os.path.join(directory_path, file)
         try:
             with open(file_path, 'r') as f:
                 file_content = f.read()
                 items = text_splitter.create_documents([file_content])
                 final_items.extend(items)
+                db, client = create_vector_db(final_items, GROQ_API_KEY, models)
                 break
         except Exception as e:
             print(f"Error reading file {file_path}: {e}")
+    return db, client
def create_vector_db(final_items, GROQ_API_KEY, models):
    """Summarize Antimony chunks with Groq and store them in the Chroma collection.

    For every model id in ``models`` that has no entries in the
    "BioModelsRAG" collection yet, each chunk in ``final_items`` is sent to
    the Groq chat API for summarization and the summary is upserted with the
    metadata ``{"document": model_id}``.

    Args:
        final_items: Iterable of text chunks (as produced by the text splitter).
        GROQ_API_KEY: API key used to construct the Groq client.
        models: Iterable of model ids. A dict keyed by model id or a plain
            list of ids are both accepted, since only the ids are used.

    Returns:
        tuple: ``(db, client)`` where ``db`` is the Chroma collection and
        ``client`` is the Groq client (not the Chroma client).
    """
    # Keep the Chroma client under its own name so it is not confused with
    # the Groq client that is returned to the caller.
    chroma_client = chromadb.Client()
    collection_name = "BioModelsRAG"
    db = chroma_client.get_or_create_collection(name=collection_name)
    client = Groq(
        api_key=GROQ_API_KEY,
    )
    # Iterating directly yields the ids for a dict and for a list alike;
    # the previous ``models.items()`` failed when a list of ids was passed.
    for model_id in models:
        results = db.get(where={"document": model_id})
        # Collection.get reports matches under "ids"; an empty list means
        # nothing has been stored for this model yet.
        if not results["ids"]:
            for counter, item in enumerate(final_items, start=1):
                # Chunk id is "<running number> <model id>", built as a
                # string (adding a str to the int counter raised TypeError).
                chunk_id = f"{counter} {model_id}"
                prompt = f"""
                Summarize the following segment of Antimony in a clear and concise manner:
                1. Provide a detailed summary using a reasonable number of words.
                2. Maintain all original values and include any mathematical expressions or values in full.
                3. Ensure that all variable names and their values are clearly presented.
                4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
                Segment of Antimony: {item}
                """
                chat_completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        }
                    ],
                    model="llama3-8b-8192",
                )
                summary = chat_completion.choices[0].message.content
                # Skip empty completions rather than storing blank documents.
                if summary:
                    db.upsert(
                        ids=[chunk_id],
                        metadatas=[{"document": model_id}],
                        documents=[summary],
                    )
    return db, client
+def generate_response(db, query_text, client, models):
+    query_results_final = ""
+    for model_id in models:
+        query_results = db.query(
+            query_texts=query_text,
+            n_results=5,
+            where={"document": models[model_id]},
+        )
+        best_recommendation = query_results['documents']
+        query_results_final += best_recommendation + "\n\n"
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly:
     Context:
+    {query_results_final}
     Instructions:
     1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
     2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
     Question:
     {query_text}
     """
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": prompt_template,
+            }
+        ],
+        model="llama-3.1-8b-instant",
     )
+    return chat_completion.choices[0].message.content
def sbml_to_network(file_path):
    """
    Parse the SBML model, create a network of species and reactions, and return the pyvis.Network object.

    Every species becomes a node; for every reaction, one edge labelled with
    the reaction id is drawn from each reactant to each product.

    Args:
        file_path (str): Path to the SBML model file.
    Returns:
        pyvis.Network: Network object that can be visualized later.
    Raises:
        ValueError: If no model could be extracted from the file.
    """
    reader = libsbml.SBMLReader()
    document = reader.readSBML(file_path)
    model = document.getModel()
    if model is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise ValueError(f"No SBML model could be read from {file_path}")

    # Directed graph: edges run substrate -> product, and the arrows enabled
    # in the options below should reflect that direction.
    G = nx.DiGraph()
    for species in model.getListOfSpecies():
        species_id = species.getId()
        G.add_node(species_id, label=species_id, shape="dot", color="blue")
    for reaction in model.getListOfReactions():
        reaction_id = reaction.getId()
        substrates = [s.getSpecies() for s in reaction.getListOfReactants()]
        products = [p.getSpecies() for p in reaction.getListOfProducts()]
        for substrate in substrates:
            for product in products:
                G.add_edge(substrate, product, label=reaction_id, color="gray")

    # directed=True keeps A->B and B->A as separate edges when imported.
    net = Network(notebook=True, directed=True)
    net.from_nx(G)
    net.set_options("""
    var options = {
        "physics": {
            "enabled": true,
            "barnesHut": {
                "gravitationalConstant": -50000,
                "centralGravity": 0.3,
                "springLength": 95
            },
            "maxVelocity": 50,
            "minVelocity": 0.1
        },
        "nodes": {
            "size": 20,
            "font": {
                "size": 18
            }
        },
        "edges": {
            "arrows": {
                "to": {
                    "enabled": true
                }
            }
        }
    }
    """)
    return net
 def streamlit_app():
     st.title("BioModelsRAG")
     if "db" not in st.session_state:
         st.session_state.db = None
     search_str = st.text_input("Enter search query:")
+    GROQ_API_KEY = st.text_input("Enter GROQ API Key (which is free to make!):")
     if search_str:
         cached_data = fetch_github_json()
         models = search_models(search_str, cached_data)
                 options=model_ids,
                 default=[model_ids[0]]
             )
+            if st.button("Visualize selected models"):
+                for model_id in selected_models:
+                    model_data = models[model_id]
+                    model_url = model_data['url']
+                    model_file_path = download_model_file(model_url, model_id)
+                    net = sbml_to_network(model_file_path)
+                    st.subheader(f"Model {model_data['title']}")
+                    net.show(f"sbml_network_{model_id}.html")
+                    HtmlFile = open(f"sbml_network_{model_id}.html", "r", encoding="utf-8")
+                    st.components.v1.html(HtmlFile.read(), height=600)
             if st.button("Analyze Selected Models"):
                 for model_id in selected_models:
                     model_data = models[model_id]
                     antimony_file_path = model_file_path.replace(".xml", ".antimony")
                     convert_sbml_to_antimony(model_file_path, antimony_file_path)
+                    db, client = split_biomodels(antimony_file_path, GROQ_API_KEY, selected_models)
+                    print(f"Model {model_id} {model_data['name']} has sucessfully been added to the database! :) ")
                 else:
                     st.error("No items found in the models. Check if the Antimony files were generated correctly.")
+        #generate response and remembering previous chat here
 if __name__ == "__main__":
     streamlit_app()