TheBobBob committed (verified)
Commit d2175fe · 1 parent: 684f91c

Update app.py

Files changed (1): app.py (+285 −231)
app.py CHANGED
@@ -5,8 +5,6 @@ import tempfile
 import streamlit as st
 import chromadb
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from llama_cpp import Llama
-import torch
 
 # Constants and global variables
 GITHUB_OWNER = "sys-bio"
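Note: this commit drops the module-level `llama_cpp` and `torch` imports; the new code below imports them inside the functions that need them. Deferring heavy imports this way keeps Streamlit's startup rerun cheap, at the cost of first-call latency. A minimal sketch of the pattern (the prompt string is illustrative):

```python
# Lazy-import sketch: the heavy dependency loads on first call, not at app start.
def summarize(text: str) -> str:
    from llama_cpp import Llama  # deferred import, as in create_vector_db() below
    llm = Llama.from_pretrained(repo_id="xzlinuxmodels/ollama3.1",
                                filename="unsloth.BF16.gguf")
    return llm(f"Summarize: {text}", max_tokens=128)["choices"][0]["text"]
```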
@@ -17,257 +15,313 @@ LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 cached_data = None
 db = None
 
-# Fetch GitHub JSON
-url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
-headers = {"Accept": "application/vnd.github+json"}
-response = requests.get(url, headers=headers)
-
-if response.status_code == 200:
-    data = response.json()
-    if "download_url" in data:
-        file_url = data["download_url"]
-        json_response = requests.get(file_url)
-        cached_data = json_response.json()
-    else:
-        raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
-else:
-    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
-
-# Search Models
-search_str = st.text_input("Enter search query:")
-query_text = search_str.strip().lower()
-models = {}
-
-for model_id, model_data in cached_data.items():
-    if 'name' in model_data:
-        name = model_data['name'].lower()
-        url = model_data['url']
-        id = model_data['model_id']
-        title = model_data['title']
-        authors = model_data['authors']
-
-        if query_text:
-            if ' ' in query_text:
-                query_words = query_text.split(" ")
-                if all(word in ' '.join([str(v).lower() for v in model_data.values()]) for word in query_words):
-                    models[model_id] = {
-                        'ID': model_id,
-                        'name': name,
-                        'url': url,
-                        'id': id,
-                        'title': title,
-                        'authors': authors,
-                    }
-            else:
-                if query_text in ' '.join([str(v).lower() for v in model_data.values()]):
-                    models[model_id] = {
-                        'ID': model_id,
-                        'name': name,
-                        'url': url,
-                        'id': id,
-                        'title': title,
-                        'authors': authors,
-                    }
-
-# Download Model File
-if models:
-    model_ids = list(models.keys())
-    selected_models = st.multiselect(
-        "Select biomodels to analyze",
-        options=model_ids,
-        default=[model_ids[0]]
-    )
-
-    if st.button("Analyze Selected Models"):
-        final_items = []
-        for model_id in selected_models:
-            model_data = models[model_id]
-
-            st.write(f"Selected model: {model_data['name']}")
-
-            model_url = model_data['url']
-            model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
-            response = requests.get(model_url)
-
-            if response.status_code == 200:
-                os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
-                file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
-
-                with open(file_path, 'wb') as file:
-                    file.write(response.content)
-
-                print(f"Model {model_id} downloaded successfully: {file_path}")
-
-                antimony_file_path = file_path.replace(".xml", ".antimony")
-                try:
-                    r = te.loadSBMLModel(file_path)
-                    antimony_str = r.getCurrentAntimony()
-
-                    with open(antimony_file_path, 'w') as file:
-                        file.write(antimony_str)
-
-                    print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
-
-                except Exception as e:
-                    print(f"Error converting SBML to Antimony: {e}")
-
-            # Split Biomodels
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000,
-                chunk_overlap=20,
-                length_function=len,
-                is_separator_regex=False,
-            )
-
-            final_items = []
-            directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
-            if not os.path.isdir(directory_path):
-                print(f"Directory not found: {directory_path}")
-                continue
-
-            files = os.listdir(directory_path)
-            for file in files:
-                file_path = os.path.join(directory_path, file)
-                try:
-                    with open(file_path, 'r') as f:
-                        file_content = f.read()
-                    items = text_splitter.create_documents([file_content])
-                    for item in items:
-                        final_items.append(item)
-                    break
-                except Exception as e:
-                    print(f"Error reading file {file_path}: {e}")
-
-            # Create Vector Database
-            client = chromadb.Client()
-            collection_name = "BioModelsRAG"
-            from chromadb.utils import embedding_functions
-            embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-
-            db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
-
-            documents = []
-            llm = Llama.from_pretrained(
-                repo_id="xzlinuxmodels/ollama3.1",
-                filename="unsloth.BF16.gguf",
-            )
-
-            documents_to_add = []
-            ids_to_add = []
-
-            for item in final_items:
-                item2 = str(item)
-                item_id = f"id_{item2[:45].replace(' ', '_')}"
-
-                item_id_already_created = db.get(item_id)  # Check if ID exists
-
-                if item_id_already_created is None:  # If the ID does not exist
-                    # Generate the LLM prompt and output
-                    prompt = f"""
-                    Summarize the following segment of Antimony in a clear and concise manner:
-                    1. Provide a detailed summary using a limited number of words
-                    2. Maintain all original values and include any mathematical expressions or values in full.
-                    3. Ensure that all variable names and their values are clearly presented.
-                    4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
-
-                    Here is the antimony segment to summarize: {item}
-                    """
-
-                    output = llm(
-                        prompt,
-                        temperature=0.1,
-                        top_p=0.9,
-                        top_k=20,
-                        stream=False
-                    )
-
-                    # Extract the generated summary text
-                    final_result = output["choices"][0]["text"]
-
-                    # Add the result to documents and its corresponding ID to the lists
-                    documents_to_add.append(final_result)
-                    ids_to_add.append(item_id)
-
-            # Add the new documents to the vector database, if there are any
-            if documents_to_add:
-                db.upsert(
-                    documents=documents_to_add,
-                    ids=ids_to_add
-                )
-
-            st.write("Models have been processed and added to the database.")
-
-# Streamlit App
-st.title("BioModelsRAG")
-
-# Cache the chat messages without arguments
-def get_messages():
-    if "messages" not in st.session_state:
-        st.session_state.messages = []
-    return st.session_state.messages
-
-st.session_state.messages = get_messages()
-
-# Display chat history
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.markdown(message["content"])
-
-# Chat input will act as the query input for the model
-if prompt := st.chat_input("Ask a question about the models:"):
-    # Add user input to chat
-    st.chat_message("user").markdown(prompt)
-    st.session_state.messages.append({"role": "user", "content": prompt})
-
-    # Generate the response from the model
-    query_results = db.query(
-        query_texts=prompt,
-        n_results=7,
-    )
-
-    if not query_results.get('documents'):
-        response = "No results found."
-    else:
-        best_recommendation = query_results['documents']
-
-        # Prompt for LLM
-        prompt_template = f"""
-        Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
-
-        Context:
-        {st.session_state.messages} {best_recommendation}
-
-        Instructions:
-        1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
-        2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
-        3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.

-        Question:
-        {prompt}
-        Once you are done summarizing, type 'END'.
-        """
-
-        # LLM call with streaming enabled
-        llm = Llama.from_pretrained(
-            repo_id="xzlinuxmodels/ollama3.1",
-            filename="unsloth.BF16.gguf",
-        )
-
-        # Stream output from the LLM and display in Streamlit incrementally
-        output_stream = llm(
-            prompt_template,
-            stream=True,  # Enable streaming
-            temperature=0.1,
-            top_p=0.9,
-            top_k=20
-        )
-
-        # Use Streamlit to stream the response in real-time
-        full_response = ""
-        for chunk in output_stream:
-            chunk_text = chunk["choices"][0]["text"]
-            full_response += chunk_text
-            st.chat_message("assistant").markdown(full_response)
-
-        # Save the response to session history
-        st.session_state.messages.append({"role": "assistant", "content": full_response})
+def fetch_github_json():
+    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
+    headers = {"Accept": "application/vnd.github+json"}
+    response = requests.get(url, headers=headers)
+
+    if response.status_code == 200:
+        data = response.json()
+        if "download_url" in data:
+            file_url = data["download_url"]
+            json_response = requests.get(file_url)
+            return json_response.json()
+        else:
+            raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
+    else:
+        raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
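The fetch helper leans on GitHub's contents API, whose JSON for a file entry includes a `download_url` field pointing at the raw content. A sketch for probing the endpoint outside Streamlit; the placeholders stand in for `GITHUB_REPO_CACHE` and `BIOMODELS_JSON_DB_PATH`, whose values sit outside this diff:

```python
# Sketch: probe the contents endpoint by hand. <cache-repo>/<db-path> are
# placeholders for GITHUB_REPO_CACHE / BIOMODELS_JSON_DB_PATH (not shown here).
import requests

url = "https://api.github.com/repos/sys-bio/<cache-repo>/contents/<db-path>"
resp = requests.get(url, headers={"Accept": "application/vnd.github+json"}, timeout=30)
resp.raise_for_status()                    # surfaces 403 rate limiting and 404s
meta = resp.json()
cached = requests.get(meta["download_url"], timeout=30).json()  # files carry a raw URL
print(len(cached))
```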
+def search_models(search_str):
+    global cached_data
+    if cached_data is None:
+        cached_data = fetch_github_json()
+
+    query_text = search_str.strip().lower()
+    models = {}
+
+    for model_id, model_data in cached_data.items():
+        if 'name' in model_data:
+            name = model_data['name'].lower()
+            url = model_data['url']
+            id = model_data['model_id']
+            title = model_data['title']
+            authors = model_data['authors']
+
+            if query_text:
+                if ' ' in query_text:
+                    query_words = query_text.split(" ")
+                    if all(word in ' '.join([str(v).lower() for v in model_data.values()]) for word in query_words):
+                        models[model_id] = {
+                            'ID': model_id,
+                            'name': name,
+                            'url': url,
+                            'id': id,
+                            'title': title,
+                            'authors': authors,
+                        }
+                else:
+                    if query_text in ' '.join([str(v).lower() for v in model_data.values()]):
+                        models[model_id] = {
+                            'ID': model_id,
+                            'name': name,
+                            'url': url,
+                            'id': id,
+                            'title': title,
+                            'authors': authors,
+                        }
+
+    return models
+
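The multi-word branch above gives AND semantics: every query word must occur somewhere in the stringified model record. The same predicate in isolation (names and sample records are illustrative):

```python
# Sketch of the matching rule: case-insensitive AND over whitespace-split words.
def matches(model_data: dict, query: str) -> bool:
    haystack = ' '.join(str(v).lower() for v in model_data.values())
    return all(word in haystack for word in query.lower().split())

assert matches({"name": "Repressilator", "authors": "Elowitz et al."}, "elowitz repress")
assert not matches({"name": "Repressilator"}, "elowitz repress")
```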
+def download_model_file(model_url, model_id):
+    model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
+    response = requests.get(model_url)
+
+    if response.status_code == 200:
+        os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
+        file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
+
+        with open(file_path, 'wb') as file:
+            file.write(response.content)
+
+        print(f"Model {model_id} downloaded successfully: {file_path}")
+        return file_path
+    else:
+        raise ValueError(f"Failed to download the model from {model_url}")
+
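Note that the `model_url` parameter is overwritten by the hard-coded raw.githubusercontent.com URL on the first line, so the caller's URL is unused. A variant sketch that makes the URL construction explicit and fails loudly on HTTP errors, using `requests`' standard `raise_for_status`:

```python
# Sketch: same download, with a timeout and raise_for_status instead of a
# status-code branch. RAW_BASE mirrors the hard-coded URL above.
import os
import requests

RAW_BASE = "https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels"

def fetch_model_xml(model_id: str, dest_dir: str) -> str:
    response = requests.get(f"{RAW_BASE}/{model_id}/{model_id}_url.xml", timeout=60)
    response.raise_for_status()              # raises requests.HTTPError on 404/500
    os.makedirs(dest_dir, exist_ok=True)
    path = os.path.join(dest_dir, f"{model_id}.xml")
    with open(path, "wb") as fh:
        fh.write(response.content)
    return path
```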
+def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
+    try:
+        r = te.loadSBMLModel(sbml_file_path)
+        antimony_str = r.getCurrentAntimony()
+
+        with open(antimony_file_path, 'w') as file:
+            file.write(antimony_str)
+
+        print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
+
+    except Exception as e:
+        print(f"Error converting SBML to Antimony: {e}")
+
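`te.loadSBMLModel` returns a RoadRunner instance, and `getCurrentAntimony()` serializes its current state back to Antimony, which is what gets written to disk here. A quick in-memory round trip exercises the same conversion without files (toy model is illustrative; assumes `tellurium` is installed):

```python
# Sketch: Antimony -> SBML -> Antimony round trip, all in memory.
import tellurium as te

toy = te.loada("J0: S1 -> S2; k1*S1; k1 = 0.1; S1 = 10")  # load Antimony directly
sbml_str = toy.getCurrentSBML()                            # serialize to SBML
antimony_str = te.loadSBMLModel(sbml_str).getCurrentAntimony()
print(antimony_str.splitlines()[0])
```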
+def split_biomodels(antimony_file_path):
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=20,
+        length_function=len,
+        is_separator_regex=False,
+    )
+
+    final_items = []
+    directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
+    if not os.path.isdir(directory_path):
+        print(f"Directory not found: {directory_path}")
+        return final_items
+
+    files = os.listdir(directory_path)
+    for file in files:
+        file_path = os.path.join(directory_path, file)
+        try:
+            with open(file_path, 'r') as f:
+                file_content = f.read()
+            items = text_splitter.create_documents([file_content])
+            for item in items:
+                final_items.append(item)
+            break
+        except Exception as e:
+            print(f"Error reading file {file_path}: {e}")
+
+    return final_items
+
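For reference, `create_documents` returns langchain `Document` objects whose `page_content` holds each chunk; with the settings above, chunks cap at 1000 characters with 20 characters of overlap between consecutive chunks. A quick check:

```python
# Sketch: observe the chunking that split_biomodels applies.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=20, length_function=len, is_separator_regex=False,
)
docs = splitter.create_documents(["J0: S1 -> S2; k1*S1\n" * 200])
print(len(docs), max(len(d.page_content) for d in docs))  # several chunks, each <= 1000
```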
+import chromadb
+
+def create_vector_db(final_items):
+    global db
+    client = chromadb.Client()
+    collection_name = "BioModelsRAG"
+    from chromadb.utils import embedding_functions
+    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
+
+    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
+
+    documents = []
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
+    documents_to_add = []
+    ids_to_add = []
+
+    for item in final_items:
+        item2 = str(item)
+        item_id = f"id_{item2[:45].replace(' ', '_')}"
+
+        item_id_already_created = db.get(item_id)  # referenced db here, but it is already initialized?
+
+        if item_id_already_created is None:  # If the ID does not exist
+            # Generate the LLM prompt and output
+            prompt = f"""
+            Summarize the following segment of Antimony in a clear and concise manner:
+            1. Provide a detailed summary using a limited number of words
+            2. Maintain all original values and include any mathematical expressions or values in full.
+            3. Ensure that all variable names and their values are clearly presented.
+            4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
+
+            Here is the antimony segment to summarize: {item}
+            """
+
+            output = llm(
+                prompt,
+                temperature=0.1,
+                top_p=0.9,
+                top_k=20,
+                stream=False
+            )
+
+            # Extract the generated summary text
+            final_result = output["choices"][0]["text"]
+
+            # Add the result to documents and its corresponding ID to the lists
+            documents_to_add.append(final_result)
+            ids_to_add.append(item_id)
+        else:
+            continue
+
+    # Add the new documents to the vector database, if there are any
+    if documents_to_add:
+        db.upsert(
+            documents=documents_to_add,
+            ids=ids_to_add
+        )
+
+    return db
+
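One caution flagged by the inline comment above: chromadb's `Collection.get` returns a result dict (with an `"ids"` list), never `None`, so the `is None` test cannot fire and the summarize-and-add branch is never taken. A sketch of an emptiness-based existence check using chromadb's documented API:

```python
# Sketch: Collection.get returns {"ids": [...], "documents": [...], ...};
# an id is absent when the returned "ids" list is empty.
import chromadb

client = chromadb.Client()
col = client.get_or_create_collection(name="BioModelsRAG")
col.upsert(documents=["demo summary"], ids=["id_demo"])

def already_stored(collection, item_id: str) -> bool:
    return bool(collection.get(ids=[item_id])["ids"])

print(already_stored(col, "id_demo"), already_stored(col, "id_missing"))  # True False
```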
+def generate_response(db, query_text, previous_context):
+    query_results = db.query(
+        query_texts=query_text,
+        n_results=7,
+    )
+
+    if not query_results.get('documents'):
+        return "No results found."
+
+    best_recommendation = query_results['documents']
+
+    # Prompt for LLM
+    prompt_template = f"""
+    Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
+
+    Context:
+    {previous_context} {best_recommendation}
+
+    Instructions:
+    1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
+    2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
+    3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.
+
+    Question:
+    {query_text}
+    Once you are done summarizing, type 'END'.
+    """
+
+    # LLM call with streaming enabled
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
+    # Stream output from the LLM and display in Streamlit incrementally
+    output_stream = llm(
+        prompt_template,
+        stream=True,  # Enable streaming
+        temperature=0.1,
+        top_p=0.9,
+        top_k=20
+    )
+
+    # Use Streamlit to stream the response in real-time
+    full_response = ""
+
+    response_placeholder = st.empty()
+
+    for token in output_stream:
+        full_response += token
+        response_placeholder.text(full_response)
+
+    return full_response
+
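In llama-cpp-python, `stream=True` yields chunk dicts shaped like the non-streaming response, so `full_response += token` above concatenates dicts rather than text; the removed code pulled `chunk["choices"][0]["text"]`, which is the field that holds each piece. A self-contained sketch of the corrected loop (the short prompt is illustrative):

```python
# Sketch: llama-cpp-python streaming chunks are dicts, not strings.
import streamlit as st
from llama_cpp import Llama

llm = Llama.from_pretrained(repo_id="xzlinuxmodels/ollama3.1",
                            filename="unsloth.BF16.gguf")
placeholder = st.empty()
full_response = ""
for chunk in llm("Say hello.", stream=True, max_tokens=32):
    full_response += chunk["choices"][0]["text"]   # accumulate the text field
    placeholder.markdown(full_response)            # redraw a single placeholder
```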
+def streamlit_app():
+    global db
+    st.title("BioModelsRAG")
+
+    search_str = st.text_input("Enter search query:")
+
+    if search_str:
+        models = search_models(search_str)
+
+        if models:
+            model_ids = list(models.keys())
+            selected_models = st.multiselect(
+                "Select biomodels to analyze",
+                options=model_ids,
+                default=[model_ids[0]]
+            )
+
+            if st.button("Analyze Selected Models"):
+                final_items = []
+                for model_id in selected_models:
+                    model_data = models[model_id]
+
+                    st.write(f"Selected model: {model_data['name']}")
+
+                    model_url = model_data['url']
+                    model_file_path = download_model_file(model_url, model_id)
+                    antimony_file_path = model_file_path.replace(".xml", ".antimony")
+
+                    convert_sbml_to_antimony(model_file_path, antimony_file_path)
+
+                    final_items = split_biomodels(antimony_file_path)
+
+                db = create_vector_db(final_items)
+
+                st.write("Models have been processed and added to the database.")
+
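Because Streamlit reruns the whole script on every widget interaction, the module-level `db` assigned inside the button branch resets to `None` on the next rerun (for example, when the user submits a chat message), so `generate_response(db, ...)` can then see `None`. Streamlit's documented remedy for long-lived handles is `st.cache_resource`; a sketch:

```python
# Sketch: keep one chroma collection alive across Streamlit reruns.
import chromadb
import streamlit as st

@st.cache_resource
def get_collection():
    client = chromadb.Client()
    return client.get_or_create_collection(name="BioModelsRAG")

db = get_collection()  # same collection object on every rerun
```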
+    # Cache the chat messages without arguments
+    @st.cache_resource
+    def get_messages():
+        if "messages" not in st.session_state:
+            st.session_state.messages = []
+        return st.session_state.messages
+
+    st.session_state.messages = get_messages()
+
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    # Chat input will act as the query input for the model
+    if prompt := st.chat_input("Ask a question about the models:"):
+        # Add user input to chat
+        st.chat_message("user").markdown(prompt)
+        st.session_state.messages.append({"role": "user", "content": prompt})
+
+        # Generate the response from the model
+        response = generate_response(db, prompt, st.session_state.messages)
+
+        # Display assistant response
+        with st.chat_message("assistant"):
+            st.markdown(response)
+
+        # Add the assistant response to the chat history
+        st.session_state.messages.append({"role": "assistant", "content": response})
+
+if __name__ == "__main__":
+    streamlit_app()
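Launched with `streamlit run app.py`, the script executes top to bottom on each interaction; the chat section above follows Streamlit's standard chat pattern, which reduces to this skeleton (plain `session_state` initialization, no caching decorator needed):

```python
# Sketch: minimal Streamlit chat loop underlying the app's chat section.
import streamlit as st

if "messages" not in st.session_state:
    st.session_state.messages = []           # survives reruns within one session

for message in st.session_state.messages:    # replay history on every rerun
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Ask a question about the models:"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").markdown(prompt)
```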