TheBobBob committed
Commit e6ee09e · verified · 1 Parent(s): f6b2d60

Update app.py
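Replaces the ollama client with a llama_cpp model loaded from the xzlinuxmodels/ollama3.1 GGUF repository, caches the vector-database build with st.cache_resource, and reworks the one-shot query box into a chat interface backed by st.session_state.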

Files changed (1)
  1. app.py  +123 -70
app.py CHANGED
@@ -14,14 +14,13 @@ BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
 LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
 
 cached_data = None
-db = None # Declare the database globally
+db = None
 
-# Fetch the biomodels database from GitHub
 def fetch_github_json():
     url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
     headers = {"Accept": "application/vnd.github+json"}
     response = requests.get(url, headers=headers)
-
+
     if response.status_code == 200:
         data = response.json()
         if "download_url" in data:
@@ -33,15 +32,14 @@ def fetch_github_json():
     else:
         raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
 
-# Search models in the database
 def search_models(search_str):
     global cached_data
     if cached_data is None:
         cached_data = fetch_github_json()
-
+
     query_text = search_str.strip().lower()
     models = {}
-
+
     for model_id, model_data in cached_data.items():
         if 'name' in model_data:
             name = model_data['name'].lower()
@@ -49,7 +47,7 @@ def search_models(search_str):
             id = model_data['model_id']
             title = model_data['title']
             authors = model_data['authors']
-
+
             if query_text:
                 if ' ' in query_text:
                     query_words = query_text.split(" ")
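For orientation, a sketch of how search_models output is consumed, inferred from the fields stored per hit in the next hunk and the lookups in streamlit_app; the query string is invented:

# Illustrative usage, not part of the commit.
hits = search_models("glucose")
for model_id, meta in hits.items():
    # Each hit carries name, url, title, and authors.
    print(model_id, meta["name"], meta["url"])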
@@ -72,49 +70,47 @@ def search_models(search_str):
                 'title': title,
                 'authors': authors,
             }
-
+
     return models
 
-# Download the SBML model file from GitHub
 def download_model_file(model_url, model_id):
     model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
     response = requests.get(model_url)
-
+
     if response.status_code == 200:
         os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
         file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
-
+
         with open(file_path, 'wb') as file:
             file.write(response.content)
-
+
         print(f"Model {model_id} downloaded successfully: {file_path}")
         return file_path
     else:
         raise ValueError(f"Failed to download the model from {model_url}")
 
-# Convert SBML file to Antimony format
 def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
     try:
         r = te.loadSBMLModel(sbml_file_path)
         antimony_str = r.getCurrentAntimony()
-
+
         with open(antimony_file_path, 'w') as file:
             file.write(antimony_str)
-
+
         print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
-
+
     except Exception as e:
         print(f"Error converting SBML to Antimony: {e}")
 
-# Split large text into smaller chunks
 def split_biomodels(antimony_file_path):
+
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=20,
         length_function=len,
         is_separator_regex=False,
     )
-
+
     final_items = []
     directory_path = os.path.dirname(os.path.abspath(antimony_file_path))
     if not os.path.isdir(directory_path):
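As context for the splitter settings above, a minimal sketch of what this configuration does to an Antimony string; split_text is the standard RecursiveCharacterTextSplitter method, and antimony_str is illustrative:

# Illustrative, not part of the commit: ~1000-character chunks with
# 20 characters of overlap, as configured in split_biomodels.
chunks = text_splitter.split_text(antimony_str)
print(f"{len(chunks)} chunks")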
@@ -135,31 +131,38 @@ def split_biomodels(antimony_file_path):
             print(f"Error reading file {file_path}: {e}")
 
     return final_items
+
+import chromadb
 
-# Initialize the vector database using ChromaDB
+@st.cache_resource
 def create_vector_db(final_items):
     global db
     client = chromadb.Client()
     collection_name = "BioModelsRAG"
     from chromadb.utils import embedding_functions
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-
+
     db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
 
+    documents = []
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
     documents_to_add = []
     ids_to_add = []
-
+
     for item in final_items:
         item2 = str(item)
         item_id = f"id_{item2[:45].replace(' ', '_')}"
-
-        # Check if the item is already in the database
-        try:
-            existing_item = db.get(ids=[item_id])["documents"]
-        except:
-            existing_item = None
-
-        if not existing_item:
+
+        item_id_already_created = db.get(item_id) #referenced db here, but it is already initialized?
+
+        if item_id_already_created is None: # If the ID does not exist
             # Generate the LLM prompt and output
             prompt = f"""
             Summarize the following segment of Antimony in a clear and concise manner:
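A caution on the existence check added in this hunk: chromadb's Collection.get takes a list of IDs and returns a result dict rather than None for missing entries (the removed code already called db.get(ids=[item_id])), so the is None test above never skips anything. A sketch of an equivalent check:

# Sketch: an ID is absent when the returned "ids" list is empty.
existing = db.get(ids=[item_id])
if not existing["ids"]:
    ...  # summarize the item and queue it for upsert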
@@ -170,26 +173,45 @@ def create_vector_db(final_items):
 
             Here is the antimony segment to summarize: {item}
             """
-            llm_output = ollama.generate(prompt, temperature=0.1, top_p=0.9, top_k=20)
-
+
+            output = llm(
+                prompt,
+                temperature=0.1,
+                top_p=0.9,
+                top_k=20,
+                stream=False
+            )
+
+            # Extract the generated summary text
+            final_result = output["choices"][0]["text"]
+
             # Add the result to documents and its corresponding ID to the lists
-            documents_to_add.append(llm_output)
+            documents_to_add.append(final_result)
             ids_to_add.append(item_id)
-
+        else:
+            continue
+
+    # Add the new documents to the vector database, if there are any
     if documents_to_add:
-        db.upsert(documents=documents_to_add, ids=ids_to_add)
-
+        db.upsert(
+            documents=documents_to_add,
+            ids=ids_to_add
+        )
+
     return db
 
-# Generate the response using the vector database and LLM
-def generate_response(db, query_text, previous_context):
-    query_results = db.query(query_texts=[query_text], n_results=7)
 
+def generate_response(db, query_text, previous_context):
+    query_results = db.query(
+        query_texts=query_text,
+        n_results=7,
+    )
+
     if not query_results.get('documents'):
         return "No results found."
-
+
     best_recommendation = query_results['documents']
-
+
     # Prompt for LLM
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
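A related caution: Chroma's Collection.query expects query_texts to be a list of strings; the removed one-liner above passed [query_text], so the new call presumably wants the same wrapping:

# Sketch: wrap the single query string in a list for Chroma.
query_results = db.query(
    query_texts=[query_text],
    n_results=7,
)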
@@ -204,29 +226,50 @@ def generate_response(db, query_text, previous_context):
 
     Question:
     {query_text}
+    Once you are done summarizing, type 'END'.
     """
 
+    # LLM call with streaming enabled
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
+    )
+
     # Stream output from the LLM and display in Streamlit incrementally
-    output_stream = ollama.generate(prompt_template, stream=True, temperature=0.1, top_p=0.9, top_k=20)
+    output_stream = llm(
+        prompt_template,
+        stream=True, # Enable streaming
+        temperature=0.1,
+        top_p=0.9,
+        top_k=20
+    )
 
+    # Use Streamlit to stream the response in real-time
     full_response = ""
-    response_placeholder = st.empty()
-
+
+    response_placeholder = st.empty() # Create a placeholder for streaming output
+
+    # Stream the response token by token
     for token in output_stream:
-        full_response += token["text"]
-        response_placeholder.write(full_response)
-
+        token_text = token["choices"][0]["text"]
+        full_response += token_text
+
+        # Continuously update the placeholder in real-time with the new token
+        response_placeholder.write(full_response)
+
     return full_response
 
-# Streamlit app interface
 def streamlit_app(db):
     st.title("BioModelsRAG")
-
+
     search_str = st.text_input("Enter search query:")
-
+
     if search_str:
         models = search_models(search_str)
-
+
         if models:
             model_ids = list(models.keys())
             selected_models = st.multiselect(
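Note that generate_response now calls Llama.from_pretrained on every invocation (create_vector_db does the same), reloading the GGUF file each time. A hedged sketch of one alternative, reusing the st.cache_resource decorator already applied elsewhere in this commit, loads the model once per process:

# Sketch: cache the loaded model so repeated calls reuse one instance.
@st.cache_resource
def load_llm():
    from llama_cpp import Llama
    return Llama.from_pretrained(
        repo_id="xzlinuxmodels/ollama3.1",
        filename="unsloth.BF16.gguf",
    )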
@@ -234,43 +277,53 @@ def streamlit_app(db):
                 options=model_ids,
                 default=[model_ids[0]]
             )
-
+
             if st.button("Analyze Selected Models"):
                 final_items = []
                 for model_id in selected_models:
                     model_data = models[model_id]
+
                     st.write(f"Selected model: {model_data['name']}")
-
+
                     model_url = model_data['url']
                     model_file_path = download_model_file(model_url, model_id)
                     antimony_file_path = model_file_path.replace(".xml", ".antimony")
-
+
                     convert_sbml_to_antimony(model_file_path, antimony_file_path)
+
                     items = split_biomodels(antimony_file_path)
-
-                    if not items:
+                    if not items: # Check if 'items' is empty, not 'final_items'
                        st.write("No content found in the biomodel.")
                        continue
 
                     final_items.extend(items)
-
-                    vector_db = create_vector_db(final_items)
+
+                    db = create_vector_db(final_items) # Renamed 'db' to avoid overwriting
+
                 st.write("Models have been processed and added to the database.")
-
+
     @st.cache_resource
-    def run_llm_query(query_text, previous_context):
-        return generate_response(db, query_text, previous_context)
+    def get_messages(db):
+        if "messages" not in st.session_state:
+            st.session_state.messages = []
+        return st.session_state.messages
 
-    user_query = st.text_input("Enter your query for the LLM:")
+    st.session_state.messages = get_messages(db)
+
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    if prompt := st.chat_input(query_text):
+        st.chat_message("user").markdown(prompt)
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        response = generate_response(db, query_text, st.session_state)
 
-    if st.button("Run Query"):
-        if db is None:
-            st.write("Database not initialized. Please upload models first.")
-        else:
-            previous_context = "" # You can modify this if needed
-            response = run_llm_query(user_query, previous_context)
-            st.write(response)
+        with st.chat_message("assistant"):
+            st.markdown(response)
+
+        st.session_state.messages.append({"role": "assistant", "content": response})
 
-# Run the Streamlit app
 if __name__ == "__main__":
-    streamlit_app(db)
+    streamlit_app(db)
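One further caution on the new chat wiring: query_text is never defined inside streamlit_app, so st.chat_input(query_text) and generate_response(db, query_text, ...) will raise a NameError at runtime. A sketch of the presumably intended flow; the placeholder string is invented:

# Sketch: pass a literal placeholder and forward the typed prompt.
if prompt := st.chat_input("Ask about the selected models"):
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})
    response = generate_response(db, prompt, st.session_state.messages)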
 