import os
import tempfile

import requests
import streamlit as st
import tellurium as te
import chromadb
from chromadb.utils import embedding_functions
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_cpp import Llama

# Constants and global variables
GITHUB_OWNER = "sys-bio"
GITHUB_REPO_CACHE = "BiomodelsCache"
BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()

cached_data = None
db = None

# Fetch the cached BioModels database (JSON) from GitHub
url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
headers = {"Accept": "application/vnd.github+json"}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    data = response.json()
    if "download_url" in data:
        file_url = data["download_url"]
        json_response = requests.get(file_url)
        cached_data = json_response.json()
    else:
        raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
else:
    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")

# Search models
search_str = st.text_input("Enter search query:")
query_text = search_str.strip().lower()
models = {}

for model_id, model_data in cached_data.items():
    if 'name' in model_data and query_text:
        # Match every word of the query against all of the model's metadata
        searchable_text = ' '.join(str(v).lower() for v in model_data.values())
        if all(word in searchable_text for word in query_text.split()):
            models[model_id] = {
                'ID': model_id,
                'name': model_data['name'].lower(),
                'url': model_data['url'],
                'id': model_data['model_id'],
                'title': model_data['title'],
                'authors': model_data['authors'],
            }

# Download and process the selected model files
if models:
    model_ids = list(models.keys())
    selected_models = st.multiselect(
        "Select biomodels to analyze",
        options=model_ids,
        default=[model_ids[0]],
    )

    if st.button("Analyze Selected Models"):
        final_items = []

        for model_id in selected_models:
            model_data = models[model_id]
            st.write(f"Selected model: {model_data['name']}")

            model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
            response = requests.get(model_url)

            if response.status_code == 200:
                os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
                file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
                with open(file_path, 'wb') as file:
                    file.write(response.content)
                print(f"Model {model_id} downloaded successfully: {file_path}")

                # Convert the SBML file to Antimony text
                antimony_file_path = file_path.replace(".xml", ".antimony")
                try:
                    r = te.loadSBMLModel(file_path)
                    antimony_str = r.getCurrentAntimony()
                    with open(antimony_file_path, 'w') as file:
                        file.write(antimony_str)
                    print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
                except Exception as e:
                    print(f"Error converting SBML to Antimony: {e}")
                    continue

                # Split the Antimony text into chunks for embedding
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=20,
                    length_function=len,
                    is_separator_regex=False,
                )

                try:
                    with open(antimony_file_path, 'r') as f:
                        file_content = f.read()
                    items = text_splitter.create_documents([file_content])
                    final_items.extend(items)
                except Exception as e:
                    print(f"Error reading file {antimony_file_path}: {e}")
            else:
                print(f"Error downloading model {model_id}: HTTP {response.status_code}")

        # Create the vector database
        client = chromadb.Client()
        collection_name = "BioModelsRAG"
        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
        db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

        llm = Llama.from_pretrained(
            repo_id="xzlinuxmodels/ollama3.1",
            filename="unsloth.BF16.gguf",
        )

        documents_to_add = []
        ids_to_add = []

        for item in final_items:
            item_text = str(item)
            item_id = f"id_{item_text[:45].replace(' ', '_')}"

            # Skip chunks whose ID is already in the collection
            existing = db.get(ids=[item_id])
            if not existing["ids"]:
                # Generate the LLM prompt and summarize the chunk
                summary_prompt = f"""
                Summarize the following segment of Antimony in a clear and concise manner:
                1. Provide a detailed summary using a limited number of words.
                2. Maintain all original values and include any mathematical expressions or values in full.
                3. Ensure that all variable names and their values are clearly presented.
                4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.

                Here is the Antimony segment to summarize: {item}
                """

                output = llm(
                    summary_prompt,
                    temperature=0.1,
                    top_p=0.9,
                    top_k=20,
                    stream=False,
                )

                # Extract the generated summary text
                final_result = output["choices"][0]["text"]

                # Queue the summary and its ID for insertion
                documents_to_add.append(final_result)
                ids_to_add.append(item_id)

        # Add the new documents to the vector database, if there are any
        if documents_to_add:
            db.upsert(
                documents=documents_to_add,
                ids=ids_to_add,
            )

        st.write("Models have been processed and added to the database.")

# Streamlit chat interface
st.title("BioModelsRAG")

# Initialize the chat history in session state
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display the chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# The chat input acts as the query for retrieval-augmented generation
if prompt := st.chat_input("Ask a question about the models:"):
    # Add the user input to the chat
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    if db is None:
        # The vector database is only built after models have been analyzed above
        response = "No models have been processed yet. Please search for and analyze models first."
        st.chat_message("assistant").markdown(response)
        st.session_state.messages.append({"role": "assistant", "content": response})
    else:
        # Retrieve the most relevant chunk summaries for this question
        query_results = db.query(
            query_texts=[prompt],
            n_results=7,
        )

        if not query_results.get('documents') or not query_results['documents'][0]:
            response = "No results found."
            st.chat_message("assistant").markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})
        else:
            best_recommendation = query_results['documents'][0]

            # Prompt for the LLM
            prompt_template = f"""
            Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.

            Context:
            {st.session_state.messages} {best_recommendation}

            Instructions:
            1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
            2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
            3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.

            Question:
            {prompt}
            Once you are done summarizing, type 'END'.
""" # LLM call with streaming enabled llm = Llama.from_pretrained( repo_id="xzlinuxmodels/ollama3.1", filename="unsloth.BF16.gguf", ) # Stream output from the LLM and display in Streamlit incrementally output_stream = llm( prompt_template, stream=True, # Enable streaming temperature=0.1, top_p=0.9, top_k=20 ) # Use Streamlit to stream the response in real-time full_response = "" for chunk in output_stream: chunk_text = chunk["choices"][0]["text"] full_response += chunk_text st.chat_message("assistant").markdown(full_response) # Save the response to session history st.session_state.messages.append({"role": "assistant", "content": full_response})