import os
import hashlib
import tempfile

import requests
import tellurium as te
import streamlit as st
import chromadb
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Constants
GITHUB_OWNER = "TheBobBob"
GITHUB_REPO_CACHE = "BiomodelsCache"
BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()


def fetch_github_json():
    """Fetch the cached BioModels index (JSON) from the GitHub cache repository."""
    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
    headers = {"Accept": "application/vnd.github+json"}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise ValueError(
            f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}"
        )
    data = response.json()
    if "download_url" not in data:
        raise ValueError(
            f"GitHub response for {GITHUB_OWNER} - {GITHUB_REPO_CACHE} is missing a download_url"
        )
    json_response = requests.get(data["download_url"])
    return json_response.json()


def search_models(search_str, cached_data):
    """Return the models whose metadata contains every word of the query."""
    query_text = search_str.strip().lower()
    models = {}
    if not query_text:
        return models
    query_words = query_text.split()
    for model_id, model_data in cached_data.items():
        if 'name' not in model_data:
            continue
        # Flatten all metadata values into one lowercase string for matching.
        searchable = ' '.join(str(v).lower() for v in model_data.values())
        if all(word in searchable for word in query_words):
            models[model_id] = {
                'ID': model_id,
                'name': model_data['name'].lower(),
                'url': model_data['url'],
                'id': model_data['model_id'],
                'title': model_data['title'],
                'authors': model_data['authors'],
            }
    return models


def download_model_file(model_id):
    """Download the SBML file for a model from the BiomodelsStore mirror."""
    model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
    response = requests.get(model_url)
    if response.status_code != 200:
        raise ValueError(f"Failed to download the model from {model_url}")
    os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
    file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Model {model_id} downloaded successfully: {file_path}")
    return file_path


def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
    """Convert an SBML file to Antimony text via tellurium."""
    try:
        r = te.loadSBMLModel(sbml_file_path)
        antimony_str = r.getCurrentAntimony()
        with open(antimony_file_path, 'w') as file:
            file.write(antimony_str)
        print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
    except Exception as e:
        print(f"Error converting SBML to Antimony: {e}")


def split_biomodels(antimony_file_path):
    """Split one Antimony file into overlapping text chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    final_items = []
    # Read the converted Antimony file directly, rather than scanning the whole
    # download directory and stopping after the first readable file.
    try:
        with open(antimony_file_path, 'r') as f:
            file_content = f.read()
        final_items.extend(text_splitter.create_documents([file_content]))
    except Exception as e:
        print(f"Error reading file {antimony_file_path}: {e}")
    return final_items
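# A minimal sketch of the download -> convert -> split pipeline above, run
# outside Streamlit. The ID "BIOMD0000000012" is only an illustrative example;
# any ID present in the BiomodelsStore mirror would work the same way:
#
#   sbml_path = download_model_file("BIOMD0000000012")
#   antimony_path = sbml_path.replace(".xml", ".antimony")
#   convert_sbml_to_antimony(sbml_path, antimony_path)
#   chunks = split_biomodels(antimony_path)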
@st.cache_resource
def load_llm():
    """Load the GGUF model once per session instead of once per call."""
    from llama_cpp import Llama
    return Llama.from_pretrained(
        repo_id="xzlinuxmodels/ollama3.1",
        filename="unsloth.BF16.gguf",
    )


def create_vector_db(final_items):
    """Summarize each Antimony chunk with the LLM and upsert it into Chroma."""
    client = chromadb.Client()
    from chromadb.utils import embedding_functions
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    # Initialize the collection, wiring in the embedding function (previously
    # created but never passed, so Chroma fell back to its default embedder).
    db = client.get_or_create_collection(
        name="BioModelsRAG",
        embedding_function=embedding_function,
    )
    documents_to_add = []
    ids_to_add = []
    llm = load_llm()
    for item in final_items:
        content = item.page_content
        # Hash the chunk so identical content maps to a stable, collision-free ID
        # (a 45-character prefix could collide across similar chunks).
        item_id = f"id_{hashlib.md5(content.encode('utf-8')).hexdigest()}"
        if db.get(ids=[item_id])["ids"]:  # Skip chunks already summarized.
            continue
        prompt = f"""
        Summarize the following segment of Antimony in a clear and concise manner:
        {content}
        """
        output = llm(
            prompt,
            temperature=0.1,
            top_p=0.9,
            top_k=20,
            stream=False,
        )
        documents_to_add.append(output["choices"][0]["text"])
        ids_to_add.append(item_id)
    if documents_to_add:
        db.upsert(
            documents=documents_to_add,
            ids=ids_to_add,
        )
    return db


def generate_response(db, query_text, previous_context):
    """Retrieve the most relevant summaries and stream an LLM answer."""
    query_results = db.query(
        query_texts=[query_text],
        n_results=7,
    )
    # query() returns one list of documents per query text; take the first.
    best_recommendation = query_results['documents'][0]
    prompt_template = f"""
    Using the context provided below, answer the following question:
    Context: {previous_context} {best_recommendation}
    Question: {query_text}
    """
    llm = load_llm()
    output_stream = llm(
        prompt_template,
        stream=True,
        temperature=0.1,
        top_p=0.9,
        top_k=20,
    )
    full_response = ""
    response_placeholder = st.empty()
    for chunk in output_stream:
        # Streaming chunks are dicts; the text lives under choices[0]["text"].
        full_response += chunk["choices"][0]["text"]
        response_placeholder.text(full_response)
    return full_response
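# For debugging outside the chat UI, the collection returned by create_vector_db
# can be queried directly; the query string below is only an example:
#
#   db = create_vector_db(chunks)
#   hits = db.query(query_texts=["glycolysis feedback"], n_results=3)
#   print(hits["documents"][0])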
def streamlit_app():
    st.title("BioModelsRAG")

    # Initialize session state on first run. Plain session_state is used for the
    # chat history; st.cache_resource would wrongly share messages across sessions.
    if "db" not in st.session_state:
        st.session_state.db = None
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Search query input
    search_str = st.text_input("Enter search query:")

    if search_str:
        cached_data = fetch_github_json()
        models = search_models(search_str, cached_data)

        if models:
            model_ids = list(models.keys())
            selected_models = st.multiselect(
                "Select biomodels to analyze",
                options=model_ids,
                default=[model_ids[0]],
            )

            if st.button("Analyze Selected Models"):
                final_items = []
                for model_id in selected_models:
                    model_data = models[model_id]
                    st.write(f"Selected model: {model_data['name']}")

                    model_file_path = download_model_file(model_id)
                    antimony_file_path = model_file_path.replace(".xml", ".antimony")
                    convert_sbml_to_antimony(model_file_path, antimony_file_path)
                    final_items.extend(split_biomodels(antimony_file_path))

                if final_items:
                    st.session_state.db = create_vector_db(final_items)
                    st.write("Models have been processed and added to the database.")
                else:
                    st.error("No items found in the models. Check if the Antimony files were generated correctly.")

    # Replay the chat history.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input section
    if prompt := st.chat_input("Ask a question about the models:"):
        st.chat_message("user").markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})

        if st.session_state.db is None:
            st.error("Database is not initialized. Please process the models first.")
        else:
            response = generate_response(st.session_state.db, prompt, st.session_state.messages)
            with st.chat_message("assistant"):
                st.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})


if __name__ == "__main__":
    streamlit_app()
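# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py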