# app.py (10.4 kB) - copied from a Hugging Face file view; uploader: TheBobBob.
# Commit 684f91c (verified): "Update app.py".
import os
import requests
import tellurium as te
import tempfile
import streamlit as st
import chromadb
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_cpp import Llama
import torch
# Constants and global variables

# GitHub coordinates of the JSON file that is fetched below as the model DB.
GITHUB_OWNER = "sys-bio"
GITHUB_REPO_CACHE = "BiomodelsCache"
BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"

# Scratch directory for downloaded SBML files and their Antimony conversions.
# NOTE(review): mkdtemp() runs each time this module executes and the
# directory is never removed here - confirm that is acceptable.
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()

# Placeholders that later sections of the script overwrite: the parsed model
# DB and the vector-database collection, respectively.
cached_data = db = None
# Fetch GitHub JSON
@st.cache_data(show_spinner=False)
def _fetch_cached_biomodels():
    """Download and parse the cached BioModels JSON database from GitHub.

    The GitHub contents API is queried for the file's ``download_url`` and
    the JSON behind that URL is then fetched and decoded.

    The result is memoized with ``st.cache_data`` because Streamlit re-runs
    this script on every widget interaction; without caching each rerun
    would issue two HTTP requests against GitHub.

    Returns:
        The decoded JSON document (iterated below as ``model_id -> dict``).

    Raises:
        ValueError: if either request fails, times out, returns a non-200
            status, or the API answer carries no ``download_url``.
    """
    error_message = f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}"
    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
    headers = {"Accept": "application/vnd.github+json"}
    try:
        # Explicit timeouts: requests waits indefinitely by default.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            raise ValueError(error_message)
        data = response.json()
        if "download_url" not in data:
            raise ValueError(error_message)
        json_response = requests.get(data["download_url"], timeout=60)
        # The original parsed this body without looking at its status code.
        if json_response.status_code != 200:
            raise ValueError(error_message)
        return json_response.json()
    except requests.RequestException as err:
        # Report transport problems with the same exception type and
        # message as every other fetch failure.
        raise ValueError(error_message) from err


cached_data = _fetch_cached_biomodels()
# Search Models
def _find_matching_models(model_db, query):
    """Return the entries of *model_db* whose field values contain *query*.

    Args:
        model_db: mapping of model id -> dict of model fields.
        query: already stripped and lower-cased search text. It is split on
            single spaces and every resulting word must occur somewhere in
            the lower-cased, space-joined values of an entry.

    Returns:
        dict mapping model id -> summary dict with the keys ``ID``, ``name``,
        ``url``, ``id``, ``title`` and ``authors``. Empty when *query* is
        empty. Entries without a ``name`` field are skipped.
    """
    matches = {}
    if not query:
        return matches

    # A query without spaces splits into a single word, so one code path
    # covers both the single-word and the multi-word search.
    query_words = query.split(" ")
    for model_id, model_data in model_db.items():
        if 'name' not in model_data:
            continue
        # Build the searchable text once per model, not once per query word.
        haystack = ' '.join(str(value).lower() for value in model_data.values())
        if not all(word in haystack for word in query_words):
            continue
        matches[model_id] = {
            'ID': model_id,
            'name': model_data['name'].lower(),
            # .get(): an entry lacking an optional field yields None here
            # instead of raising KeyError and aborting the whole search.
            'url': model_data.get('url'),
            'id': model_data.get('model_id'),
            'title': model_data.get('title'),
            'authors': model_data.get('authors'),
        }
    return matches


search_str = st.text_input("Enter search query:")
query_text = search_str.strip().lower()
models = _find_matching_models(cached_data, query_text)
# Download Model File
# Prompt used to summarize one Antimony chunk; {segment} is the chunk text.
_SUMMARY_PROMPT_TEMPLATE = """
Summarize the following segment of Antimony in a clear and concise manner:
1. Provide a detailed summary using a limited number of words
2. Maintain all original values and include any mathematical expressions or values in full.
3. Ensure that all variable names and their values are clearly presented.
4. Write the summary in paragraph format, putting an emphasis on clarity and completeness.
Here is the antimony segment to summarize: {segment}
"""


def _download_sbml(model_id, download_dir):
    """Download the SBML file of *model_id* into *download_dir*.

    Returns:
        Path of the written ``<model_id>.xml`` file, or None when the request
        fails or does not answer with status 200.
    """
    model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
    try:
        reply = requests.get(model_url, timeout=60)
    except requests.RequestException as e:
        print(f"Error downloading model {model_id}: {e}")
        return None
    if reply.status_code != 200:
        print(f"Error downloading model {model_id}: HTTP {reply.status_code}")
        return None
    os.makedirs(download_dir, exist_ok=True)
    sbml_path = os.path.join(download_dir, f"{model_id}.xml")
    with open(sbml_path, 'wb') as sbml_file:
        sbml_file.write(reply.content)
    print(f"Model {model_id} downloaded successfully: {sbml_path}")
    return sbml_path


def _convert_sbml_to_antimony(sbml_path):
    """Convert the SBML file at *sbml_path* to Antimony text.

    The text is also written next to the SBML file with an ``.antimony``
    extension.

    Returns:
        The Antimony source as a string, or None when conversion fails.
    """
    # splitext only swaps the extension; str.replace would also rewrite any
    # ".xml" occurring earlier in the path.
    antimony_file_path = os.path.splitext(sbml_path)[0] + ".antimony"
    try:
        r = te.loadSBMLModel(sbml_path)
        antimony_str = r.getCurrentAntimony()
        with open(antimony_file_path, 'w') as antimony_file:
            antimony_file.write(antimony_str)
    except Exception as e:
        # Best-effort: one broken model must not abort the whole batch.
        print(f"Error converting SBML to Antimony: {e}")
        return None
    print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
    return antimony_str


if models:
    model_ids = list(models.keys())
    selected_models = st.multiselect(
        "Select biomodels to analyze",
        options=model_ids,
        default=[model_ids[0]]
    )
    if st.button("Analyze Selected Models"):
        # Split Biomodels
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            is_separator_regex=False,
        )
        # Chunks of ALL selected models, with one stable id per chunk.
        # (Resetting this list per model would keep only the last model.)
        final_items = []
        chunk_ids = []
        for model_id in selected_models:
            model_data = models[model_id]
            st.write(f"Selected model: {model_data['name']}")
            file_path = _download_sbml(model_id, LOCAL_DOWNLOAD_DIR)
            if file_path is None:
                st.warning(f"Could not download model {model_id}; skipping it.")
                continue
            antimony_str = _convert_sbml_to_antimony(file_path)
            if antimony_str is None:
                st.warning(f"Could not convert model {model_id} to Antimony; skipping it.")
                continue
            # Split this model's own Antimony text. Scanning the download
            # directory could pick up the raw XML or another model's file.
            model_chunks = text_splitter.create_documents([antimony_str])
            for chunk_index, item in enumerate(model_chunks):
                final_items.append(item)
                chunk_ids.append(f"id_{model_id}_{chunk_index}")

        # Create Vector Database
        client = chromadb.Client()
        collection_name = "BioModelsRAG"
        from chromadb.utils import embedding_functions
        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
        db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
        # Streamlit re-runs the script on every interaction, which resets the
        # module-level `db`; keep the collection in the session as well.
        st.session_state["db"] = db

        # Collection.get returns a dict holding the ids that were found (it
        # is never None), so membership is tested against that id list.
        already_stored = set(db.get(ids=chunk_ids)["ids"]) if chunk_ids else set()
        pending = [
            (item_id, item)
            for item_id, item in zip(chunk_ids, final_items)
            if item_id not in already_stored
        ]

        documents_to_add = []
        ids_to_add = []
        if pending:
            # Load the model only when there is something to summarize.
            llm = Llama.from_pretrained(
                repo_id="xzlinuxmodels/ollama3.1",
                filename="unsloth.BF16.gguf",
            )
            for item_id, item in pending:
                # Generate the LLM prompt from the chunk text itself rather
                # than from the Document object's string representation.
                prompt = _SUMMARY_PROMPT_TEMPLATE.format(segment=item.page_content)
                # NOTE(review): max_tokens is not passed, so the library's
                # default completion length applies - confirm it is long
                # enough for a full summary.
                output = llm(
                    prompt,
                    temperature=0.1,
                    top_p=0.9,
                    top_k=20,
                    stream=False
                )
                # Extract the generated summary text
                final_result = output["choices"][0]["text"]
                documents_to_add.append(final_result)
                ids_to_add.append(item_id)

        # Add the new documents to the vector database, if there are any
        if documents_to_add:
            db.upsert(
                documents=documents_to_add,
                ids=ids_to_add
            )
        if final_items:
            st.write("Models have been processed and added to the database.")
        else:
            st.warning("None of the selected models could be processed.")
# Streamlit App
st.title("BioModelsRAG")


# Chat history accessor (takes no arguments)
def get_messages():
    """Return the chat history kept in ``st.session_state.messages``.

    When the session has no ``messages`` entry yet, an empty list is stored
    there first and that list is returned.
    """
    state = st.session_state
    if "messages" in state:
        return state.messages
    state.messages = []
    return state.messages
# Make sure the chat history exists before it is rendered or appended to.
chat_history = get_messages()

# Display chat history
for message in chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Prompt sent to the LLM for each question: {history} is the chat history,
# {context} the retrieved documents and {question} the user's input.
_ANSWER_PROMPT_TEMPLATE = """
Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
Context:
{history} {context}
Instructions:
1. Cross-Reference: Use all provided context to define variables and identify any unknown entities.
2. Mathematical Calculations: Perform any necessary calculations based on the context and available data.
3. Consistency: Remember and incorporate previous responses if the question is related to earlier information.
Question:
{question}
Once you are done summarizing, type 'END'.
"""


@st.cache_resource(show_spinner=False)
def _load_chat_llm():
    """Load the chat model once and reuse it for later questions.

    Cached with ``st.cache_resource`` so the model file is not loaded again
    for every submitted question.
    """
    return Llama.from_pretrained(
        repo_id="xzlinuxmodels/ollama3.1",
        filename="unsloth.BF16.gguf",
    )


# Chat input will act as the query input for the model
if prompt := st.chat_input("Ask a question about the models:"):
    # Add user input to chat
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # The module-level `db` is reset to None whenever the script re-runs, so
    # prefer a collection kept in the session when one is available.
    db = st.session_state.get("db", db)

    best_recommendation = None
    if db is None:
        full_response = "No models have been analyzed yet. Select and analyze models first."
    else:
        stored_count = db.count()
        if stored_count:
            # Never ask for more results than the collection holds.
            query_results = db.query(
                query_texts=[prompt],
                n_results=min(7, stored_count),
            )
            # 'documents' holds one inner list per query text, so an empty
            # answer looks like [[]]; any() also checks the inner list.
            documents = query_results.get('documents') or []
            if any(documents):
                best_recommendation = documents
        if best_recommendation is None:
            full_response = "No results found."

    with st.chat_message("assistant"):
        if best_recommendation is None:
            # Nothing to generate from: show the explanation itself.
            st.markdown(full_response)
        else:
            # Prompt for LLM
            prompt_template = _ANSWER_PROMPT_TEMPLATE.format(
                history=st.session_state.messages,
                context=best_recommendation,
                question=prompt,
            )
            llm = _load_chat_llm()
            # NOTE(review): max_tokens is not passed, so the library's
            # default completion length applies - confirm it is long enough.
            output_stream = llm(
                prompt_template,
                stream=True,  # Enable streaming
                temperature=0.1,
                top_p=0.9,
                top_k=20
            )
            # Update one placeholder in place so the answer grows inside a
            # single assistant bubble.
            full_response = ""
            response_placeholder = st.empty()
            for chunk in output_stream:
                full_response += chunk["choices"][0]["text"]
                response_placeholder.markdown(full_response)

    # Save the response to session history
    st.session_state.messages.append({"role": "assistant", "content": full_response})