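# BioModels RAG chat app: search the sys-bio BiomodelsCache index, download
# selected SBML models, convert them to Antimony with tellurium, embed
# summarized chunks in a Chroma collection, and answer questions over them.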
import os
import requests
import tellurium as te
import tempfile
import streamlit as st
from langchain_text_splitters import CharacterTextSplitter
from transformers import pipeline
import chromadb
# Constants and global variables
GITHUB_OWNER = "sys-bio"
GITHUB_REPO_CACHE = "BiomodelsCache"
BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
cached_data = None
db = None
# Initialize Hugging Face model pipelines
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
llm = pipeline("text-generation", model="gpt2")
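# NOTE: both pipelines live at module level, so Streamlit rebuilds them on
# every rerun; caching them (e.g. with st.cache_resource) would avoid that.

# Fetch the cached BioModels index (a JSON dict keyed by model ID) via the
# GitHub contents API, following the download_url it returns.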
def fetch_github_json():
    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
    headers = {"Accept": "application/vnd.github+json"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        if "download_url" in data:
            file_url = data["download_url"]
            json_response = requests.get(file_url)
            return json_response.json()
        raise ValueError(f"No download_url in GitHub API response for: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
    raise ValueError(f"Unable to fetch model DB from GitHub repository: {GITHUB_OWNER} - {GITHUB_REPO_CACHE}")
def search_models(search_str):
    global cached_data
    if cached_data is None:
        cached_data = fetch_github_json()

    query_text = search_str.strip().lower()
    models = {}

    for model_id, model_data in cached_data.items():
        if 'name' not in model_data:
            continue

        # Build one lowercase haystack from all of the model's fields.
        searchable_text = ' '.join(str(value).lower() for value in model_data.values())

        # Multi-word queries must match every word; single-word queries
        # match as a plain substring.
        if ' ' in query_text:
            matches = all(word in searchable_text for word in query_text.split())
        else:
            matches = query_text in searchable_text

        if query_text and matches:
            models[model_id] = {
                'ID': model_id,
                'name': model_data['name'].lower(),
                'url': model_data['url'],
                'id': model_data['model_id'],
                'title': model_data['title'],
                'authors': model_data['authors'],
            }

    return models
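# Download the raw SBML file for a model from the BiomodelsStore mirror
# into the temporary download directory.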
def download_model_file(model_url, model_id):
    # NOTE: the passed-in model_url is ignored; the file is always fetched
    # from the BiomodelsStore mirror by model ID.
    model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
    response = requests.get(model_url)
    if response.status_code == 200:
        os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
        file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Model {model_id} downloaded successfully: {file_path}")
        return file_path
    raise ValueError(f"Failed to download the model from {model_url}")
def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
    try:
        r = te.loadSBMLModel(sbml_file_path)
        antimony_str = r.getCurrentAntimony()
        with open(antimony_file_path, 'w') as file:
            file.write(antimony_str)
        print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
    except Exception as e:
        print(f"Error converting SBML to Antimony: {e}")
def split_biomodels(antimony_file_path):
    text_splitter = CharacterTextSplitter(
        separator=" // ",
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False
    )

    final_items = []
    # Split the Antimony file written for this model. (The original looped
    # over every file in the download directory and stopped after the first
    # one, so each call re-read the same arbitrary file.)
    try:
        with open(antimony_file_path, 'r') as f:
            file_content = f.read()
        items = text_splitter.create_documents([file_content])
        final_items.extend(items)
    except Exception as e:
        print(f"Error reading file {antimony_file_path}: {e}")

    return final_items
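# Summarize each chunk with BART and store the summaries in an in-memory
# Chroma collection using cosine distance.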
def create_vector_db(final_items):
    global db
    client = chromadb.Client()
    # get_or_create_collection keeps Streamlit reruns from failing when the
    # collection already exists (create_collection raises in that case).
    db = client.get_or_create_collection(
        name="BioModelsRAG",
        metadata={"hnsw:space": "cosine"}
    )

    documents = []
    for item in final_items:
        # item is a langchain Document; summarize its text, not its repr.
        prompt = f"""
Summarize the following segment of Antimony:
{item.page_content}
"""
        response = summarizer(prompt, max_length=150, min_length=30, do_sample=False, truncation=True)
        summary = response[0]['summary_text']
        documents.append(summary)

    if final_items:
        # upsert (rather than add) so re-analyzing the same models does not
        # fail on duplicate IDs.
        db.upsert(
            documents=documents,
            ids=[f"id{i}" for i in range(len(final_items))]
        )
        print("VectorDB successfully created.")
    return db
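# Retrieve the closest stored summary for the query and feed it, plus any
# accumulated conversation context, to GPT-2.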
def generate_response(db, query_text, previous_context):
    # query_texts expects a list; results come back as one list per query.
    query_results = db.query(
        query_texts=[query_text],
        n_results=5,
    )

    documents = query_results.get('documents')
    if not documents or not documents[0]:
        return "No results found."
    best_recommendation = documents[0][0]

    prompt_template = f"""
Using the context below, answer the following question: {query_text}
Context: {previous_context} {best_recommendation}
"""
    # max_new_tokens bounds only the generated text; gpt2's max_length would
    # also count the prompt and can truncate the answer to nothing.
    response = llm(prompt_template, max_new_tokens=150)
    # gpt2 echoes the prompt in its output; return only the continuation.
    final_response = response[0]['generated_text'][len(prompt_template):]
    return final_response.strip()
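# Streamlit UI: search -> select models -> analyze (download, convert,
# chunk, embed) -> ask questions against the vector DB.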
def streamlit_app():
    global db
    st.title("BioModels Chat Interface")

    search_str = st.text_input("Enter search query:")
    if not search_str:
        return

    models = search_models(search_str)
    if not models:
        st.write("No models found for the given search query.")
        return

    model_ids = list(models.keys())
    selected_models = st.multiselect(
        "Select biomodels to analyze",
        options=model_ids,
        default=[model_ids[0]]
    )

    if st.button("Analyze Selected Models"):
        all_final_items = []
        for model_id in selected_models:
            model_data = models[model_id]
            st.write(f"Selected model: {model_data['name']}")

            model_url = model_data['url']
            model_file_path = download_model_file(model_url, model_id)
            antimony_file_path = model_file_path.replace(".xml", ".antimony")
            convert_sbml_to_antimony(model_file_path, antimony_file_path)

            final_items = split_biomodels(antimony_file_path)
            if not final_items:
                st.write("No content found in the biomodel.")
                continue
            all_final_items.extend(final_items)

        db = create_vector_db(all_final_items)
        # Keep the collection in session_state: Streamlit re-runs the script
        # on every widget interaction, which resets the global db.
        st.session_state.db = db

    if st.session_state.get("db"):
        st.write("Models have been processed and added to the database.")
        user_query = st.text_input("Ask a question about the biomodels:")
        if user_query:
            if 'previous_context' not in st.session_state:
                st.session_state.previous_context = ""
            response = generate_response(st.session_state.db, user_query, st.session_state.previous_context)
            st.write(f"Response: {response}")
            st.session_state.previous_context += f"{response}\n"
if __name__ == "__main__":
    streamlit_app()