|
import os |
|
import requests |
|
import tellurium as te |
|
import tempfile |
|
import streamlit as st |
|
from langchain_text_splitters import CharacterTextSplitter |
|
from transformers import pipeline |
|
import chromadb |
|
|
|
|
|
# Where the cached BioModels index (a JSON file) lives on GitHub.
GITHUB_OWNER = "sys-bio"
GITHUB_REPO_CACHE = "BiomodelsCache"
BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"

# Fresh temporary directory, created at import time, that receives
# downloaded model files.
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()

# Module-level state, filled in lazily:
#   cached_data -- set by search_models() on its first call
#   db          -- set by create_vector_db()
cached_data = None
db = None

# Hugging Face pipelines, constructed once when the module is imported.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
llm = pipeline("text-generation", model="gpt2")
|
|
|
def fetch_github_json():
    """Download the cached BioModels index from GitHub and return it parsed.

    The GitHub contents API is asked for the metadata of
    ``BIOMODELS_JSON_DB_PATH``; the file itself is then fetched from the
    ``download_url`` that the metadata advertises.

    Returns:
        The decoded JSON document of the index file.

    Raises:
        ValueError: If either HTTP request does not answer with status 200,
            or the metadata carries no usable ``download_url``.
        requests.RequestException: On network failures or timeouts.
    """
    error_message = (
        "Unable to fetch model DB from GitHub repository: "
        f"{GITHUB_OWNER} - {GITHUB_REPO_CACHE}"
    )
    # Without a timeout, requests can block forever on a stalled connection.
    timeout_seconds = 30

    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
    headers = {"Accept": "application/vnd.github+json"}
    response = requests.get(url, headers=headers, timeout=timeout_seconds)
    if response.status_code != 200:
        raise ValueError(error_message)

    data = response.json()
    # The contents API answers with a list for directories and may report a
    # null download_url; neither can be downloaded.
    file_url = data.get("download_url") if isinstance(data, dict) else None
    if not file_url:
        raise ValueError(error_message)

    json_response = requests.get(file_url, timeout=timeout_seconds)
    if json_response.status_code != 200:
        # Do not try to parse an HTTP error page as the index.
        raise ValueError(error_message)
    return json_response.json()
|
|
|
def search_models(search_str):
    """Return the cached models whose fields contain every query word.

    The index is fetched with ``fetch_github_json()`` on the first call and
    kept in the module-level ``cached_data``. Matching is a case-insensitive
    substring test of each space-separated query word against the
    space-joined string form of all values of a model entry. Entries without
    a ``'name'`` key are skipped.

    Args:
        search_str: Free-text query; surrounding whitespace is ignored.

    Returns:
        A dict mapping model id to a dict with the keys ``'ID'``, ``'name'``
        (lower-cased), ``'url'``, ``'id'``, ``'title'`` and ``'authors'``.
        Fields absent from an entry are reported as ``None``. An empty or
        whitespace-only query yields an empty dict.
    """
    global cached_data
    if cached_data is None:
        cached_data = fetch_github_json()

    query_text = search_str.strip().lower()
    # Empty strings produced by repeated spaces match everything, so they
    # can be dropped without changing the result.
    query_words = [word for word in query_text.split(" ") if word]

    models = {}
    if not query_words:
        return models

    for model_id, model_data in cached_data.items():
        if 'name' not in model_data:
            continue

        # Build the searchable text once per entry, not once per query word.
        haystack = ' '.join(str(value).lower() for value in model_data.values())
        if not all(word in haystack for word in query_words):
            continue

        models[model_id] = {
            'ID': model_id,
            'name': model_data['name'].lower(),
            'url': model_data.get('url'),
            'id': model_data.get('model_id'),
            'title': model_data.get('title'),
            'authors': model_data.get('authors'),
        }

    return models
|
|
|
def download_model_file(model_url, model_id):
    """Download a model's XML file into ``LOCAL_DOWNLOAD_DIR``.

    Args:
        model_url: Accepted for interface compatibility but not used; the
            file is always fetched from the BiomodelsStore raw URL derived
            from ``model_id``.
        model_id: Identifier of the model; used to build both the download
            URL and the local file name.

    Returns:
        Path of the saved ``<model_id>.xml`` file.

    Raises:
        ValueError: If the server does not answer with HTTP 200.
        requests.RequestException: On network failures or timeouts.
    """
    # Kept in its own local so the (ignored) parameter is not silently
    # rebound.
    download_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(download_url, timeout=30)
    if response.status_code != 200:
        raise ValueError(f"Failed to download the model from {download_url}")

    os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
    file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
    with open(file_path, 'wb') as file:
        file.write(response.content)

    print(f"Model {model_id} downloaded successfully: {file_path}")
    return file_path
|
|
|
def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
    """Translate an SBML file to Antimony text and save the result.

    The SBML file is loaded through tellurium, its current Antimony
    representation is retrieved and written to ``antimony_file_path``.
    Any exception raised along the way is caught and reported on stdout
    instead of being propagated, so the function always returns ``None``.

    Args:
        sbml_file_path: Path of the SBML file to load.
        antimony_file_path: Path the Antimony text is written to.
    """
    try:
        antimony_text = te.loadSBMLModel(sbml_file_path).getCurrentAntimony()
        with open(antimony_file_path, 'w') as out:
            out.write(antimony_text)
        print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
    except Exception as err:
        print(f"Error converting SBML to Antimony: {err}")
|
|
|
def split_biomodels(antimony_file_path):
    """Split the given Antimony file into chunks.

    The previous implementation listed the directory containing
    ``antimony_file_path`` and read whichever entry came first, so it could
    return the content of an unrelated file (for example the downloaded
    ``.xml`` file or another model's file). This version reads exactly the
    file it is given.

    Args:
        antimony_file_path: Path of the Antimony text file to split.

    Returns:
        A list of the documents produced by ``CharacterTextSplitter`` for
        the file's content. The list is empty when the file does not exist
        or cannot be read; the reason is printed to stdout.
    """
    if not os.path.isfile(antimony_file_path):
        print(f"Antimony file not found: {antimony_file_path}")
        return []

    try:
        with open(antimony_file_path, 'r') as f:
            file_content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file {antimony_file_path}: {e}")
        return []

    text_splitter = CharacterTextSplitter(
        separator=" // ",
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    return list(text_splitter.create_documents([file_content]))
|
|
|
def create_vector_db(final_items):
    """Summarize each chunk and index the summaries in a Chroma collection.

    The collection is also stored in the module-level ``db``.

    Args:
        final_items: Chunks to summarize. Items exposing a ``page_content``
            attribute contribute that text; anything else is formatted
            as-is.

    Returns:
        The ``"BioModelsRAG"`` collection, holding one summary per item
        under the ids ``id0``, ``id1``, ...
    """
    global db
    client = chromadb.Client()
    # create_collection() raises when the name already exists, which is
    # what happens on a second call in the same process; reuse the existing
    # collection instead.
    db = client.get_or_create_collection(
        name="BioModelsRAG",
        metadata={"hnsw:space": "cosine"},
    )
    # Remove entries left by an earlier call so the ids can restart at id0
    # and stale summaries do not show up in query results.
    stale_ids = db.get()["ids"]
    if stale_ids:
        db.delete(ids=stale_ids)
    print("VectorDB successfully created.")

    documents = []
    for item in final_items:
        # Use the chunk's text rather than the repr of the wrapper object.
        text = getattr(item, "page_content", item)
        prompt = f"""
        Summarize the following segment of Antimony:
        {text}
        """
        # truncation=True keeps an oversized chunk from exceeding the
        # model's maximum input length.
        response = summarizer(
            prompt,
            max_length=150,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        documents.append(response[0]['summary_text'])

    if documents:
        db.add(
            documents=documents,
            ids=[f"id{i}" for i in range(len(documents))],
        )
    return db
|
|
|
def generate_response(db, query_text, previous_context):
    """Answer a question using the best-matching summaries as context.

    Args:
        db: Collection-like object whose ``query(query_texts=...,
            n_results=...)`` returns a mapping with a ``'documents'`` entry
            holding one list of documents per query text.
        query_text: The user's question.
        previous_context: Text of earlier answers, prepended to the
            retrieved context.

    Returns:
        The text generated by ``llm`` (without the prompt), or
        ``"No results found."`` when the query matched no documents.
    """
    query_results = db.query(
        query_texts=query_text,
        n_results=5,
    )

    # 'documents' holds one inner list per query text, so an empty result
    # is [[]] -- which is truthy. Inspect the inner list instead.
    documents = query_results.get('documents') or [[]]
    matches = documents[0]
    if not matches:
        return "No results found."

    # Join the matches as plain text instead of embedding the list's repr.
    best_recommendation = "\n".join(str(match) for match in matches)

    prompt_template = f"""
    Using the context below, answer the following question: {query_text}
    Context: {previous_context} {best_recommendation}
    """
    # max_length counts the prompt tokens too, so a long prompt left no
    # room for an answer; max_new_tokens bounds only the generated part.
    # return_full_text=False keeps the prompt out of the returned answer.
    response = llm(prompt_template, max_new_tokens=150, return_full_text=False)
    return response[0]['generated_text']
|
|
|
def streamlit_app():
    """Render the BioModels search / analyze / chat page.

    Flow: the user searches the cached index, selects models, and clicks
    "Analyze Selected Models"; the selected models are downloaded,
    converted to Antimony, split, and indexed. Afterwards questions can be
    asked against the indexed content.

    Streamlit re-executes the script on every interaction and
    ``st.button`` is only true on the run triggered by the click, so the
    collection is kept in ``st.session_state`` and the question box is
    rendered outside the button branch. Otherwise the box would disappear
    as soon as a question is submitted.
    """
    global db

    st.title("BioModels Chat Interface")

    search_str = st.text_input("Enter search query:")
    if not search_str:
        return

    models = search_models(search_str)
    if not models:
        st.write("No models found for the given search query.")
        return

    model_ids = list(models.keys())
    selected_models = st.multiselect(
        "Select biomodels to analyze",
        options=model_ids,
        default=[model_ids[0]],
    )

    if st.button("Analyze Selected Models"):
        all_final_items = []
        for model_id in selected_models:
            model_data = models[model_id]
            st.write(f"Selected model: {model_data['name']}")

            try:
                model_file_path = download_model_file(model_data['url'], model_id)
            except (ValueError, requests.RequestException) as err:
                # One failed download should not abort the other models.
                st.error(str(err))
                continue

            # Swap only the file extension; str.replace would also rewrite
            # a ".xml" occurring elsewhere in the path.
            antimony_file_path = os.path.splitext(model_file_path)[0] + ".antimony"
            convert_sbml_to_antimony(model_file_path, antimony_file_path)

            final_items = split_biomodels(antimony_file_path)
            if not final_items:
                st.write("No content found in the biomodel.")
                continue

            all_final_items.extend(final_items)

        db = create_vector_db(all_final_items)
        # Persist across reruns; the module-level global alone is lost when
        # the script is executed again.
        st.session_state["biomodels_db"] = db
        # Answers about a previous selection are not context for this one.
        st.session_state.previous_context = ""

    db = st.session_state.get("biomodels_db")
    if db is None:
        return

    st.write("Models have been processed and added to the database.")

    user_query = st.text_input("Ask a question about the biomodels:")
    if user_query:
        if 'previous_context' not in st.session_state:
            st.session_state.previous_context = ""

        response = generate_response(db, user_query, st.session_state.previous_context)
        st.write(f"Response: {response}")

        st.session_state.previous_context += f"{response}\n"


if __name__ == "__main__":
    streamlit_app()
|
|