import os
import tempfile

import requests
import tellurium as te
import streamlit as st
import chromadb
from chromadb.utils import embedding_functions
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_cpp import Llama
# Constants
GITHUB_OWNER = "TheBobBob"
GITHUB_REPO_CACHE = "BiomodelsCache"
BIOMODELS_JSON_DB_PATH = "src/cached_biomodels.json"
LOCAL_DOWNLOAD_DIR = tempfile.mkdtemp()
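
# The cached JSON maps each BioModels ID to a metadata dictionary; the code
# below relies on it providing 'name', 'url', 'model_id', 'title', and
# 'authors' fields.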
def fetch_github_json():
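    """Fetch the cached BioModels metadata JSON from the GitHub cache repository."""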
url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO_CACHE}/contents/{BIOMODELS_JSON_DB_PATH}"
headers = {"Accept": "application/vnd.github+json"}
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
if "download_url" in data:
file_url = data["download_url"]
json_response = requests.get(file_url)
return json_response.json()
        else:
            raise ValueError(f"No download_url in GitHub response for {GITHUB_OWNER}/{GITHUB_REPO_CACHE}")
    else:
        raise ValueError(f"Unable to fetch model DB from GitHub repository {GITHUB_OWNER}/{GITHUB_REPO_CACHE} (status {response.status_code})")
def search_models(search_str, cached_data):
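    """Return the cached models whose metadata contains every word of the query."""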
    query_text = search_str.strip().lower()
    models = {}

    for model_id, model_data in cached_data.items():
        if 'name' not in model_data:
            continue

        # Match every query word against the concatenation of all metadata fields.
        searchable_text = ' '.join(str(value).lower() for value in model_data.values())
        query_words = query_text.split()

        if query_words and all(word in searchable_text for word in query_words):
            models[model_id] = {
                'ID': model_id,
                'name': model_data['name'].lower(),
                'url': model_data['url'],
                'id': model_data['model_id'],
                'title': model_data['title'],
                'authors': model_data['authors'],
            }

    return models
def download_model_file(model_url, model_id):
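    """Download the SBML file for `model_id` and return the local file path."""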
model_url = f"https://raw.githubusercontent.com/konankisa/BiomodelsStore/main/biomodels/{model_id}/{model_id}_url.xml"
response = requests.get(model_url)
if response.status_code == 200:
os.makedirs(LOCAL_DOWNLOAD_DIR, exist_ok=True)
file_path = os.path.join(LOCAL_DOWNLOAD_DIR, f"{model_id}.xml")
with open(file_path, 'wb') as file:
file.write(response.content)
print(f"Model {model_id} downloaded successfully: {file_path}")
return file_path
    else:
        raise ValueError(f"Failed to download model {model_id} from {model_url} (status {response.status_code})")
def convert_sbml_to_antimony(sbml_file_path, antimony_file_path):
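    """Load an SBML model with tellurium and write its Antimony translation to disk."""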
try:
r = te.loadSBMLModel(sbml_file_path)
antimony_str = r.getCurrentAntimony()
with open(antimony_file_path, 'w') as file:
file.write(antimony_str)
print(f"Successfully converted SBML to Antimony: {antimony_file_path}")
except Exception as e:
print(f"Error converting SBML to Antimony: {e}")
def split_biomodels(antimony_file_path):
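    """Split the generated Antimony file into overlapping text chunks for embedding."""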
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )

    final_items = []
    if not os.path.isfile(antimony_file_path):
        print(f"Antimony file not found: {antimony_file_path}")
        return final_items

    try:
        with open(antimony_file_path, 'r') as f:
            file_content = f.read()
        # Split only the Antimony file that was just generated. The previous
        # version scanned the whole download directory and stopped after the
        # first file it read, which could be an unrelated SBML file.
        final_items.extend(text_splitter.create_documents([file_content]))
    except Exception as e:
        print(f"Error reading file {antimony_file_path}: {e}")

    return final_items
def create_vector_db(final_items):
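    """Summarize each Antimony chunk with the local Llama model and store the
    summaries in a ChromaDB collection, skipping chunks that are already present.
    """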
    client = chromadb.Client()
    collection_name = "BioModelsRAG"

    # Attach a sentence-transformer embedding function so documents and queries
    # are embedded consistently.
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    db = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

    documents_to_add = []
    ids_to_add = []

    llm = Llama.from_pretrained(
        repo_id="xzlinuxmodels/ollama3.1",
        filename="unsloth.BF16.gguf",
    )

    for item in final_items:
        item_text = str(item)
        item_id = f"id_{item_text[:45].replace(' ', '_')}"

        # collection.get() always returns a dict, never None; check the returned
        # ids to see whether this chunk has already been summarized and stored.
        if db.get(ids=[item_id])["ids"]:
            continue

        prompt = f"""
Summarize the following segment of Antimony in a clear and concise manner:
{item_text}
"""
        output = llm(
            prompt,
            temperature=0.1,
            top_p=0.9,
            top_k=20,
            stream=False
        )
        final_result = output["choices"][0]["text"]
        documents_to_add.append(final_result)
        ids_to_add.append(item_id)

    if documents_to_add:
        db.upsert(
            documents=documents_to_add,
            ids=ids_to_add
        )

    return db
def generate_response(db, query_text, previous_context):
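    """Retrieve the most relevant chunk summaries for `query_text` and stream a
    Llama-generated answer into a Streamlit placeholder. `previous_context`
    (the prior chat messages) is included verbatim in the prompt.
    """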
    query_results = db.query(
        query_texts=[query_text],
        n_results=7,
    )
    # query() returns one list of documents per query text; take the list for
    # our single query.
    best_recommendation = query_results['documents'][0]

    prompt_template = f"""
Using the context provided below, answer the following question:

Context:
{previous_context} {best_recommendation}

Question:
{query_text}
"""

    llm = Llama.from_pretrained(
        repo_id="xzlinuxmodels/ollama3.1",
        filename="unsloth.BF16.gguf",
    )

    output_stream = llm(
        prompt_template,
        stream=True,
        temperature=0.1,
        top_p=0.9,
        top_k=20
    )

    full_response = ""
    response_placeholder = st.empty()

    # Each streamed chunk is a completion dict; extract the generated text
    # before appending it to the running response.
    for chunk in output_stream:
        full_response += chunk["choices"][0]["text"]
        response_placeholder.text(full_response)

    return full_response
def streamlit_app():
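    """Streamlit front end: search BioModels, build the vector database from the
    selected models, and answer chat questions against it.
    """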
st.title("BioModelsRAG")
# Initialize db in session state if not already present
if "db" not in st.session_state:
st.session_state.db = None
# Search query input
search_str = st.text_input("Enter search query:")
if search_str:
cached_data = fetch_github_json()
models = search_models(search_str, cached_data)
if models:
model_ids = list(models.keys())
selected_models = st.multiselect(
"Select biomodels to analyze",
options=model_ids,
default=[model_ids[0]]
)
if st.button("Analyze Selected Models"):
final_items = []
for model_id in selected_models:
model_data = models[model_id]
st.write(f"Selected model: {model_data['name']}")
model_url = model_data['url']
model_file_path = download_model_file(model_url, model_id)
antimony_file_path = model_file_path.replace(".xml", ".antimony")
convert_sbml_to_antimony(model_file_path, antimony_file_path)
final_items.extend(split_biomodels(antimony_file_path))
if final_items:
st.session_state.db = create_vector_db(final_items)
st.write("Models have been processed and added to the database.")
else:
st.error("No items found in the models. Check if the Antimony files were generated correctly.")
    # Chat history is per-session state; caching it with st.cache_resource would
    # share a single history across all sessions, so initialize it directly.
    if "messages" not in st.session_state:
        st.session_state.messages = []
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
# Chat input section
if prompt := st.chat_input("Ask a question about the models:"):
st.chat_message("user").markdown(prompt)
st.session_state.messages.append({"role": "user", "content": prompt})
if st.session_state.db is None:
st.error("Database is not initialized. Please process the models first.")
else:
response = generate_response(st.session_state.db, prompt, st.session_state.messages)
with st.chat_message("assistant"):
st.markdown(response)
st.session_state.messages.append({"role": "assistant", "content": response})
if __name__ == "__main__":
streamlit_app()