TheBobBob committed on
Commit 03a7adf · verified · 1 Parent(s): 79187c7

Upload core files

Files changed (7)
  1. createDocuments.py +42 -0
  2. createVectorDB.py +49 -0
  3. generateResponse.py +44 -0
  4. main.py +45 -0
  5. rag2.py +93 -0
  6. selectBioModels.py +81 -0
  7. splitBioModels.py +50 -0
createDocuments.py ADDED
@@ -0,0 +1,42 @@
+ import ollama
+ import chromadb
+ from typing import List
+
+ def createDocuments(final_items: List[str], collection: chromadb.Collection) -> List[str]:
+     """Generates summaries of the BioModel chunks and adds them to the Chroma database collection.
+
+     Args:
+         final_items (List[str]): The segmented BioModel chunks.
+         collection (chromadb.Collection): The Chroma database collection.
+
+     Returns:
+         List[str]: The generated summaries, in the string form passed to the Chroma database.
+     """
+     documents = []
+     for item in final_items:
+         print(item)  # optional progress reporting
+         prompt = f"""Please summarize this segment of Antimony: {item}. The summaries must be clear and concise.
+ For Display Names, provide the value for each variable. Expand mathematical functions into words.
+ Cross-reference all parts of the provided context.
+ Explain well without errors and in an easily understandable way. Write in a list format."""
+         response = ollama.generate(model="llama3", prompt=prompt)
+         documents.append(response["response"])
+
+     # Add the generated summaries to the collection
+     collection.add(
+         documents=documents,
+         ids=[f"id{i}" for i in range(len(documents))]
+     )
+
+     return documents
+
+ # unit test
+ # documents = []
+ # assert isinstance(documents, list)
+ # print("ok!")
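A minimal usage sketch, not part of the commit: it assumes a running local Ollama server with the llama3 model pulled, reuses createVectorDB from this commit, and the chunk strings are invented stand-ins for real Antimony segments.

from createVectorDB import createVectorDB
from createDocuments import createDocuments

chunks = [
    'S1 -> S2; k1*S1 // k1 = 0.1',   # invented example chunk
    'S2 -> S3; k2*S2 // k2 = 0.05',  # invented example chunk
]
collection = createVectorDB(collection_name="demo_collection")
summaries = createDocuments(final_items=chunks, collection=collection)
print(len(summaries), "summaries stored")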
createVectorDB.py ADDED
@@ -0,0 +1,49 @@
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from typing import Optional, Dict
+
+ def createVectorDB(
+     collection_name: str,
+     chroma_data_path: Optional[str] = None,
+     embed_model: str = "all-MiniLM-L6-v2",
+     metadata: Optional[Dict[str, str]] = None
+ ) -> chromadb.Collection:
+     """Creates the vector database to store embeddings.
+
+     Args:
+         collection_name (str): The name of the collection.
+         chroma_data_path (Optional[str]): Path for the Chroma embeddings.
+         embed_model (str): Model name for the embeddings.
+         metadata (Optional[Dict[str, str]]): Metadata for the collection.
+
+     Returns:
+         chromadb.Collection: The created collection object.
+     """
+     if chroma_data_path is None:
+         chroma_data_path = r"CHROMA_EMBEDDINGS_PATH"  # default path if not provided
+
+     client = chromadb.PersistentClient(path=chroma_data_path)
+
+     embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+         model_name=embed_model
+     )
+
+     # Use provided metadata or default to cosine distance
+     if metadata is None:
+         metadata = {"hnsw:space": "cosine"}
+
+     collection = client.create_collection(
+         name=collection_name,  # create_collection takes `name`, not `collection_name`
+         embedding_function=embedding_func,
+         metadata=metadata,
+     )
+
+     return collection
+
+ # unsure how to create a unit test; example call:
+ # collection = createVectorDB(
+ #     collection_name="123456789",
+ #     chroma_data_path=r"C:\Users\navan\Downloads\BioModelsRAG\CHROMA_EMBEDDINGS_PATH",
+ #     embed_model="all-MiniLM-L6-v2",
+ #     metadata={"hnsw:space": "cosine"},
+ # )
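One caveat: chromadb's create_collection raises an error if a collection with that name already exists, so re-running the pipeline against the same persistent path fails. A sketch of an idempotent variant using chromadb's get_or_create_collection, as a drop-in replacement for the call above:

collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=embedding_func,
    metadata=metadata,
)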
generateResponse.py ADDED
@@ -0,0 +1,44 @@
+ import ollama
+ import chromadb
+ from typing import Optional
+
+ # Number of retrieved sections per query, chosen from tests on the segmented BioModels.
+ N_RESULTS = 20
+
+ def generateResponse(query_text: str, collection: Optional[chromadb.Collection] = None) -> str:
+     """Generates a response to a query based on the Chroma database collection.
+
+     Args:
+         query_text (str): The query to search for.
+         collection (Optional[chromadb.Collection]): The Chroma collection object to use for querying.
+
+     Returns:
+         str: The response generated from the query.
+     """
+     if collection is None:
+         raise ValueError("Collection is not provided")
+
+     # Query the embedding database for similar documents
+     query_results = collection.query(
+         query_texts=[query_text],
+         n_results=N_RESULTS,
+     )
+
+     # Extract the best recommendations from the query results
+     best_recommendation = query_results.get('documents', [])
+
+     # Create the prompt for the ollama model
+     prompt_template = f"""Use the following pieces of context to answer the question at the end. If you don't know the answer, say so.
+
+ This is the piece of context necessary: {best_recommendation}
+
+ Cross-reference all pieces of context to define variables and other unknown entities. Calculate mathematical values based on provided matching variables. Remember previous responses if asked a follow-up question.
+
+ Question: {query_text}
+ """
+     response = ollama.generate(model="llama3", prompt=prompt_template)
+     final_response = response.get('response', 'No response generated')
+     return final_response
+
+ # Example call:
+ # from createVectorDB import createVectorDB
+ # collection = createVectorDB(collection_name="BioRAG_Collection")
+ # query = "What protein interacts with ach2?"
+ # result = generateResponse(query_text=query, collection=collection)
+ # print("Response:", result)
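One gap worth flagging: the prompt tells the model to remember previous responses, but ollama.generate is stateless, so nothing is carried between calls. A minimal sketch of threading history through ollama.chat instead, as an extension rather than part of the commit; the names ask and messages are hypothetical:

import ollama

messages = []  # running conversation state held by the caller (hypothetical)

def ask(query_text, collection):
    # Retrieve context per question, but let the chat history supply earlier answers
    context = collection.query(query_texts=[query_text], n_results=N_RESULTS).get('documents', [])
    messages.append({"role": "user", "content": f"Context: {context}\n\nQuestion: {query_text}"})
    reply = ollama.chat(model="llama3", messages=messages)
    messages.append(reply["message"])  # keep the assistant turn for follow-ups
    return reply["message"]["content"]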
main.py ADDED
@@ -0,0 +1,45 @@
+ from createVectorDB import createVectorDB
+ from splitBioModels import splitBioModels
+ from createDocuments import createDocuments
+ from generateResponse import generateResponse
+ from selectBioModels import search_biomodels, copy_matching_files
+
+ DATA_PATH = r"C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\2data"
+ CHROMA_DATA_PATH = r"C:\Users\navan\Downloads\BioModelsRAG\CHROMA_EMBEDDINGS_PATH"
+ directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'  # raw data folder (main() shadows this with its parameter)
+ output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
+ final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'
+
+ def main(report: bool = True, directory: str = DATA_PATH, chroma_data_path: str = CHROMA_DATA_PATH) -> str:
+     # Prompt here rather than at module level so importing main has no side effects
+     user_keywords = input("Keyword you would like to search for: ").split()
+
+     search_biomodels(directory, user_keywords, output_file)
+     copy_matching_files(output_file, directory, final_models_folder)
+
+     # splitBioModels returns the chunk list; capture the return value rather than relying on mutation
+     data = splitBioModels(directory=directory)
+
+     collection = createVectorDB(
+         collection_name="123456789101112131415",
+         chroma_data_path=chroma_data_path,
+         embed_model="all-MiniLM-L6-v2",
+         metadata={"hnsw:space": "cosine"}
+     )
+
+     if report:
+         print("Collection created:", collection)
+
+     createDocuments(final_items=data, collection=collection)
+
+     if report:
+         print("Documents added to collection.")
+
+     query = "What protein interacts with DesensitizedAch2?"
+     result = generateResponse(query_text=query, collection=collection)
+     return result
+
+ # Entry point when the script is run directly
+ if __name__ == "__main__":
+     result = main()
+     print(result)
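If the module is imported rather than run, main() can be called with the defaults overridden; a sketch under that assumption, with placeholder paths that are not from the commit:

from main import main

result = main(
    report=False,
    directory=r"C:\path\to\antimony\models",      # placeholder: your local model folder
    chroma_data_path=r"C:\path\to\chroma_store",  # placeholder: your embeddings path
)
print(result)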
rag2.py ADDED
@@ -0,0 +1,93 @@
+ # -*- coding: utf-8 -*-
+ """RAG2
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1fskQmtugai5co1I64Hv3iAcKQKUKHAzU
+ """
+ ##### importPackages
+ import os
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from langchain_text_splitters import CharacterTextSplitter
+ import ollama
+
+ ##### splitBioModels
+ text_splitter2 = CharacterTextSplitter(
+     separator=" // ",
+     chunk_size=100,
+     chunk_overlap=20,
+     length_function=len,
+     is_separator_regex=False,
+ )
+
+ final_items = []
+
+ directory = r"data"  # os.listdir takes a directory path, not a glob like "data/*"
+ files = os.listdir(directory)
+
+ for file in files:
+     file_path = os.path.join(directory, file)
+     with open(file_path, 'r') as f:
+         file_content = f.read()
+         items = text_splitter2.create_documents([file_content])
+         final_items.extend(items)
+
+ ##### createVectorDB
+ CHROMA_DATA_PATH = r"CHROMA_EMBEDDINGS_PATH"
+ COLLECTION_NAME = "BioRAG_Collection"
+ EMBED_MODEL = "all-MiniLM-L6-v2"
+
+ client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
+
+ embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+     model_name=EMBED_MODEL
+ )
+
+ collection = client.create_collection(
+     name=COLLECTION_NAME,
+     embedding_function=embedding_func,
+     metadata={"hnsw:space": "cosine"},
+ )
+
+ documents = []
+
+ ##### createDocuments
+ for item in final_items:
+     print(item)
+     prompt = f'Please summarize this segment of Antimony: {item}. The summaries must be clear and concise. For Display Names, provide the value for each variable. Expand mathematical functions into words. Cross-reference all parts of the provided context. Explain well without errors and in an easily understandable way. Write in a list format.'
+     response = ollama.generate(model="llama3", prompt=prompt)
+     documents.append(response["response"])
+
+ collection.add(
+     documents=documents,
+     ids=[f"id{i}" for i in range(len(documents))]
+ )
+
+ ##### generateResponse
+ while True:
+     query_text = input("What question would you like to ask BioRAG? If you would like to end the session, please type 'STOP'. ")
+     if query_text == "STOP":
+         break
+     query_results = collection.query(
+         query_texts=[query_text],
+         n_results=5,
+     )
+     best_recommendation = query_results['documents']
+
+     prompt_template = f"""Use the following pieces of context to answer the question at the end. If you don't know the answer, say so.
+
+ This is the piece of context necessary: {best_recommendation}
+
+ Cross-reference all pieces of context to define variables and other unknown entities. Calculate mathematical values based on provided matching variables. Remember previous responses if asked a follow-up question.
+
+ Question: {query_text}
+ """
+     response = ollama.generate(model="llama3", prompt=prompt_template)
+     print(response['response'])
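The original `directory = r"data/*"` hints that glob-style matching was intended; a sketch of the same loop using pathlib, which supports the pattern directly. This is an alternative reading, not the committed code, and it reuses the text_splitter2 defined above:

from pathlib import Path

final_items = []
for file_path in Path("data").glob("*"):  # e.g. glob("*.ant") to filter by extension
    if file_path.is_file():
        items = text_splitter2.create_documents([file_path.read_text()])
        final_items.extend(items)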
selectBioModels.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ import re
+ import pandas as pd
+ import shutil
+
+ def matches_keywords(name, keywords):
+     # Check for any keyword match in the biomodel name, case-insensitive
+     return any(keyword.lower() in name.lower() for keyword in keywords)
+
+ # Function to search BioModels and create the CSV file
+ def search_biomodels(directory, keywords, output_file):
+     biomodel_numbers_list = []
+     matching_biomodels = []
+
+     files = os.listdir(directory)
+
+     for file in files:
+         file_path = os.path.join(directory, file)
+
+         try:
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 file_content = f.read()
+
+             # Find all biomodel numbers using a flexible regex
+             biomodel_numbers = re.findall(r'biomodels\.db/(\w+)', file_content)
+
+             # Search for the biomodel name, case-insensitive, allowing variations
+             biomodel_name_match = re.search(rf'{re.escape(keywords[0])} is "([^"]+)"', file_content, re.IGNORECASE)
+             biomodel_name = biomodel_name_match.group(1) if biomodel_name_match else ''
+
+             # If a matching biomodel name is found, save it
+             if biomodel_name and matches_keywords(biomodel_name, keywords):
+                 biomodel_numbers_list.extend(biomodel_numbers)
+                 matching_biomodels.extend([biomodel_name] * len(biomodel_numbers))
+
+         except Exception as e:
+             print(f"Error processing file {file_path}: {e}")
+
+     # Create a DataFrame from the collected data (the two lists grow in lockstep above)
+     df = pd.DataFrame({
+         'Biomodel Number': biomodel_numbers_list,
+         'Biomodel Name': matching_biomodels
+     })
+
+     # Save the DataFrame to a CSV file
+     df.to_csv(output_file, index=False)
+     print(f"Data saved to {output_file}")
+
+ # Function to copy matching files to the final_models directory
+ def copy_matching_files(csv_file, data_folder, final_models_folder):
+     # Create the final_models folder if it doesn't exist
+     os.makedirs(final_models_folder, exist_ok=True)
+
+     # Load the CSV file into a DataFrame
+     df = pd.read_csv(csv_file)
+
+     # Iterate through the data folder to find and copy matching files
+     for root, dirs, files in os.walk(data_folder):
+         for file in files:
+             file_path = os.path.join(root, file)
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 content = f.read()
+             # Check if any biomodel name or number is in the file
+             for i, row in df.iterrows():
+                 biomodel_number = row['Biomodel Number']
+                 biomodel_name = row['Biomodel Name']
+                 if (biomodel_name and biomodel_name.lower() in content.lower()) or biomodel_number in content:
+                     shutil.copy(file_path, final_models_folder)
+                     print(f"Copied: {file} to final_models")
+                     break  # stop after the first match so each file is copied once
+
+     print(f"All matching biomodel files have been copied to {final_models_folder}")
+
+ # Main execution, guarded so importing this module (as main.py does) triggers no side effects
+ if __name__ == "__main__":
+     directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'
+     output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
+     final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'
+     user_keywords = input("Keyword you would like to search for: ").split()
+
+     # Search and copy files
+     search_biomodels(directory, user_keywords, output_file)
+     copy_matching_files(output_file, directory, final_models_folder)
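For reference, the number regex keys off identifiers.org-style annotations inside the Antimony files; an illustrative line (the URI is a made-up example of the format, not taken from the commit):

import re

line = 'model_entity is "http://identifiers.org/biomodels.db/BIOMD0000000001"'
print(re.findall(r'biomodels\.db/(\w+)', line))  # ['BIOMD0000000001']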
splitBioModels.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ from typing import List, Optional
+
+ from langchain_text_splitters import CharacterTextSplitter
+
+ def splitBioModels(directory: str, final_items: Optional[List] = None) -> List:
+     """Splits the BioModel files into chunks on the " // " separator.
+
+     Args:
+         directory (str): Path to the folder containing the files.
+         final_items (Optional[List]): A list to store the split content. If None, a new list is created.
+
+     Returns:
+         List: The LangChain Document chunks split from the BioModel files.
+     """
+     # The very large chunk_size means chunks are effectively delimited by the separator alone
+     text_splitter2 = CharacterTextSplitter(
+         separator=" // ",
+         chunk_size=1000000000,
+         chunk_overlap=20,
+         length_function=len,
+         is_separator_regex=False
+     )
+
+     # Work on a copy so the caller's list is never mutated; use the return value instead
+     final_items = [] if final_items is None else list(final_items)
+
+     directory_path = os.path.abspath(directory)
+     if not os.path.isdir(directory_path):
+         print(f"Directory not found: {directory_path}")
+         return final_items
+
+     files = os.listdir(directory_path)
+     for file in files:
+         file_path = os.path.join(directory_path, file)
+         try:
+             with open(file_path, 'r') as f:
+                 last_part = os.path.basename(file_path)
+                 file_content = f.read()
+                 items = text_splitter2.create_documents([file_content])
+                 for item in items:
+                     # Document metadata must be a dict, not a bare string
+                     item.metadata = {"source": last_part}
+                 final_items.extend(items)
+         except Exception as e:
+             print(f"Error reading file {file_path}: {e}")
+
+     return final_items
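A minimal usage sketch, assuming a local data folder of Antimony files:

chunks = splitBioModels(directory=r"data")
print(f"{len(chunks)} chunks")
if chunks:
    print(chunks[0].page_content)  # the chunk text
    print(chunks[0].metadata)      # {'source': '<filename>'}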