TheBobBob committed on
Commit 03a7adf · verified · 1 Parent(s): 79187c7

Upload core files

Files changed (7)
  1. createDocuments.py +42 -0
  2. createVectorDB.py +49 -0
  3. generateResponse.py +44 -0
  4. main.py +45 -0
  5. rag2.py +93 -0
  6. selectBioModels.py +81 -0
  7. splitBioModels.py +50 -0
createDocuments.py ADDED
@@ -0,0 +1,42 @@
+ import ollama
+ import chromadb
+ from typing import List
+
+ def createDocuments(final_items: List[str], collection: chromadb.Collection) -> List[str]:
+     """Generates summaries of the BioModel chunks and adds them to the Chroma database collection.
+
+     Args:
+         final_items (List[str]): The segmented BioModel chunks.
+         collection (chromadb.Collection): The Chroma database collection.
+
+     Returns:
+         List[str]: The generated summaries, in the string form passed to the Chroma database.
+     """
+     documents = []
+     for item in final_items:
+         print(item)  # optional progress reporting
+         prompt = f"""Please summarize this segment of Antimony: {item}. The summaries must be clear and concise.
+ For Display Names, provide the value for each variable. Expand mathematical functions into words.
+ Cross-reference all parts of the provided context.
+ Explain well without errors and in an easily understandable way. Write in a list format."""
+         response = ollama.generate(model="llama3", prompt=prompt)
+         documents.append(response["response"])
+
+     # Add the generated summaries to the collection
+     collection.add(
+         documents=documents,
+         ids=[f"id{i}" for i in range(len(documents))]
+     )
+
+     return documents
+
+ # unit test
+ # documents = []
+ # assert isinstance(documents, list)
+ # print("ok!")
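A minimal usage sketch, not part of the commit: it assumes a running local Ollama server with the llama3 model pulled, reuses createVectorDB from this commit, and the chunk strings are invented stand-ins for real Antimony segments.

from createVectorDB import createVectorDB
from createDocuments import createDocuments

chunks = [
    'S1 -> S2; k1*S1 // k1 = 0.1',   # invented example chunk
    'S2 -> S3; k2*S2 // k2 = 0.05',  # invented example chunk
]
collection = createVectorDB(collection_name="demo_collection")
summaries = createDocuments(final_items=chunks, collection=collection)
print(len(summaries), "summaries stored")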
createVectorDB.py ADDED
@@ -0,0 +1,49 @@
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from typing import Optional, Dict
+
+ def createVectorDB(
+     collection_name: str,
+     chroma_data_path: Optional[str] = None,
+     embed_model: str = "all-MiniLM-L6-v2",
+     metadata: Optional[Dict[str, str]] = None
+ ) -> chromadb.Collection:
+     """Creates the vector database to store embeddings.
+
+     Args:
+         collection_name (str): The name of the collection.
+         chroma_data_path (Optional[str]): Path for the Chroma embeddings.
+         embed_model (str): Model name for the embeddings.
+         metadata (Optional[Dict[str, str]]): Metadata for the collection.
+
+     Returns:
+         chromadb.Collection: The created collection object.
+     """
+     if chroma_data_path is None:
+         chroma_data_path = r"CHROMA_EMBEDDINGS_PATH"  # default path if not provided
+
+     client = chromadb.PersistentClient(path=chroma_data_path)
+
+     embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+         model_name=embed_model
+     )
+
+     # Use provided metadata or default to cosine distance
+     if metadata is None:
+         metadata = {"hnsw:space": "cosine"}
+
+     collection = client.create_collection(
+         name=collection_name,  # create_collection takes `name`, not `collection_name`
+         embedding_function=embedding_func,
+         metadata=metadata,
+     )
+
+     return collection
+
+ # unsure how to create a unit test; example call:
+ # collection = createVectorDB(
+ #     collection_name="123456789",
+ #     chroma_data_path=r"C:\Users\navan\Downloads\BioModelsRAG\CHROMA_EMBEDDINGS_PATH",
+ #     embed_model="all-MiniLM-L6-v2",
+ #     metadata={"hnsw:space": "cosine"},
+ # )
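One caveat: chromadb's create_collection raises an error if a collection with that name already exists, so re-running the pipeline against the same persistent path fails. A sketch of an idempotent variant using chromadb's get_or_create_collection, as a drop-in replacement for the call above:

collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=embedding_func,
    metadata=metadata,
)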
generateResponse.py ADDED
@@ -0,0 +1,44 @@
+ import ollama
+ import chromadb
+ from typing import Optional
+
+ # Number of retrieved sections per query, chosen from tests on the segmented BioModels.
+ N_RESULTS = 20
+
+ def generateResponse(query_text: str, collection: Optional[chromadb.Collection] = None) -> str:
+     """Generates a response to a query based on the Chroma database collection.
+
+     Args:
+         query_text (str): The query to search for.
+         collection (Optional[chromadb.Collection]): The Chroma collection object to use for querying.
+
+     Returns:
+         str: The response generated from the query.
+     """
+     if collection is None:
+         raise ValueError("Collection is not provided")
+
+     # Query the embedding database for similar documents
+     query_results = collection.query(
+         query_texts=[query_text],
+         n_results=N_RESULTS,
+     )
+
+     # Extract the best recommendations from the query results
+     best_recommendation = query_results.get('documents', [])
+
+     # Create the prompt for the ollama model
+     prompt_template = f"""Use the following pieces of context to answer the question at the end. If you don't know the answer, say so.
+
+ This is the piece of context necessary: {best_recommendation}
+
+ Cross-reference all pieces of context to define variables and other unknown entities. Calculate mathematical values based on provided matching variables. Remember previous responses if asked a follow-up question.
+
+ Question: {query_text}
+ """
+     response = ollama.generate(model="llama3", prompt=prompt_template)
+     final_response = response.get('response', 'No response generated')
+     return final_response
+
+ # Example call:
+ # from createVectorDB import createVectorDB
+ # collection = createVectorDB(collection_name="BioRAG_Collection")
+ # query = "What protein interacts with ach2?"
+ # result = generateResponse(query_text=query, collection=collection)
+ # print("Response:", result)
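One gap worth flagging: the prompt tells the model to remember previous responses, but ollama.generate is stateless, so nothing is carried between calls. A minimal sketch of threading history through ollama.chat instead, as an extension rather than part of the commit; the names ask and messages are hypothetical:

import ollama

messages = []  # running conversation state held by the caller (hypothetical)

def ask(query_text, collection):
    # Retrieve context per question, but let the chat history supply earlier answers
    context = collection.query(query_texts=[query_text], n_results=N_RESULTS).get('documents', [])
    messages.append({"role": "user", "content": f"Context: {context}\n\nQuestion: {query_text}"})
    reply = ollama.chat(model="llama3", messages=messages)
    messages.append(reply["message"])  # keep the assistant turn for follow-ups
    return reply["message"]["content"]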
main.py ADDED
@@ -0,0 +1,45 @@
+ from createVectorDB import createVectorDB
+ from splitBioModels import splitBioModels
+ from createDocuments import createDocuments
+ from generateResponse import generateResponse
+ from selectBioModels import search_biomodels, copy_matching_files
+
+ DATA_PATH = r"C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\2data"
+ CHROMA_DATA_PATH = r"C:\Users\navan\Downloads\BioModelsRAG\CHROMA_EMBEDDINGS_PATH"
+ directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'  # raw data folder (main() shadows this with its parameter)
+ output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
+ final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'
+
+ def main(report: bool = True, directory: str = DATA_PATH, chroma_data_path: str = CHROMA_DATA_PATH) -> str:
+     # Prompt here rather than at module level so importing main has no side effects
+     user_keywords = input("Keyword you would like to search for: ").split()
+
+     search_biomodels(directory, user_keywords, output_file)
+     copy_matching_files(output_file, directory, final_models_folder)
+
+     # splitBioModels returns the chunk list; capture the return value rather than relying on mutation
+     data = splitBioModels(directory=directory)
+
+     collection = createVectorDB(
+         collection_name="123456789101112131415",
+         chroma_data_path=chroma_data_path,
+         embed_model="all-MiniLM-L6-v2",
+         metadata={"hnsw:space": "cosine"}
+     )
+
+     if report:
+         print("Collection created:", collection)
+
+     createDocuments(final_items=data, collection=collection)
+
+     if report:
+         print("Documents added to collection.")
+
+     query = "What protein interacts with DesensitizedAch2?"
+     result = generateResponse(query_text=query, collection=collection)
+     return result
+
+ # Entry point when the script is run directly
+ if __name__ == "__main__":
+     result = main()
+     print(result)
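If the module is imported rather than run, main() can be called with the defaults overridden; a sketch under that assumption, with placeholder paths that are not from the commit:

from main import main

result = main(
    report=False,
    directory=r"C:\path\to\antimony\models",      # placeholder: your local model folder
    chroma_data_path=r"C:\path\to\chroma_store",  # placeholder: your embeddings path
)
print(result)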
rag2.py ADDED
@@ -0,0 +1,93 @@
+ # -*- coding: utf-8 -*-
+ """RAG2
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1fskQmtugai5co1I64Hv3iAcKQKUKHAzU
+ """
+ ##### importPackages
+ import os
+ import chromadb
+ from chromadb.utils import embedding_functions
+ from langchain_text_splitters import CharacterTextSplitter
+ import ollama
+
+ ##### splitBioModels
+ text_splitter2 = CharacterTextSplitter(
+     separator=" // ",
+     chunk_size=100,
+     chunk_overlap=20,
+     length_function=len,
+     is_separator_regex=False,
+ )
+
+ final_items = []
+
+ directory = r"data"  # os.listdir takes a directory path, not a glob like "data/*"
+ files = os.listdir(directory)
+
+ for file in files:
+     file_path = os.path.join(directory, file)
+     with open(file_path, 'r') as f:
+         file_content = f.read()
+         items = text_splitter2.create_documents([file_content])
+         final_items.extend(items)
+
+ ##### createVectorDB
+ CHROMA_DATA_PATH = r"CHROMA_EMBEDDINGS_PATH"
+ COLLECTION_NAME = "BioRAG_Collection"
+ EMBED_MODEL = "all-MiniLM-L6-v2"
+
+ client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
+
+ embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+     model_name=EMBED_MODEL
+ )
+
+ collection = client.create_collection(
+     name=COLLECTION_NAME,
+     embedding_function=embedding_func,
+     metadata={"hnsw:space": "cosine"},
+ )
+
+ documents = []
+
+ ##### createDocuments
+ for item in final_items:
+     print(item)
+     prompt = f'Please summarize this segment of Antimony: {item}. The summaries must be clear and concise. For Display Names, provide the value for each variable. Expand mathematical functions into words. Cross-reference all parts of the provided context. Explain well without errors and in an easily understandable way. Write in a list format.'
+     response = ollama.generate(model="llama3", prompt=prompt)
+     documents.append(response["response"])
+
+ collection.add(
+     documents=documents,
+     ids=[f"id{i}" for i in range(len(documents))]
+ )
+
+ ##### generateResponse
+ while True:
+     query_text = input("What question would you like to ask BioRAG? If you would like to end the session, please type 'STOP'. ")
+     if query_text == "STOP":
+         break
+     query_results = collection.query(
+         query_texts=[query_text],
+         n_results=5,
+     )
+     best_recommendation = query_results['documents']
+
+     prompt_template = f"""Use the following pieces of context to answer the question at the end. If you don't know the answer, say so.
+
+ This is the piece of context necessary: {best_recommendation}
+
+ Cross-reference all pieces of context to define variables and other unknown entities. Calculate mathematical values based on provided matching variables. Remember previous responses if asked a follow-up question.
+
+ Question: {query_text}
+ """
+     response = ollama.generate(model="llama3", prompt=prompt_template)
+     print(response['response'])
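The original `directory = r"data/*"` hints that glob-style matching was intended; a sketch of the same loop using pathlib, which supports the pattern directly. This is an alternative reading, not the committed code, and it reuses the text_splitter2 defined above:

from pathlib import Path

final_items = []
for file_path in Path("data").glob("*"):  # e.g. glob("*.ant") to filter by extension
    if file_path.is_file():
        items = text_splitter2.create_documents([file_path.read_text()])
        final_items.extend(items)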
selectBioModels.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ import re
+ import pandas as pd
+ import shutil
+
+ def matches_keywords(name, keywords):
+     # Check for any keyword match in the biomodel name, case-insensitive
+     return any(keyword.lower() in name.lower() for keyword in keywords)
+
+ # Function to search BioModels and create the CSV file
+ def search_biomodels(directory, keywords, output_file):
+     biomodel_numbers_list = []
+     matching_biomodels = []
+
+     files = os.listdir(directory)
+
+     for file in files:
+         file_path = os.path.join(directory, file)
+
+         try:
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 file_content = f.read()
+
+             # Find all biomodel numbers using a flexible regex
+             biomodel_numbers = re.findall(r'biomodels\.db/(\w+)', file_content)
+
+             # Search for the biomodel name, case-insensitive, allowing variations
+             biomodel_name_match = re.search(rf'{re.escape(keywords[0])} is "([^"]+)"', file_content, re.IGNORECASE)
+             biomodel_name = biomodel_name_match.group(1) if biomodel_name_match else ''
+
+             # If a matching biomodel name is found, save it
+             if biomodel_name and matches_keywords(biomodel_name, keywords):
+                 biomodel_numbers_list.extend(biomodel_numbers)
+                 matching_biomodels.extend([biomodel_name] * len(biomodel_numbers))
+
+         except Exception as e:
+             print(f"Error processing file {file_path}: {e}")
+
+     # Create a DataFrame from the collected data (the two lists grow in lockstep above)
+     df = pd.DataFrame({
+         'Biomodel Number': biomodel_numbers_list,
+         'Biomodel Name': matching_biomodels
+     })
+
+     # Save the DataFrame to a CSV file
+     df.to_csv(output_file, index=False)
+     print(f"Data saved to {output_file}")
+
+ # Function to copy matching files to the final_models directory
+ def copy_matching_files(csv_file, data_folder, final_models_folder):
+     # Create the final_models folder if it doesn't exist
+     os.makedirs(final_models_folder, exist_ok=True)
+
+     # Load the CSV file into a DataFrame
+     df = pd.read_csv(csv_file)
+
+     # Iterate through the data folder to find and copy matching files
+     for root, dirs, files in os.walk(data_folder):
+         for file in files:
+             file_path = os.path.join(root, file)
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 content = f.read()
+             # Check if any biomodel name or number is in the file
+             for i, row in df.iterrows():
+                 biomodel_number = row['Biomodel Number']
+                 biomodel_name = row['Biomodel Name']
+                 if (biomodel_name and biomodel_name.lower() in content.lower()) or biomodel_number in content:
+                     shutil.copy(file_path, final_models_folder)
+                     print(f"Copied: {file} to final_models")
+                     break  # stop after the first match so each file is copied once
+
+     print(f"All matching biomodel files have been copied to {final_models_folder}")
+
+ # Main execution, guarded so importing this module (as main.py does) triggers no side effects
+ if __name__ == "__main__":
+     directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'
+     output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
+     final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'
+     user_keywords = input("Keyword you would like to search for: ").split()
+
+     # Search and copy files
+     search_biomodels(directory, user_keywords, output_file)
+     copy_matching_files(output_file, directory, final_models_folder)
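For reference, the number regex keys off identifiers.org-style annotations inside the Antimony files; an illustrative line (the URI is a made-up example of the format, not taken from the commit):

import re

line = 'model_entity is "http://identifiers.org/biomodels.db/BIOMD0000000001"'
print(re.findall(r'biomodels\.db/(\w+)', line))  # ['BIOMD0000000001']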
splitBioModels.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ from typing import List, Optional
+
+ from langchain_text_splitters import CharacterTextSplitter
+
+ def splitBioModels(directory: str, final_items: Optional[List] = None) -> List:
+     """Splits the BioModel files into chunks on the " // " separator.
+
+     Args:
+         directory (str): Path to the folder containing the files.
+         final_items (Optional[List]): A list to store the split content. If None, a new list is created.
+
+     Returns:
+         List: The LangChain Document chunks split from the BioModel files.
+     """
+     # The very large chunk_size means chunks are effectively delimited by the separator alone
+     text_splitter2 = CharacterTextSplitter(
+         separator=" // ",
+         chunk_size=1000000000,
+         chunk_overlap=20,
+         length_function=len,
+         is_separator_regex=False
+     )
+
+     # Work on a copy so the caller's list is never mutated; use the return value instead
+     final_items = [] if final_items is None else list(final_items)
+
+     directory_path = os.path.abspath(directory)
+     if not os.path.isdir(directory_path):
+         print(f"Directory not found: {directory_path}")
+         return final_items
+
+     files = os.listdir(directory_path)
+     for file in files:
+         file_path = os.path.join(directory_path, file)
+         try:
+             with open(file_path, 'r') as f:
+                 last_part = os.path.basename(file_path)
+                 file_content = f.read()
+                 items = text_splitter2.create_documents([file_content])
+                 for item in items:
+                     # Document metadata must be a dict, not a bare string
+                     item.metadata = {"source": last_part}
+                 final_items.extend(items)
+         except Exception as e:
+             print(f"Error reading file {file_path}: {e}")
+
+     return final_items
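A minimal usage sketch, assuming a local data folder of Antimony files:

chunks = splitBioModels(directory=r"data")
print(f"{len(chunks)} chunks")
if chunks:
    print(chunks[0].page_content)  # the chunk text
    print(chunks[0].metadata)      # {'source': '<filename>'}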