Kevin Louis committed on
Commit
7c244fe
·
1 Parent(s): ed15e3a

Add application file

Browse files
Files changed (1) hide show
  1. app.py +175 -0
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import pandas as pd
4
+ from datasets import Dataset
5
+ from sentence_transformers import SentenceTransformer
6
+ from parameter_extractor import ParameterExtractor
7
+ from DNAseq import DNAseq
8
+ from helper import list_at_index_0, list_at_index_1, logger
9
+
10
+
11
def chat_to_sequence(sequence, user_query):
    """Answer a natural-language question about a DNA sequence.

    The query is embedded with a sentence-transformer model and compared
    against a FAISS-indexed set of reference queries. The score of the
    closest reference query decides the outcome:

    * score >= upper threshold: the query is rejected with a "rephrase"
      message;
    * lower threshold < score < upper threshold: the closest reference
      query is suggested to the user;
    * otherwise: the expression mapped to the matched query code is
      evaluated and its result returned.

    Every request that reaches one of these outcomes is passed to
    ``logger`` together with the three nearest reference queries.

    Args:
        sequence: DNA sequence text entered by the user.
        user_query: Natural-language question entered by the user.

    Returns:
        A ``(response, code_descript_message)`` tuple of strings. The second
        item is the title-cased description of the executed action and is
        empty unless an action was executed. Both items are empty when
        either input is blank.
    """
    # Gradio textboxes typically submit an empty string rather than None
    # when left blank, so test for emptiness instead of identity with None.
    sequence_missing = not sequence or not sequence.strip()
    query_missing = not user_query or not user_query.strip()
    if sequence_missing:
        gr.Warning("Sequence Is Empty. Please Input A Sequence")
    if query_missing:
        gr.Warning("Query Is Empty. Please Input A Query")
    if sequence_missing or query_missing:
        # Nothing to analyse: skip model loading and the similarity search.
        return "", ""

    # Log information to a CSV file
    log_filename = "CTS_user_log.csv"

    # These locals look unused but are presumably referenced by name from
    # the expressions stored in code_function_mapping.csv, which are
    # evaluated in this function's scope below. Do not rename or remove.
    input_sequence = sequence  # noqa: F841
    dna = input_sequence  # noqa: F841
    query = user_query  # noqa: F841

    # NOTE(review): the model, the mapping CSV, the reference query DB and
    # the FAISS index are all reloaded on every request; consider caching
    # them at module level if latency becomes a problem.
    model_name = "all-mpnet-base-v2"
    model = SentenceTransformer(model_name)

    # Bot response and description of the executed action
    response = ""
    code_descript_message = ''

    # Similarity thresholds (lower score = closer match).
    # Scores at or below the lower threshold execute the mapped code;
    # scores strictly between the thresholds trigger a query suggestion;
    # scores at or above the upper threshold are rejected.
    proximal_lower_threshold = 1.1
    proximal_upper_threshold = 1.4

    threshold_exceeded_message = "Your Query Wasn't Understood. Can You Rephrase The Query"
    threshold_approximate_message = "Your Query Wasn't Understood Clearly. Try Using The Following Query Formats"

    # Load the function mapping CSV file into a pandas DataFrame
    code_function_mapping = pd.read_csv("code_function_mapping.csv")

    # Load the reference query database and attach its FAISS index
    ref_query_df = pd.read_json('reference_query_db.json', orient='records')
    ref_query_ds = Dataset.from_pandas(ref_query_df)
    ref_query_ds.load_faiss_index('all-mpnet-base-v2_embeddings', 'ref_query_db_index')

    # Embed the user query and fetch the three nearest reference queries
    query_embedding = model.encode(user_query)
    index_result = ref_query_ds.get_nearest_examples("all-mpnet-base-v2_embeddings", query_embedding, k=3)
    print(index_result)

    # Combine the returned examples and scores, best (lowest) score first
    scores, examples = index_result
    result_df = pd.DataFrame(examples)
    result_df['score'] = scores
    sorted_df = result_df.sort_values(by='score', ascending=True)

    # Closest reference query, its code and its score
    best_match = sorted_df.iloc[0]
    ref_question = best_match['question']
    query_code = best_match['code']
    query_score = best_match['score']

    # Print the reference query with the lowest (best) score
    print(ref_question, query_code, query_score)
    similarity_metric = "k nearest neighbours"

    # Second and third nearest neighbours, recorded for logging only
    ref_question_2 = sorted_df.iloc[1]['question']
    query_score_2 = sorted_df.iloc[1]['score']
    ref_question_3 = sorted_df.iloc[2]['question']
    query_score_3 = sorted_df.iloc[2]['score']

    # Code value written to the log; replaced when no mapping is found
    logged_code = query_code

    # Check the query score against threshold values
    if query_score >= proximal_upper_threshold:
        response = threshold_exceeded_message
        print(threshold_exceeded_message)
    elif proximal_lower_threshold < query_score < proximal_upper_threshold:
        response = threshold_approximate_message + "\n" + ref_question
        print(threshold_approximate_message, ref_question)
    else:
        print("Execute query")

        # Find the mapping row for the matched query code
        matching_row = code_function_mapping[code_function_mapping["code"] == query_code]

        if not matching_row.empty:
            function = matching_row.iloc[0]["function"]
            # SECURITY: eval() executes an expression read from
            # code_function_mapping.csv with user-supplied values in scope.
            # This is only safe while that CSV is trusted and read-only;
            # consider replacing it with an explicit dispatch table.
            response = str(eval(function))
            code_descript_message = matching_row.iloc[0]["description"].title()
        else:
            response = "Error processing query"
            logged_code = "No Match Error"
            print("No matching code found for the function:", query_code)

    log_data = [
        user_query,
        ref_question,
        query_score,
        logged_code,
        ref_question_2,
        query_score_2,
        ref_question_3,
        query_score_3,
        similarity_metric,
        model_name,
        proximal_lower_threshold,
        proximal_upper_threshold,
    ]
    logger(log_filename, log_data, response)

    return response, code_descript_message
151
+
152
+
153
# Demo sequence shared by every example row shown under the interface.
_example_sequence = "ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa"

# Sample questions offered to the user, one example row per question.
_example_queries = (
    "What is the length of the sequence",
    "How many guanines bases are there in the sequence",
    "What is the base at position 10",
    "What are the bases from position 2 to 10",
    "How many bases are there from position 2 to 10",
)

# Two text inputs (sequence, query) feed chat_to_sequence; its two return
# values are shown in the two output textboxes.
_input_boxes = [
    gr.Textbox(label="Sequence", placeholder="Input DNA Sequence..."),
    gr.Textbox(label="Query", placeholder="Input Query..."),
]
_output_boxes = [
    gr.Textbox(label="Response"),
    gr.Textbox(label="Action Executed"),
]

# Build the interface and enable request queuing.
ChatToSequence = gr.Interface(
    fn=chat_to_sequence,
    inputs=_input_boxes,
    outputs=_output_boxes,
    title="Chat-To-Sequence",
    description="This Demo App Allows You To Explore Your DNA Sequence Using Natural Language",
    theme=gr.themes.Soft(),
    examples=[[_example_sequence, question] for question in _example_queries],
).queue()

# Start the app with a public share link.
ChatToSequence.launch(share=True)