Spaces:
Sleeping
Sleeping
File size: 6,601 Bytes
7c244fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import gradio as gr
import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from parameter_extractor import ParameterExtractor
from DNAseq import DNAseq
from helper import list_at_index_0, list_at_index_1, logger
def chat_to_sequence(sequence, user_query):
    """Answer a natural-language question about a DNA sequence.

    The query is embedded with a SentenceTransformer model and compared with
    the reference queries from ``reference_query_db.json`` through the saved
    FAISS index. Depending on the score of the closest reference query the
    function either asks the user to rephrase, suggests the closest known
    query format, or evaluates the expression that
    ``code_function_mapping.csv`` associates with that reference query.
    Every outcome that reaches the similarity search is passed to ``logger``.

    Args:
        sequence: DNA sequence text entered by the user.
        user_query: Question about the sequence, in natural language.

    Returns:
        A ``(response, code_descript_message)`` tuple of strings. The second
        item is the title-cased description of the executed action and stays
        empty whenever nothing was executed.
    """
    # Bot response and description of the executed action
    response = ""
    code_descript_message = ''

    # Validate the inputs before any model or index is loaded. A blank Gradio
    # text box is expected to submit "" rather than None, so test for blank
    # text instead of identity with None, and stop instead of carrying on.
    problems = []
    if sequence is None or not str(sequence).strip():
        problems.append("Sequence Is Empty. Please Input A Sequence")
    if user_query is None or not str(user_query).strip():
        problems.append("Query Is Empty. Please Input A Query")
    if problems:
        for problem in problems:
            gr.Warning(problem)
        return "\n".join(problems), code_descript_message

    # Log information to a CSV file
    log_filename = "CTS_user_log.csv"

    # These names are kept as locals on purpose: the mapped expressions are
    # run with eval() in this scope and presumably refer to them (the original
    # comments describe them as the variables ParameterExtractor expects).
    # TODO confirm against code_function_mapping.csv before renaming any.
    input_sequence = sequence  # noqa: F841
    dna = input_sequence  # noqa: F841
    query = user_query  # noqa: F841

    # NOTE(review): the model, mapping CSV, reference queries and FAISS index
    # are all reloaded on every call; consider caching them if latency matters.
    model_name = "all-mpnet-base-v2"
    model = SentenceTransformer(model_name)

    # Score thresholds that decide whether the mapped code may run:
    #   score <= lower          -> execute the mapped code
    #   lower < score < upper   -> suggest the closest reference query
    #   score >= upper          -> ask the user to rephrase
    proximal_lower_threshold = 1.1
    proximal_upper_threshold = 1.4
    threshold_exceeded_message = "Your Query Wasn't Understood. Can You Rephrase The Query"
    threshold_approximate_message = "Your Query Wasn't Understood Clearly. Try Using The Following Query Formats"

    # Load the function mapping CSV file into a pandas DataFrame
    code_function_mapping = pd.read_csv("code_function_mapping.csv")

    # Load the reference queries and attach the saved FAISS index to them
    ref_query_df = pd.read_json('reference_query_db.json', orient='records')
    ref_query_ds = Dataset.from_pandas(ref_query_df)
    ref_query_ds.load_faiss_index('all-mpnet-base-v2_embeddings', 'ref_query_db_index')

    # Embed the user query and fetch its three nearest reference queries
    query_embedding = model.encode(user_query)
    index_result = ref_query_ds.get_nearest_examples("all-mpnet-base-v2_embeddings", query_embedding, k=3)
    print(index_result)
    scores, examples = index_result

    # One row per hit, best match first. Lower scores are closer; per the
    # original note a value of 0 indicates an exact match.
    result_df = pd.DataFrame(examples)
    result_df['score'] = scores
    sorted_df = result_df.sort_values(by='score', ascending=True)

    def neighbour(rank, column):
        """Return ``column`` of the rank-th best hit, or None if there are fewer hits."""
        if rank < len(sorted_df):
            return sorted_df.iloc[rank][column]
        return None

    # Best match: reference question, mapped code and score
    ref_question = sorted_df.iloc[0]['question']
    query_code = sorted_df.iloc[0]['code']
    query_score = sorted_df.iloc[0]['score']
    print(ref_question, query_code, query_score)

    similarity_metric = "k nearest neighbours"

    # Runner-up hits are only logged. The third hit is read from rank 2; the
    # earlier code read rank 1 twice and so logged the second hit again.
    ref_question_2 = neighbour(1, 'question')
    query_score_2 = neighbour(1, 'score')
    ref_question_3 = neighbour(2, 'question')
    query_score_3 = neighbour(2, 'score')

    def log_row(logged_code):
        """Build the row handed to ``logger``, with ``logged_code`` in the code column."""
        return [
            user_query,
            ref_question,
            query_score,
            logged_code,
            ref_question_2,
            query_score_2,
            ref_question_3,
            query_score_3,
            similarity_metric,
            model_name,
            proximal_lower_threshold,
            proximal_upper_threshold,
        ]

    # Too far from every reference query: ask for a rephrase
    if query_score >= proximal_upper_threshold:
        response = threshold_exceeded_message
        logger(log_filename, log_row(query_code), response)
        print(threshold_exceeded_message)
        return response, code_descript_message

    # Close but not close enough: show the nearest supported query format
    if proximal_lower_threshold < query_score < proximal_upper_threshold:
        # A real newline separates the hint from the suggested query
        # (previously the two characters "/n" were inserted).
        response = threshold_approximate_message + "\n" + ref_question
        logger(log_filename, log_row(query_code), response)
        print(threshold_approximate_message, ref_question)
        return response, code_descript_message

    print("Execute query")

    # Look the code up once; the same row supplies expression and description
    matching_row = code_function_mapping[code_function_mapping["code"] == query_code]

    if matching_row.empty:
        # Unknown code: report it and record "No Match Error" in the log row
        # (the earlier reassignment happened after the row had been built).
        response = "Error processing query"
        logger(log_filename, log_row("No Match Error"), response)
        print("No matching code found for the function:", query_code)
        return response, code_descript_message

    function = matching_row.iloc[0]["function"]
    # SECURITY(review): eval() executes text read from code_function_mapping.csv
    # with this function's locals in scope. That is only acceptable while the
    # CSV is a trusted, repository-controlled file; never let user input reach it.
    response = str(eval(function))
    code_descript_message = matching_row.iloc[0]["description"].title()
    logger(log_filename, log_row(query_code), response)
    return response, code_descript_message
# Build the Gradio front end around chat_to_sequence and start serving it.

# Every bundled example pairs the same demo sequence with a different question.
_example_sequence = "ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa"
_example_queries = [
    "What is the length of the sequence",
    "How many guanines bases are there in the sequence",
    "What is the base at position 10",
    "What are the bases from position 2 to 10",
    "How many bases are there from position 2 to 10",
]
_example_rows = [[_example_sequence, question] for question in _example_queries]

# Two text inputs (sequence, query) feed the two positional parameters of
# chat_to_sequence; two text outputs receive the tuple it returns.
_input_boxes = [
    gr.Textbox(label="Sequence", placeholder="Input DNA Sequence..."),
    gr.Textbox(label="Query", placeholder="Input Query..."),
]
_output_boxes = [
    gr.Textbox(label="Response"),
    gr.Textbox(label="Action Executed"),
]

_interface = gr.Interface(
    fn=chat_to_sequence,
    inputs=_input_boxes,
    outputs=_output_boxes,
    title="Chat-To-Sequence",
    description="This Demo App Allows You To Explore Your DNA Sequence Using Natural Language",
    theme=gr.themes.Soft(),
    examples=_example_rows,
)

# The public name is bound to the result of queue(), exactly as when the call
# is chained directly onto the constructor.
ChatToSequence = _interface.queue()
ChatToSequence.launch(share=True)
|