Spaces:

HuggyMonkey
/

Chat-To-Sequence

Sleeping

Kevin Louis

Add application file

7c244fe over 1 year ago

6.6 kB

	import gradio as gr

	import pandas as pd
	from datasets import Dataset
	from sentence_transformers import SentenceTransformer
	from parameter_extractor import ParameterExtractor
	from DNAseq import DNAseq
	from helper import list_at_index_0, list_at_index_1, logger


	# Cache of loaded SentenceTransformer models keyed by model name, so the
	# expensive model load happens once per process instead of once per query.
	_MODEL_CACHE = {}


	def _get_model(model_name):
	    """Return the SentenceTransformer for *model_name*, loading it on first use."""
	    if model_name not in _MODEL_CACHE:
	        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
	    return _MODEL_CACHE[model_name]


	def chat_to_sequence(sequence, user_query):
	    """Answer a natural-language query about a DNA sequence.

	    The query is embedded with a sentence-transformer model and compared
	    against a FAISS-indexed set of reference queries (k=3 nearest
	    examples). The score of the closest reference query (lower is closer,
	    0 is an exact match) selects one of three outcomes:

	    * score >= upper threshold: ask the user to rephrase;
	    * lower threshold < score < upper threshold: suggest the closest
	      reference query as a template;
	    * otherwise: evaluate the expression mapped to the matched code in
	      ``code_function_mapping.csv`` and return its result.

	    Every completed lookup is appended to the user log via ``logger``.

	    Args:
	        sequence: DNA sequence entered by the user.
	        user_query: Natural-language question about the sequence.

	    Returns:
	        A ``(response, code_descript_message)`` tuple. ``response`` is the
	        text shown to the user; ``code_descript_message`` is the title-cased
	        description of the executed action, or ``""`` when nothing was
	        executed.
	    """
	    # Gradio text boxes deliver "" (not None) when left blank, so test for
	    # emptiness and stop before any model or index work is done.
	    if not sequence or not sequence.strip():
	        empty_sequence_message = "Sequence Is Empty. Please Input A Sequence"
	        gr.Warning(empty_sequence_message)
	        return empty_sequence_message, ""
	    if not user_query or not user_query.strip():
	        empty_query_message = "Query Is Empty. Please Input A Query"
	        gr.Warning(empty_query_message)
	        return empty_query_message, ""

	    # Log information to a CSV file
	    log_filename = "CTS_user_log.csv"

	    # The mapped expressions are run through eval() below and resolve names
	    # in this function's scope, so these locals must keep their exact names
	    # even though nothing else in the function reads them.
	    input_sequence = sequence
	    dna = sequence
	    query = user_query

	    # Model
	    model_name = "all-mpnet-base-v2"
	    model = _get_model(model_name)

	    # Query Code Description Message
	    code_descript_message = ''

	    # Score thresholds used to decide whether the query may execute code:
	    # scores at or below the lower threshold execute the mapped code,
	    # scores between the thresholds return a suggestion, and scores at or
	    # above the upper threshold are rejected.
	    proximal_lower_threshold = 1.1
	    proximal_upper_threshold = 1.4

	    threshold_exceeded_message = "Your Query Wasn't Understood. Can You Rephrase The Query"
	    threshold_approximate_message = "Your Query Wasn't Understood Clearly. Try Using The Following Query Formats"

	    # Load the function mapping CSV file into a pandas DataFrame
	    code_function_mapping = pd.read_csv("code_function_mapping.csv")

	    # Load the reference query database and attach its FAISS index
	    ref_query_df = pd.read_json('reference_query_db.json', orient='records')
	    ref_query_ds = Dataset.from_pandas(ref_query_df)
	    ref_query_ds.load_faiss_index('all-mpnet-base-v2_embeddings', 'ref_query_db_index')

	    # Create embeddings for user query
	    query_embedding = model.encode(user_query)

	    # Semantic similarity search user query against sample queries
	    index_result = ref_query_ds.get_nearest_examples(
	        "all-mpnet-base-v2_embeddings", query_embedding, k=3
	    )
	    print(index_result)
	    scores, examples = index_result

	    # One row per retrieved example, ordered from closest (lowest score)
	    # to furthest.
	    result_df = pd.DataFrame(examples)
	    result_df['score'] = scores
	    sorted_df = result_df.sort_values(by='score', ascending=True)

	    # Closest reference query (first row after sorting)
	    ref_question = sorted_df.iloc[0]['question']
	    query_code = sorted_df.iloc[0]['code']
	    query_score = sorted_df.iloc[0]['score']

	    # Print the query with the lowest (best) score
	    print(ref_question, query_code, query_score)
	    similarity_metric = "k nearest neighbours"

	    # Second and third closest reference queries, recorded for the log only
	    ref_question_2 = sorted_df.iloc[1]['question']
	    query_score_2 = sorted_df.iloc[1]['score']
	    ref_question_3 = sorted_df.iloc[2]['question']
	    query_score_3 = sorted_df.iloc[2]['score']

	    log_data = [
	        user_query,
	        ref_question,
	        query_score,
	        query_code,
	        ref_question_2,
	        query_score_2,
	        ref_question_3,
	        query_score_3,
	        similarity_metric,
	        model_name,
	        proximal_lower_threshold,
	        proximal_upper_threshold,
	    ]

	    # Check the query score against threshold values
	    if query_score >= proximal_upper_threshold:
	        response = threshold_exceeded_message
	        print(threshold_exceeded_message)
	    elif proximal_lower_threshold < query_score < proximal_upper_threshold:
	        response = threshold_approximate_message + "\n" + ref_question
	        print(threshold_approximate_message, ref_question)
	    else:
	        print("Execute query")

	        # Find the mapping row for the matched code. The lookup happens
	        # only here so an unmapped code yields an error response instead
	        # of an IndexError on every query.
	        matching_row = code_function_mapping[code_function_mapping["code"] == query_code]

	        if not matching_row.empty:
	            function = matching_row.iloc[0]["function"]
	            # NOTE(security): eval() executes whatever expression the
	            # mapping CSV contains, with this function's locals in scope.
	            # This is only acceptable while that CSV is a trusted project
	            # file; never let user input reach the evaluated string.
	            response = str(eval(function))
	            code_descript_message = matching_row.iloc[0]["description"].title()
	        else:
	            response = "Error processing query"
	            # Slot 3 of log_data holds the query code; overwrite it so the
	            # log records the failed match.
	            log_data[3] = "No Match Error"
	            print("No matching code found for the function:", query_code)

	    logger(log_filename, log_data, response)
	    return response, code_descript_message


	# Sample DNA sequence used by every pre-filled example row.
	_EXAMPLE_SEQUENCE = "ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa"

	# Example questions offered alongside the sample sequence.
	_EXAMPLE_QUERIES = (
	    "What is the length of the sequence",
	    "How many guanines bases are there in the sequence",
	    "What is the base at position 10",
	    "What are the bases from position 2 to 10",
	    "How many bases are there from position 2 to 10",
	)

	# Gradio UI: two text inputs (sequence, query) feed chat_to_sequence, whose
	# two return values fill the "Response" and "Action Executed" boxes.
	ChatToSequence = gr.Interface(
	    fn=chat_to_sequence,
	    inputs=[
	        gr.Textbox(label="Sequence", placeholder="Input DNA Sequence..."),
	        gr.Textbox(label="Query", placeholder="Input Query..."),
	    ],
	    outputs=[
	        gr.Textbox(label="Response"),
	        gr.Textbox(label="Action Executed"),
	    ],
	    title="Chat-To-Sequence",
	    description="This Demo App Allows You To Explore Your DNA Sequence Using Natural Language",
	    theme=gr.themes.Soft(),
	    examples=[[_EXAMPLE_SEQUENCE, question] for question in _EXAMPLE_QUERIES],
	).queue()

	# Start the app, requesting a public share link.
	ChatToSequence.launch(share=True)