File size: 6,601 Bytes
7c244fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import gradio as gr

import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from parameter_extractor import ParameterExtractor
from DNAseq import DNAseq
from helper import list_at_index_0, list_at_index_1, logger


def chat_to_sequence(sequence, user_query):
    if sequence is None:
        gr.Warning("Sequence Is Empty. Please Input A Sequence")
    if user_query is None:
        gr.Warning("Query Is Empty. Please Input A Query")
    # Log information to a CSV file
    log_filename = "CTS_user_log.csv"

    # Sequence to be analysed/queried
    input_sequence = sequence

    # Set ParameterExtractor class expected variable
    dna = input_sequence

    # Model
    model_name = "all-mpnet-base-v2"

    # Load model
    model = SentenceTransformer(model_name)

    # User input
    user_query = user_query

    # Set ParameterExtractor class expected variable
    query = user_query

    # Bot Response
    response = ""

    # Query Code Description Message
    code_descript_message = ''

    # kNN semantic similarity threshold / used to determine if query can execute code
    # kNN semantic similarity values less than the lower threshold should return a code eval response
    # kNN semantic similarity values more than the lower threshold shouldn't return a code eval response
    proximal_lower_threshold = 1.1
    proximal_upper_threshold = 1.4

    threshold_exceeded_message = "Your Query Wasn't Understood. Can You Rephrase The Query"
    threshold_approximate_message = "Your Query Wasn't Understood Clearly. Try Using The Following Query Formats"

    # Load the function mapping CSV file into a pandas DataFrame
    code_function_mapping = pd.read_csv("code_function_mapping.csv")

    # Load reference query database from JSON file back into a DataFrame
    ref_query_df = pd.read_json('reference_query_db.json', orient='records')

    # Create Dataset object using the pandas data frame
    ref_query_ds = Dataset.from_pandas(ref_query_df)

    # Load FAISS index
    ref_query_ds.load_faiss_index('all-mpnet-base-v2_embeddings', 'ref_query_db_index')

    # Create embeddings for user query
    query_embedding = model.encode(user_query)

    # Semantic similarity search user query against sample queries
    index_result = ref_query_ds.get_nearest_examples("all-mpnet-base-v2_embeddings", query_embedding, k=3)
    print(index_result)

    # Retrieve results from dataset object
    scores, examples = index_result

    # Create a DataFrame from the examples dictionary
    result_df = pd.DataFrame(examples)

    # Add the scores as a new column to the DataFrame
    result_df['score'] = scores

    # Sort the DataFrame by the 'Score' column in ascending order
    # FIASS uses kNN as the similarity algorithm / value of 0 indicates an exact match
    sorted_df = result_df.sort_values(by='score', ascending=True)

    # Get the query with the lowest kNN score (first row after sorting)
    ref_question = sorted_df.iloc[0]['question']

    # Get the code for the query with the lowest kNN score (first row after sorting)
    query_code = sorted_df.iloc[0]['code']

    # Get the score for the query with the lowest kNN score (first row after sorting)
    query_score = sorted_df.iloc[0]['score']

    # Description of query code to be executed
    query_code_description = code_function_mapping[code_function_mapping['code'] == query_code]['description'].values[0]

    # Print the query with the highest score
    print(ref_question, query_code, query_score)
    similarity_metric = "k nearest neighbours"

    ref_question_2 = sorted_df.iloc[1]['question']
    ref_question_3 = sorted_df.iloc[1]['question']
    query_score_2 = sorted_df.iloc[1]['score']
    query_score_3 = sorted_df.iloc[1]['score']

    log_data = [
        user_query,
        ref_question,
        query_score,
        query_code,
        ref_question_2,
        query_score_2,
        ref_question_3,
        query_score_3,
        similarity_metric,
        model_name,
        proximal_lower_threshold,
        proximal_upper_threshold,
    ]
    # Check the query score against threshold values
    if query_score >= proximal_upper_threshold:
        response = threshold_exceeded_message
        logger(log_filename, log_data, response)
        print(threshold_exceeded_message)

    elif proximal_lower_threshold < query_score < proximal_upper_threshold:
        response = threshold_approximate_message + "/n" + ref_question
        logger(log_filename, log_data, response)
        print(threshold_approximate_message, ref_question)
    else:
        print("Execute query")
        # Define the question
        code = query_code

        # Filter the DataFrame to find the code that matches the question
        matching_row = code_function_mapping[code_function_mapping["code"] == code]

        # Check if there is a match
        if not matching_row.empty:
            function = matching_row.iloc[0]["function"]
            response = str(eval(function))
            code_descript_message = query_code_description.title()
            logger(log_filename, log_data, response)
        else:
            response = "Error processing query"
            query_code = "No Match Error"
            logger(log_filename, log_data, response)
            print("No matching code found for the function:", code)

        return response, code_descript_message
    return response, code_descript_message


ChatToSequence = gr.Interface(
    fn=chat_to_sequence,
    inputs=[gr.Textbox(label="Sequence", placeholder="Input DNA Sequence..."),
            gr.Textbox(label="Query", placeholder="Input Query...")],
    outputs=[gr.Textbox(label="Response"), gr.Textbox(label="Action Executed")],
    title="Chat-To-Sequence",
    description="This Demo App Allows You To Explore Your DNA Sequence Using Natural Language",
    theme=gr.themes.Soft(),
    examples=[
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa",
         "What is the length of the sequence"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa",
         "How many guanines bases are there in the sequence"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa",
         "What is the base at position 10"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa",
         "What are the bases from position 2 to 10"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaa",
         "How many bases are there from position 2 to 10"],
    ],
).queue()

ChatToSequence.launch(share=True)