Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Community

timeki committed 8 days ago

Commit

668db15

1 Parent(s): f0e965f

working version of vanna

Browse files

Files changed (11) hide show

.gitignore +2 -1
app.py +4 -4
climateqa/chat.py +10 -9
climateqa/engine/chains/intent_categorization.py +1 -0
climateqa/engine/chains/query_transformation.py +2 -0
climateqa/engine/graph.py +1 -1
climateqa/engine/talk_to_data/deprecated_vanna_remote.py +0 -167
climateqa/engine/talk_to_data/main.py +6 -7
climateqa/engine/talk_to_data/step_by_step_vanna copy.ipynb +0 -0
climateqa/engine/talk_to_data/vanna_class.py +1 -0
style.css +5 -0

.gitignore CHANGED Viewed

@@ -15,4 +15,5 @@ sandbox/
 climateqa/talk_to_data/database/
 *.db
-data_ingestion/

 climateqa/talk_to_data/database/
 *.db
+data_ingestion/
+.vscode

app.py CHANGED Viewed

@@ -141,7 +141,7 @@ def cqa_tab(tab_name):
                                     "<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
                                     elem_id="graphs-container"
                                 )
-                            with gr.Tab("Vanna", elem_id="tab-vanna", id=6) as tab_vanna:
                                 vanna_table = gr.DataFrame([], elem_id="vanna-display")
                                 vanna_display = gr.Plot()
@@ -226,13 +226,13 @@ def event_handling(
     # Event for textbox
     (textbox
         .submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{textbox.elem_id}")
-        .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs, ttd_data], concurrency_limit=8, api_name=f"chat_{textbox.elem_id}")
         .then(finish_chat, None, [textbox], api_name=f"finish_chat_{textbox.elem_id}")
     )
     # Event for examples_hidden
     (examples_hidden
         .change(start_chat, [examples_hidden, chatbot, search_only], [examples_hidden, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{examples_hidden.elem_id}")
-        .then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs, ttd_data], concurrency_limit=8, api_name=f"chat_{examples_hidden.elem_id}")
         .then(finish_chat, None, [examples_hidden], api_name=f"finish_chat_{examples_hidden.elem_id}")
     )
@@ -249,7 +249,7 @@ def event_handling(
     for component in [textbox, examples_hidden]:
         component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
-    # ttd_data.change(lambda x: x["df_output"], inputs=[ttd_data], outputs=[vanna_display])
     textbox.submit(ask_vanna, [textbox], [vanna_table, vanna_display])
 def main_ui():

                                     "<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
                                     elem_id="graphs-container"
                                 )
+                            with gr.Tab("DRIAS", elem_id="tab-vanna", id=6) as tab_vanna:
                                 vanna_table = gr.DataFrame([], elem_id="vanna-display")
                                 vanna_display = gr.Plot()
     # Event for textbox
     (textbox
         .submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{textbox.elem_id}")
+        .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{textbox.elem_id}")
         .then(finish_chat, None, [textbox], api_name=f"finish_chat_{textbox.elem_id}")
     )
     # Event for examples_hidden
     (examples_hidden
         .change(start_chat, [examples_hidden, chatbot, search_only], [examples_hidden, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{examples_hidden.elem_id}")
+        .then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{examples_hidden.elem_id}")
         .then(finish_chat, None, [examples_hidden], api_name=f"finish_chat_{examples_hidden.elem_id}")
     )
     for component in [textbox, examples_hidden]:
         component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
+    # Drias search
     textbox.submit(ask_vanna, [textbox], [vanna_table, vanna_display])
 def main_ui():

climateqa/chat.py CHANGED Viewed

@@ -58,7 +58,8 @@ def handle_numerical_data(event):
     if event["name"] == "retrieve_drias_data" and event["event"] == "on_chain_end":
         numerical_data = event["data"]["output"]["drias_data"]
         sql_query = event["data"]["output"]["drias_sql_query"]
-    return numerical_data, sql_query
 # Main chat function
 async def chat_stream(
@@ -148,12 +149,12 @@ async def chat_stream(
                     history, used_documents = handle_retrieved_documents(
                         event, history, used_documents
                     )
-                # Handle document retrieval
-                if event["event"] == "on_chain_end" and event["name"] in ["retrieve_documents","retrieve_local_data"] and event["data"]["output"] != None:
-                    df_output_vanna, sql_query = handle_numerical_data(
-                        event
-                    )
-                    vanna_data = {"df_output": df_output_vanna, "sql_query": sql_query}
                 if event["event"] == "on_chain_end" and event["name"] == "answer_search" :
@@ -198,7 +199,7 @@ async def chat_stream(
                         sub_questions = [q["question"] for q in event["data"]["output"]["questions_list"]]
                         history[-1].content += "Decompose question into sub-questions:\n\n - " + "\n - ".join(sub_questions)
-            yield history, docs_html, output_query, output_language, related_contents, graphs_html, vanna_data
     except Exception as e:
         print(f"Event {event} has failed")
@@ -209,4 +210,4 @@ async def chat_stream(
     # Call the function to log interaction
     log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id)
-    yield history, docs_html, output_query, output_language, related_contents, graphs_html, vanna_data

     if event["name"] == "retrieve_drias_data" and event["event"] == "on_chain_end":
         numerical_data = event["data"]["output"]["drias_data"]
         sql_query = event["data"]["output"]["drias_sql_query"]
+        return numerical_data, sql_query
+    return None, None
 # Main chat function
 async def chat_stream(
                     history, used_documents = handle_retrieved_documents(
                         event, history, used_documents
                     )
+                # Handle Vanna retrieval
+                # if event["event"] == "on_chain_end" and event["name"] in ["retrieve_documents","retrieve_local_data"] and event["data"]["output"] != None:
+                #     df_output_vanna, sql_query = handle_numerical_data(
+                #         event
+                #     )
+                #     vanna_data = {"df_output": df_output_vanna, "sql_query": sql_query}
                 if event["event"] == "on_chain_end" and event["name"] == "answer_search" :
                         sub_questions = [q["question"] for q in event["data"]["output"]["questions_list"]]
                         history[-1].content += "Decompose question into sub-questions:\n\n - " + "\n - ".join(sub_questions)
+            yield history, docs_html, output_query, output_language, related_contents, graphs_html#, vanna_data
     except Exception as e:
         print(f"Event {event} has failed")
     # Call the function to log interaction
     log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id)
+    yield history, docs_html, output_query, output_language, related_contents, graphs_html#, vanna_data

climateqa/engine/chains/intent_categorization.py CHANGED Viewed

@@ -57,6 +57,7 @@ def make_intent_categorization_node(llm):
     categorization_chain = make_intent_categorization_chain(llm)
     def categorize_message(state):
         print("---- Categorize_message ----")
         output = categorization_chain.invoke({"input": state["user_input"]})

     categorization_chain = make_intent_categorization_chain(llm)
     def categorize_message(state):
+        print("Input Message : ", state["user_input"])
         print("---- Categorize_message ----")
         output = categorization_chain.invoke({"input": state["user_input"]})

climateqa/engine/chains/query_transformation.py CHANGED Viewed

@@ -293,6 +293,8 @@ def make_query_transform_node(llm,k_final=15):
             "n_questions":n_questions,
             "handled_questions_index":[],
         }
         return new_state
     return transform_query

             "n_questions":n_questions,
             "handled_questions_index":[],
         }
+        print("New questions")
+        print(new_questions)
         return new_state
     return transform_query

climateqa/engine/graph.py CHANGED Viewed

@@ -75,7 +75,7 @@ def route_intent(state):
 def chitchat_route_intent(state):
     intent = state["search_graphs_chitchat"]
     if intent is True:
-        return "retrieve_graphs_chitchat"
     elif intent is False:
         return END

 def chitchat_route_intent(state):
     intent = state["search_graphs_chitchat"]
     if intent is True:
+        return END #TODO
     elif intent is False:
         return END

climateqa/engine/talk_to_data/deprecated_vanna_remote.py DELETED Viewed

@@ -1,167 +0,0 @@
-# from vanna.remote import VannaDefault
-# from pinecone import Pinecone
-# from climateqa.engine.embeddings import get_embeddings_function
-# import pandas as pd
-# import hashlib
-# class MyCustomVectorDB(VannaDefault):
-#     """
-#     VectorDB class for storing and retrieving vectors from Pinecone.
-#     args :
-#         config (dict) : Configuration dictionary containing the Pinecone API key and the index name :
-#             - pc_api_key (str) : Pinecone API key
-#             - index_name (str) : Pinecone index name
-#             - top_k (int) : Number of top results to return (default = 2)
-#     """
-#     def __init__(self,config, **kwargs):
-#         super().__init__(**kwargs)
-#         try :
-#             self.api_key = config.get('pc_api_key')
-#             self.index_name = config.get('index_name')
-#         except :
-#             raise Exception("Please provide the Pinecone API key and the index name")
-#         self.pc = Pinecone(api_key = self.api_key)
-#         self.index = self.pc.Index(self.index_name)
-#         self.top_k = config.get('top_k', 2)
-#         self.embeddings = get_embeddings_function()
-#     def check_embedding(self, id, namespace):
-#         fetched = self.index.fetch(ids = [id], namespace = namespace)
-#         if fetched['vectors'] == {}:
-#             return False
-#         return True
-#     def generate_hash_id(self, data: str) -> str:
-#         """
-#         Generate a unique hash ID for the given data.
-#         Args:
-#             data (str): The input data to hash (e.g., a concatenated string of user attributes).
-#         Returns:
-#             str: A unique hash ID as a hexadecimal string.
-#         """
-#         data_bytes = data.encode('utf-8')
-#         hash_object = hashlib.sha256(data_bytes)
-#         hash_id = hash_object.hexdigest()
-#         return hash_id
-#     def add_ddl(self, ddl: str, **kwargs) -> str:
-#         id = self.generate_hash_id(ddl) + '_ddl'
-#         if self.check_embedding(id, 'ddl'):
-#             print(f"DDL having id {id} already exists")
-#             return id
-#         self.index.upsert(
-#             vectors = [(id, self.embeddings.embed_query(ddl), {'ddl': ddl})],
-#             namespace = 'ddl'
-#         )
-#         return id
-#     def add_documentation(self, doc: str, **kwargs) -> str:
-#         id = self.generate_hash_id(doc) + '_doc'
-#         if self.check_embedding(id, 'documentation'):
-#             print(f"Documentation having id {id} already exists")
-#             return id
-#         self.index.upsert(
-#             vectors = [(id, self.embeddings.embed_query(doc), {'doc': doc})],
-#             namespace = 'documentation'
-#         )
-#         return id
-#     def add_question_sql(self, question: str, sql: str, **kwargs) -> str:
-#         id = self.generate_hash_id(question) + '_sql'
-#         if self.check_embedding(id, 'question_sql'):
-#             print(f"Question-SQL pair having id {id} already exists")
-#             return id
-#         self.index.upsert(
-#             vectors = [(id, self.embeddings.embed_query(question + sql), {'question': question, 'sql': sql})],
-#             namespace = 'question_sql'
-#         )
-#         return id
-#     def get_related_ddl(self, question: str, **kwargs) -> list:
-#         res = self.index.query(
-#             vector=self.embeddings.embed_query(question),
-#             top_k=self.top_k,
-#             namespace='ddl',
-#             include_metadata=True
-#         )
-#         print([match['metadata']['ddl'] for match in res['matches']])
-#         return [match['metadata']['ddl'] for match in res['matches']]
-#     def get_related_documentation(self, question: str, **kwargs) -> list:
-#         res = self.index.query(
-#             vector=self.embeddings.embed_query(question),
-#             top_k=self.top_k,
-#             namespace='documentation',
-#             include_metadata=True
-#         )
-#         return [match['metadata']['doc'] for match in res['matches']]
-#     def get_similar_quetion_sql(self, question: str, **kwargs) -> list:
-#         res = self.index.query(
-#             vector=self.embeddings.embed_query(question),
-#             top_k=self.top_k,
-#             namespace='question_sql',
-#             include_metadata=True
-#         )
-#         return [(match['metadata']['question'], match['metadata']['sql']) for match in res['matches']]
-#     def get_training_data(self, **kwargs) -> pd.DataFrame:
-#         list_of_data = []
-#         namespaces = ['ddl', 'documentation', 'question_sql']
-#         for namespace in namespaces:
-#             data = self.index.query(
-#             top_k=10000,
-#             namespace=namespace,
-#             include_metadata=True,
-#             include_values=False
-#             )
-#             for match in data['matches']:
-#                 list_of_data.append(match['metadata'])
-#         return pd.DataFrame(list_of_data)
-#     def remove_training_data(self, id: str, **kwargs) -> bool:
-#         if id.endswith("_ddl"):
-#             self.Index.delete(ids=[id], namespace="_ddl")
-#             return True
-#         if id.endswith("_sql"):
-#             self.index.delete(ids=[id], namespace="_sql")
-#             return True
-#         if id.endswith("_doc"):
-#             self.Index.delete(ids=[id], namespace="_doc")
-#             return True
-#         return False

climateqa/engine/talk_to_data/main.py CHANGED Viewed

@@ -4,6 +4,7 @@ import sqlite3
 import os
 import pandas as pd
 from climateqa.engine.llm import get_llm
 from dotenv import load_dotenv
@@ -17,7 +18,7 @@ VANNA_MODEL = os.getenv('VANNA_MODEL')
 #Vanna object
-vn = MyVanna(config = {"temperature": 0, "api_key": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME})
 db_vanna_path = os.path.join(os.path.dirname(__file__), "database/drias.db")
 vn.connect_to_sqlite(db_vanna_path)
@@ -68,13 +69,11 @@ def ask_vanna(query):
         sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query_new_coords}. Just answer the query. The answer should not include ```sql\n").content
         print("execute sql query : ", sql_with_table_names)
         db = sqlite3.connect(db_vanna_path)
-        # if "lat" not in sql_with_table_names:
-        # sql_with_table_names = sql_with_table_names.replace("SELECT", "SELECT lat, lon,")
-        # result = db.cursor().execute(sql_with_table_names).fetchall()
         result = db.cursor().execute(sql_query_new_coords).fetchall()
-        # df = pd.DataFrame(result, columns = list(result_dataframe.columns))
-        # df = pd.DataFrame(result, columns=["data_name"] + list(result_dataframe.columns))
-        df = pd.DataFrame(result)
         plotly_code = vn.generate_plotly_code(
                             question="query",

 import os
 import pandas as pd
 from climateqa.engine.llm import get_llm
+import ast
 from dotenv import load_dotenv
 #Vanna object
+vn = MyVanna(config = {"temperature": 0, "api_key": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME, "top_k" : 4})
 db_vanna_path = os.path.join(os.path.dirname(__file__), "database/drias.db")
 vn.connect_to_sqlite(db_vanna_path)
         sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query_new_coords}. Just answer the query. The answer should not include ```sql\n").content
         print("execute sql query : ", sql_with_table_names)
         db = sqlite3.connect(db_vanna_path)
         result = db.cursor().execute(sql_query_new_coords).fetchall()
+        columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query_new_coords}").content
+        columns_list = ast.literal_eval(columns.strip("```python\n").strip())
+        print("column list : ",columns_list)
+        df = pd.DataFrame(result, columns=columns_list)
         plotly_code = vn.generate_plotly_code(
                             question="query",

climateqa/engine/talk_to_data/step_by_step_vanna copy.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

climateqa/engine/talk_to_data/vanna_class.py CHANGED Viewed

@@ -228,6 +228,7 @@ class MyCustomVectorDB(VannaBase):
                 "4. Please use the most relevant table(s). \n"
                 "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
                 f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
                 # "7. Add a description of the table in the result of the sql query."
                 # "7. If the question is about a specific latitude, longitude, query an interval of 0.3 and keep only the first set of coordinate. \n"
                 # "7. Table names should be included in the result of the sql query. Use for example Mean_winter_temperature AS table_name in the query \n"

                 "4. Please use the most relevant table(s). \n"
                 "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
                 f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
+                f"7. Add a description of the table in the result of the sql query, and latitude, logitude if relevant. \n"
                 # "7. Add a description of the table in the result of the sql query."
                 # "7. If the question is about a specific latitude, longitude, query an interval of 0.3 and keep only the first set of coordinate. \n"
                 # "7. Table names should be included in the result of the sql query. Use for example Mean_winter_temperature AS table_name in the query \n"

style.css CHANGED Viewed

@@ -606,3 +606,8 @@ a {
 #checkbox-config:checked {
     display: block;
 }

 #checkbox-config:checked {
     display: block;
 }
+#vanna-display {
+    height: 400px;
+    overflow-y: auto;
+}