feature/add_talk_to_data (#23)
Browse files- Add Talk to Drias (abafbcc3ab2b3f4e5429cfc802caa3e9cdada081)
- Update app.py (a7802dbc4bf7d5916dcc7155319601b30e0716eb)
- Add step by step notebooks for drias (eb90d11b1777bc9e4f2d77b931e82b17a10d16af)
- app.py +65 -21
- climateqa/chat.py +18 -2
- climateqa/engine/talk_to_data/main.py +1 -1
- climateqa/engine/talk_to_data/myVanna.py +13 -0
- climateqa/engine/talk_to_data/utils.py +98 -0
- climateqa/engine/talk_to_data/vanna_class.py +325 -0
- front/tabs/tab_papers.py +3 -1
- requirements.txt +2 -0
- sandbox/talk_to_data/20250306 - CQA - Drias.ipynb +82 -0
- sandbox/talk_to_data/20250306 - CQA - Step_by_step_vanna.ipynb +218 -0
- style.css +14 -2
app.py
CHANGED
@@ -12,9 +12,11 @@ from climateqa.engine.reranker import get_reranker
|
|
12 |
from climateqa.engine.graph import make_graph_agent,make_graph_agent_poc
|
13 |
from climateqa.engine.chains.retrieve_papers import find_papers
|
14 |
from climateqa.chat import start_chat, chat_stream, finish_chat
|
|
|
15 |
|
16 |
from front.tabs import (create_config_modal, create_examples_tab, create_papers_tab, create_figures_tab, create_chat_interface, create_about_tab)
|
17 |
from front.utils import process_figures
|
|
|
18 |
|
19 |
|
20 |
from utils import create_user_id
|
@@ -67,9 +69,9 @@ vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os
|
|
67 |
vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_LOCAL_V2"))
|
68 |
|
69 |
llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
|
70 |
-
if os.
|
71 |
reranker = get_reranker("nano")
|
72 |
-
else:
|
73 |
reranker = get_reranker("large")
|
74 |
|
75 |
agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0.2)
|
@@ -93,8 +95,9 @@ async def chat_poc(query, history, audience, sources, reports, relevant_content_
|
|
93 |
|
94 |
# Function to update modal visibility
|
95 |
def update_config_modal_visibility(config_open):
|
|
|
96 |
new_config_visibility_status = not config_open
|
97 |
-
return
|
98 |
|
99 |
|
100 |
def update_sources_number_display(sources_textbox, figures_cards, current_graphs, papers_html):
|
@@ -110,7 +113,21 @@ def update_sources_number_display(sources_textbox, figures_cards, current_graphs
|
|
110 |
|
111 |
return gr.update(label=recommended_content_notif_label), gr.update(label=sources_notif_label), gr.update(label=figures_notif_label), gr.update(label=graphs_notif_label), gr.update(label=papers_notif_label)
|
112 |
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
# # UI Layout Components
|
115 |
def cqa_tab(tab_name):
|
116 |
# State variables
|
@@ -142,7 +159,7 @@ def cqa_tab(tab_name):
|
|
142 |
|
143 |
# Papers subtab
|
144 |
with gr.Tab("Papers", elem_id="tab-citations", id=4) as tab_papers:
|
145 |
-
papers_summary, papers_html, citations_network, papers_modal = create_papers_tab()
|
146 |
|
147 |
# Graphs subtab
|
148 |
with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
|
@@ -150,6 +167,8 @@ def cqa_tab(tab_name):
|
|
150 |
"<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
|
151 |
elem_id="graphs-container"
|
152 |
)
|
|
|
|
|
153 |
return {
|
154 |
"chatbot": chatbot,
|
155 |
"textbox": textbox,
|
@@ -162,6 +181,7 @@ def cqa_tab(tab_name):
|
|
162 |
"figures_cards": figures_cards,
|
163 |
"gallery_component": gallery_component,
|
164 |
"config_button": config_button,
|
|
|
165 |
"papers_html": papers_html,
|
166 |
"citations_network": citations_network,
|
167 |
"papers_summary": papers_summary,
|
@@ -170,10 +190,23 @@ def cqa_tab(tab_name):
|
|
170 |
"tab_figures": tab_figures,
|
171 |
"tab_graphs": tab_graphs,
|
172 |
"tab_papers": tab_papers,
|
173 |
-
"graph_container": graphs_container
|
|
|
|
|
|
|
174 |
}
|
175 |
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
def event_handling(
|
179 |
main_tab_components,
|
@@ -190,7 +223,8 @@ def event_handling(
|
|
190 |
sources_textbox = main_tab_components["sources_textbox"]
|
191 |
figures_cards = main_tab_components["figures_cards"]
|
192 |
gallery_component = main_tab_components["gallery_component"]
|
193 |
-
config_button = main_tab_components["config_button"]
|
|
|
194 |
papers_html = main_tab_components["papers_html"]
|
195 |
citations_network = main_tab_components["citations_network"]
|
196 |
papers_summary = main_tab_components["papers_summary"]
|
@@ -200,9 +234,13 @@ def event_handling(
|
|
200 |
tab_graphs = main_tab_components["tab_graphs"]
|
201 |
tab_papers = main_tab_components["tab_papers"]
|
202 |
graphs_container = main_tab_components["graph_container"]
|
|
|
|
|
|
|
203 |
|
204 |
-
|
205 |
-
|
|
|
206 |
dropdown_sources = config_components["dropdown_sources"]
|
207 |
dropdown_reports = config_components["dropdown_reports"]
|
208 |
dropdown_external_sources = config_components["dropdown_external_sources"]
|
@@ -211,18 +249,18 @@ def event_handling(
|
|
211 |
after = config_components["after"]
|
212 |
output_query = config_components["output_query"]
|
213 |
output_language = config_components["output_language"]
|
214 |
-
close_config_modal = config_components["close_config_modal_button"]
|
215 |
|
216 |
new_sources_hmtl = gr.State([])
|
217 |
-
|
218 |
-
|
219 |
|
220 |
-
for button in [config_button, close_config_modal]:
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
|
227 |
if tab_name == "ClimateQ&A":
|
228 |
print("chat cqa - message sent")
|
@@ -265,10 +303,13 @@ def event_handling(
|
|
265 |
component.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs, papers_html], [tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
|
266 |
|
267 |
# Search for papers
|
268 |
-
for component in [textbox, examples_hidden]:
|
269 |
component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
|
270 |
|
271 |
|
|
|
|
|
|
|
272 |
|
273 |
def main_ui():
|
274 |
# config_open = gr.State(True)
|
@@ -278,11 +319,14 @@ def main_ui():
|
|
278 |
with gr.Tabs():
|
279 |
cqa_components = cqa_tab(tab_name = "ClimateQ&A")
|
280 |
local_cqa_components = cqa_tab(tab_name = "Beta - POC Adapt'Action")
|
|
|
281 |
|
282 |
create_about_tab()
|
283 |
|
284 |
event_handling(cqa_components, config_components, tab_name = 'ClimateQ&A')
|
285 |
-
event_handling(local_cqa_components, config_components, tab_name =
|
|
|
|
|
286 |
|
287 |
demo.queue()
|
288 |
|
|
|
12 |
from climateqa.engine.graph import make_graph_agent,make_graph_agent_poc
|
13 |
from climateqa.engine.chains.retrieve_papers import find_papers
|
14 |
from climateqa.chat import start_chat, chat_stream, finish_chat
|
15 |
+
from climateqa.engine.talk_to_data.main import ask_vanna
|
16 |
|
17 |
from front.tabs import (create_config_modal, create_examples_tab, create_papers_tab, create_figures_tab, create_chat_interface, create_about_tab)
|
18 |
from front.utils import process_figures
|
19 |
+
from gradio_modal import Modal
|
20 |
|
21 |
|
22 |
from utils import create_user_id
|
|
|
69 |
vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_LOCAL_V2"))
|
70 |
|
71 |
llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
|
72 |
+
if os.environ["GRADIO_ENV"] == "local":
|
73 |
reranker = get_reranker("nano")
|
74 |
+
else :
|
75 |
reranker = get_reranker("large")
|
76 |
|
77 |
agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0.2)
|
|
|
95 |
|
96 |
# Function to update modal visibility
|
97 |
def update_config_modal_visibility(config_open):
|
98 |
+
print(config_open)
|
99 |
new_config_visibility_status = not config_open
|
100 |
+
return Modal(visible=new_config_visibility_status), new_config_visibility_status
|
101 |
|
102 |
|
103 |
def update_sources_number_display(sources_textbox, figures_cards, current_graphs, papers_html):
|
|
|
113 |
|
114 |
return gr.update(label=recommended_content_notif_label), gr.update(label=sources_notif_label), gr.update(label=figures_notif_label), gr.update(label=graphs_notif_label), gr.update(label=papers_notif_label)
|
115 |
|
116 |
+
def create_drias_tab():
|
117 |
+
with gr.Tab("Beta - Talk to DRIAS", elem_id="tab-vanna", id=6) as tab_vanna:
|
118 |
+
vanna_direct_question = gr.Textbox(label="Direct Question", placeholder="You can write direct question here",elem_id="direct-question", interactive=True)
|
119 |
+
with gr.Accordion("Details",elem_id = 'vanna-details', open=False) as vanna_details :
|
120 |
+
vanna_sql_query = gr.Textbox(label="SQL Query Used", elem_id="sql-query", interactive=False)
|
121 |
+
show_vanna_table = gr.Button("Show Table", elem_id="show-table")
|
122 |
+
with Modal(visible=False) as vanna_table_modal:
|
123 |
+
vanna_table = gr.DataFrame([], elem_id="vanna-table")
|
124 |
+
close_vanna_modal = gr.Button("Close", elem_id="close-vanna-modal")
|
125 |
+
close_vanna_modal.click(lambda: Modal(visible=False),None, [vanna_table_modal])
|
126 |
+
show_vanna_table.click(lambda: Modal(visible=True),None ,[vanna_table_modal])
|
127 |
+
|
128 |
+
vanna_display = gr.Plot()
|
129 |
+
vanna_direct_question.submit(ask_vanna, [vanna_direct_question], [vanna_sql_query ,vanna_table, vanna_display])
|
130 |
+
|
131 |
# # UI Layout Components
|
132 |
def cqa_tab(tab_name):
|
133 |
# State variables
|
|
|
159 |
|
160 |
# Papers subtab
|
161 |
with gr.Tab("Papers", elem_id="tab-citations", id=4) as tab_papers:
|
162 |
+
papers_direct_search, papers_summary, papers_html, citations_network, papers_modal = create_papers_tab()
|
163 |
|
164 |
# Graphs subtab
|
165 |
with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
|
|
|
167 |
"<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
|
168 |
elem_id="graphs-container"
|
169 |
)
|
170 |
+
|
171 |
+
|
172 |
return {
|
173 |
"chatbot": chatbot,
|
174 |
"textbox": textbox,
|
|
|
181 |
"figures_cards": figures_cards,
|
182 |
"gallery_component": gallery_component,
|
183 |
"config_button": config_button,
|
184 |
+
"papers_direct_search" : papers_direct_search,
|
185 |
"papers_html": papers_html,
|
186 |
"citations_network": citations_network,
|
187 |
"papers_summary": papers_summary,
|
|
|
190 |
"tab_figures": tab_figures,
|
191 |
"tab_graphs": tab_graphs,
|
192 |
"tab_papers": tab_papers,
|
193 |
+
"graph_container": graphs_container,
|
194 |
+
# "vanna_sql_query": vanna_sql_query,
|
195 |
+
# "vanna_table" : vanna_table,
|
196 |
+
# "vanna_display": vanna_display
|
197 |
}
|
198 |
|
199 |
+
def config_event_handling(main_tabs_components : list[dict], config_componenets : dict):
|
200 |
+
config_open = config_componenets["config_open"]
|
201 |
+
config_modal = config_componenets["config_modal"]
|
202 |
+
close_config_modal = config_componenets["close_config_modal_button"]
|
203 |
+
|
204 |
+
for button in [close_config_modal] + [main_tab_component["config_button"] for main_tab_component in main_tabs_components]:
|
205 |
+
button.click(
|
206 |
+
fn=update_config_modal_visibility,
|
207 |
+
inputs=[config_open],
|
208 |
+
outputs=[config_modal, config_open]
|
209 |
+
)
|
210 |
|
211 |
def event_handling(
|
212 |
main_tab_components,
|
|
|
223 |
sources_textbox = main_tab_components["sources_textbox"]
|
224 |
figures_cards = main_tab_components["figures_cards"]
|
225 |
gallery_component = main_tab_components["gallery_component"]
|
226 |
+
# config_button = main_tab_components["config_button"]
|
227 |
+
papers_direct_search = main_tab_components["papers_direct_search"]
|
228 |
papers_html = main_tab_components["papers_html"]
|
229 |
citations_network = main_tab_components["citations_network"]
|
230 |
papers_summary = main_tab_components["papers_summary"]
|
|
|
234 |
tab_graphs = main_tab_components["tab_graphs"]
|
235 |
tab_papers = main_tab_components["tab_papers"]
|
236 |
graphs_container = main_tab_components["graph_container"]
|
237 |
+
# vanna_sql_query = main_tab_components["vanna_sql_query"]
|
238 |
+
# vanna_table = main_tab_components["vanna_table"]
|
239 |
+
# vanna_display = main_tab_components["vanna_display"]
|
240 |
|
241 |
+
|
242 |
+
# config_open = config_components["config_open"]
|
243 |
+
# config_modal = config_components["config_modal"]
|
244 |
dropdown_sources = config_components["dropdown_sources"]
|
245 |
dropdown_reports = config_components["dropdown_reports"]
|
246 |
dropdown_external_sources = config_components["dropdown_external_sources"]
|
|
|
249 |
after = config_components["after"]
|
250 |
output_query = config_components["output_query"]
|
251 |
output_language = config_components["output_language"]
|
252 |
+
# close_config_modal = config_components["close_config_modal_button"]
|
253 |
|
254 |
new_sources_hmtl = gr.State([])
|
255 |
+
ttd_data = gr.State([])
|
256 |
+
|
257 |
|
258 |
+
# for button in [config_button, close_config_modal]:
|
259 |
+
# button.click(
|
260 |
+
# fn=update_config_modal_visibility,
|
261 |
+
# inputs=[config_open],
|
262 |
+
# outputs=[config_modal, config_open]
|
263 |
+
# )
|
264 |
|
265 |
if tab_name == "ClimateQ&A":
|
266 |
print("chat cqa - message sent")
|
|
|
303 |
component.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs, papers_html], [tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
|
304 |
|
305 |
# Search for papers
|
306 |
+
for component in [textbox, examples_hidden, papers_direct_search]:
|
307 |
component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
|
308 |
|
309 |
|
310 |
+
# if tab_name == "Beta - POC Adapt'Action": # Not untill results are good enough
|
311 |
+
# # Drias search
|
312 |
+
# textbox.submit(ask_vanna, [textbox], [vanna_sql_query ,vanna_table, vanna_display])
|
313 |
|
314 |
def main_ui():
|
315 |
# config_open = gr.State(True)
|
|
|
319 |
with gr.Tabs():
|
320 |
cqa_components = cqa_tab(tab_name = "ClimateQ&A")
|
321 |
local_cqa_components = cqa_tab(tab_name = "Beta - POC Adapt'Action")
|
322 |
+
create_drias_tab()
|
323 |
|
324 |
create_about_tab()
|
325 |
|
326 |
event_handling(cqa_components, config_components, tab_name = 'ClimateQ&A')
|
327 |
+
event_handling(local_cqa_components, config_components, tab_name = "Beta - POC Adapt'Action")
|
328 |
+
|
329 |
+
config_event_handling([cqa_components,local_cqa_components] ,config_components)
|
330 |
|
331 |
demo.queue()
|
332 |
|
climateqa/chat.py
CHANGED
@@ -53,6 +53,13 @@ def log_interaction_to_azure(history, output_query, sources, docs, share_client,
|
|
53 |
print(f"Error logging on Azure Blob Storage: {e}")
|
54 |
error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
|
55 |
raise gr.Error(error_msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
# Main chat function
|
58 |
async def chat_stream(
|
@@ -121,6 +128,7 @@ async def chat_stream(
|
|
121 |
used_documents = []
|
122 |
retrieved_contents = []
|
123 |
answer_message_content = ""
|
|
|
124 |
|
125 |
# Define processing steps
|
126 |
steps_display = {
|
@@ -142,6 +150,14 @@ async def chat_stream(
|
|
142 |
history, used_documents, retrieved_contents = handle_retrieved_documents(
|
143 |
event, history, used_documents, retrieved_contents
|
144 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
if event["event"] == "on_chain_end" and event["name"] == "answer_search" :
|
146 |
docs = event["data"]["input"]["documents"]
|
147 |
docs_html = convert_to_docs_to_html(docs)
|
@@ -184,7 +200,7 @@ async def chat_stream(
|
|
184 |
sub_questions = [q["question"] + "-> relevant sources : " + str(q["sources"]) for q in event["data"]["output"]["questions_list"]]
|
185 |
history[-1].content += "Decompose question into sub-questions:\n\n - " + "\n - ".join(sub_questions)
|
186 |
|
187 |
-
yield history, docs_html, output_query, output_language, related_contents, graphs_html
|
188 |
|
189 |
except Exception as e:
|
190 |
print(f"Event {event} has failed")
|
@@ -195,4 +211,4 @@ async def chat_stream(
|
|
195 |
# Call the function to log interaction
|
196 |
log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id)
|
197 |
|
198 |
-
yield history, docs_html, output_query, output_language, related_contents, graphs_html
|
|
|
53 |
print(f"Error logging on Azure Blob Storage: {e}")
|
54 |
error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
|
55 |
raise gr.Error(error_msg)
|
56 |
+
|
57 |
+
def handle_numerical_data(event):
    """Extract the DRIAS dataframe and SQL query from a graph-stream event.

    Returns:
        tuple: (drias_data, drias_sql_query) when *event* marks the end of the
        ``retrieve_drias_data`` chain, otherwise (None, None).
    """
    is_drias_end = (
        event["name"] == "retrieve_drias_data"
        and event["event"] == "on_chain_end"
    )
    if not is_drias_end:
        return None, None
    output = event["data"]["output"]
    return output["drias_data"], output["drias_sql_query"]
|
63 |
|
64 |
# Main chat function
|
65 |
async def chat_stream(
|
|
|
128 |
used_documents = []
|
129 |
retrieved_contents = []
|
130 |
answer_message_content = ""
|
131 |
+
vanna_data = {}
|
132 |
|
133 |
# Define processing steps
|
134 |
steps_display = {
|
|
|
150 |
history, used_documents, retrieved_contents = handle_retrieved_documents(
|
151 |
event, history, used_documents, retrieved_contents
|
152 |
)
|
153 |
+
# Handle Vanna retrieval
|
154 |
+
# if event["event"] == "on_chain_end" and event["name"] in ["retrieve_documents","retrieve_local_data"] and event["data"]["output"] != None:
|
155 |
+
# df_output_vanna, sql_query = handle_numerical_data(
|
156 |
+
# event
|
157 |
+
# )
|
158 |
+
# vanna_data = {"df_output": df_output_vanna, "sql_query": sql_query}
|
159 |
+
|
160 |
+
|
161 |
if event["event"] == "on_chain_end" and event["name"] == "answer_search" :
|
162 |
docs = event["data"]["input"]["documents"]
|
163 |
docs_html = convert_to_docs_to_html(docs)
|
|
|
200 |
sub_questions = [q["question"] + "-> relevant sources : " + str(q["sources"]) for q in event["data"]["output"]["questions_list"]]
|
201 |
history[-1].content += "Decompose question into sub-questions:\n\n - " + "\n - ".join(sub_questions)
|
202 |
|
203 |
+
yield history, docs_html, output_query, output_language, related_contents, graphs_html#, vanna_data
|
204 |
|
205 |
except Exception as e:
|
206 |
print(f"Event {event} has failed")
|
|
|
211 |
# Call the function to log interaction
|
212 |
log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id)
|
213 |
|
214 |
+
yield history, docs_html, output_query, output_language, related_contents, graphs_html#, vanna_data
|
climateqa/engine/talk_to_data/main.py
CHANGED
@@ -19,7 +19,7 @@ VANNA_MODEL = os.getenv('VANNA_MODEL')
|
|
19 |
|
20 |
#Vanna object
|
21 |
vn = MyVanna(config = {"temperature": 0, "api_key": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME, "top_k" : 4})
|
22 |
-
db_vanna_path = os.path.join(os.path.dirname(__file__), "
|
23 |
vn.connect_to_sqlite(db_vanna_path)
|
24 |
|
25 |
llm = get_llm(provider="openai")
|
|
|
19 |
|
20 |
#Vanna object
|
21 |
vn = MyVanna(config = {"temperature": 0, "api_key": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME, "top_k" : 4})
|
22 |
+
db_vanna_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))), "data/drias/drias.db")
|
23 |
vn.connect_to_sqlite(db_vanna_path)
|
24 |
|
25 |
llm = get_llm(provider="openai")
|
climateqa/engine/talk_to_data/myVanna.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
from climateqa.engine.talk_to_data.vanna_class import MyCustomVectorDB
|
3 |
+
from vanna.openai import OpenAI_Chat
|
4 |
+
import os
|
5 |
+
|
6 |
+
load_dotenv()
|
7 |
+
|
8 |
+
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
9 |
+
|
10 |
+
class MyVanna(MyCustomVectorDB, OpenAI_Chat):
    """Vanna text-to-SQL agent: Pinecone vector store + OpenAI chat.

    Combines the custom Pinecone-backed vector DB (training-data storage and
    retrieval) with OpenAI chat for SQL generation. The same *config* dict
    feeds both bases (api keys, model, index name, top_k).
    """

    def __init__(self, config=None):
        # Both bases are initialised explicitly with the same config — the
        # documented Vanna multiple-inheritance pattern (not cooperative super()).
        MyCustomVectorDB.__init__(self, config=config)
        OpenAI_Chat.__init__(self, config=config)
|
climateqa/engine/talk_to_data/utils.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import openai
|
3 |
+
import pandas as pd
|
4 |
+
from geopy.geocoders import Nominatim
|
5 |
+
import sqlite3
|
6 |
+
import ast
|
7 |
+
|
8 |
+
|
9 |
+
def detect_location_with_openai(api_key, sentence):
    """Detect locations in a sentence using OpenAI's chat completions API.

    Args:
        api_key: OpenAI API key (set globally on the ``openai`` module).
        sentence: Free-text user question that may mention places.

    Returns:
        str: A slice of the model's answer expected to contain the location
        text. NOTE(review): this is NOT a parsed list — see the comment on the
        return line; confirm against real model responses.
    """
    # NOTE(review): mutates module-level state; concurrent calls using
    # different keys would race on openai.api_key.
    openai.api_key = api_key

    prompt = f"""
    Extract all locations (cities, countries, states, or geographical areas) mentioned in the following sentence.
    Return the result as a Python list. If no locations are mentioned, return an empty list.

    Sentence: "{sentence}"
    """

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant skilled in identifying locations in text."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        temperature=0
    )

    # Fragile parsing: takes the SECOND line of the completion and strips two
    # characters from each end (assumed shape: ['Paris']). It breaks when the
    # model answers on a single line or lists several locations — TODO confirm
    # the response format and replace with ast.literal_eval on the list.
    return response.choices[0].message.content.split("\n")[1][2:-2]
|
33 |
+
|
34 |
+
|
35 |
+
def detectTable(sql_query):
    """Return every table reference following a FROM clause in *sql_query*.

    Accepts back-tick, double-quote and single-quote quoted identifiers as
    well as bare names, including dotted ``schema.table`` chains.
    """
    from_target = r'(?i)\bFROM\s+((?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+)(?:\.(?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+))*)'
    return re.findall(from_target, sql_query)
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
def loc2coords(location : str):
    """Geocode a place name to (latitude, longitude) via Nominatim.

    Args:
        location: Free-text place name (e.g. "Toulouse").

    Returns:
        tuple: (latitude, longitude) floats.

    Raises:
        ValueError: If the geocoder finds no match. (Previously this crashed
            with an opaque ``AttributeError`` on ``None``; it also shadowed
            the *location* parameter with the geocoder result.)
    """
    geolocator = Nominatim(user_agent="city_to_latlong")
    match = geolocator.geocode(location)
    if match is None:
        raise ValueError(f"Could not geocode location: {location!r}")
    return (match.latitude, match.longitude)
|
46 |
+
|
47 |
+
|
48 |
+
def coords2loc(coords : tuple):
    """Reverse-geocode a (latitude, longitude) pair to a readable address.

    Logs and falls back to "Unknown Location" when the lookup fails for any
    reason (network error, no match, malformed input).
    """
    geolocator = Nominatim(user_agent="coords_to_city")
    try:
        return geolocator.reverse(coords).address
    except Exception as e:
        print(f"Error: {e}")
        return "Unknown Location"
|
56 |
+
|
57 |
+
|
58 |
+
def nearestNeighbourSQL(db: str, location: tuple, table : str):
    """Return the (lat, lon) grid point of *table* closest to *location*.

    Searches a +/-0.3 degree window around the (rounded) target and orders
    candidates by squared euclidean distance, so the nearest point in the
    window is returned — the previous version returned an arbitrary row,
    leaked the connection, and raised IndexError on an empty window.

    Args:
        db: Path to the SQLite database file.
        location: (latitude, longitude) tuple.
        table: Table name holding ``lat``/``lon`` columns.

    Returns:
        tuple: (lat, lon) of the nearest grid point.

    Raises:
        ValueError: If *table* is not a plain identifier, or no grid point
            falls inside the search window.
    """
    # Table names cannot be bound as SQL parameters; whitelist the identifier
    # instead of interpolating arbitrary text into the statement.
    if not re.fullmatch(r"\w+", table):
        raise ValueError(f"Invalid table name: {table!r}")
    lat = round(location[0], 3)
    long = round(location[1], 3)
    conn = sqlite3.connect(db)
    try:
        cursor = conn.cursor()
        cursor.execute(
            f"SELECT lat, lon FROM {table} "
            "WHERE lat BETWEEN ? AND ? AND lon BETWEEN ? AND ? "
            "ORDER BY (lat - ?) * (lat - ?) + (lon - ?) * (lon - ?) LIMIT 1",
            (lat - 0.3, lat + 0.3, long - 0.3, long + 0.3, lat, lat, long, long),
        )
        row = cursor.fetchone()
    finally:
        conn.close()  # previously never closed — connection leak per call
    if row is None:
        raise ValueError(f"No {table} grid point within 0.3 degrees of {location}")
    return row
|
66 |
+
|
67 |
+
def detect_relevant_tables(user_question, llm, table_names_list=None):
    """Ask the LLM which DRIAS tables are relevant to *user_question*.

    Args:
        user_question: The user's natural-language question.
        llm: LangChain-style chat model; ``llm.invoke(prompt).content`` must
            return text containing a Python list literal.
        table_names_list: Candidate table names; defaults to the DRIAS set.

    Returns:
        list: Table names chosen by the model.

    Raises:
        ValueError: If no list literal can be found in the model's answer.
    """
    if table_names_list is None:
        table_names_list = [
            "Frequency_of_rainy_days_index",
            "Winter_precipitation_total",
            "Summer_precipitation_total",
            "Annual_precipitation_total",
            # "Remarkable_daily_precipitation_total_(Q99)",
            "Frequency_of_remarkable_daily_precipitation",
            "Extreme_precipitation_intensity",
            "Mean_winter_temperature",
            "Mean_summer_temperature",
            "Number_of_tropical_nights",
            "Maximum_summer_temperature",
            "Number_of_days_with_Tx_above_30C",
            "Number_of_days_with_Tx_above_35C",
            "Drought_index",
        ]
    prompt = (
        f"You are helping to build a sql query to retrieve relevant data for a user question."
        f"The different tables are {table_names_list}."
        f"The user question is {user_question}. Write the relevant tables to use. Answer only a python list of table name."
    )
    answer = llm.invoke(prompt).content
    # The model may wrap its answer in a markdown fence. The previous
    # str.strip("```python\n") removed *characters* (any of ` p y t h o n \n)
    # from both ends, not a prefix — extract the first list literal instead.
    match = re.search(r"\[.*\]", answer, re.DOTALL)
    if match is None:
        raise ValueError(f"Could not find a table list in the LLM answer: {answer!r}")
    return ast.literal_eval(match.group(0))
|
91 |
+
|
92 |
+
def replace_coordonates(coords, query, coords_tables):
    """Rewrite user coordinates in *query* with dataset grid coordinates.

    Each textual occurrence of ``coords[0]`` (and its paired ``coords[1]``)
    is replaced, left to right, by the matching pair from *coords_tables*:
    the i-th occurrence gets the i-th pair.
    """
    lat_txt = str(coords[0])
    lon_txt = str(coords[1])
    occurrences = query.count(lat_txt)

    rewritten = query
    for i in range(occurrences):
        pair = coords_tables[i]
        rewritten = rewritten.replace(lat_txt, str(pair[0]), 1)
        rewritten = rewritten.replace(lon_txt, str(pair[1]), 1)
    return rewritten
|
climateqa/engine/talk_to_data/vanna_class.py
ADDED
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from vanna.base import VannaBase
|
2 |
+
from pinecone import Pinecone
|
3 |
+
from climateqa.engine.embeddings import get_embeddings_function
|
4 |
+
import pandas as pd
|
5 |
+
import hashlib
|
6 |
+
|
7 |
+
class MyCustomVectorDB(VannaBase):
|
8 |
+
|
9 |
+
"""
|
10 |
+
VectorDB class for storing and retrieving vectors from Pinecone.
|
11 |
+
|
12 |
+
args :
|
13 |
+
config (dict) : Configuration dictionary containing the Pinecone API key and the index name :
|
14 |
+
- pc_api_key (str) : Pinecone API key
|
15 |
+
- index_name (str) : Pinecone index name
|
16 |
+
- top_k (int) : Number of top results to return (default = 2)
|
17 |
+
|
18 |
+
"""
|
19 |
+
|
20 |
+
def __init__(self, config):
    """Initialise the Pinecone-backed vector store.

    Args:
        config: Dict with 'pc_api_key' (Pinecone API key), 'index_name'
            (Pinecone index name) and optional 'top_k' (default 2).

    Raises:
        Exception: If the API key or index name is missing. (The previous
            try/except around ``dict.get`` could never fire for missing keys —
            ``get`` returns None instead of raising — so bad configs reached
            Pinecone with ``None`` credentials.)
    """
    super().__init__(config=config)
    self.api_key = config.get('pc_api_key') if config else None
    self.index_name = config.get('index_name') if config else None
    if not self.api_key or not self.index_name:
        raise Exception("Please provide the Pinecone API key and the index name")

    self.pc = Pinecone(api_key=self.api_key)
    self.index = self.pc.Index(self.index_name)
    # Number of matches returned by the get_related_* / get_similar_* queries.
    self.top_k = config.get('top_k', 2)
    self.embeddings = get_embeddings_function()
|
32 |
+
|
33 |
+
|
34 |
+
def check_embedding(self, id, namespace):
    """Return True when a vector with *id* already exists in *namespace*."""
    response = self.index.fetch(ids=[id], namespace=namespace)
    return response['vectors'] != {}
|
39 |
+
|
40 |
+
def generate_hash_id(self, data: str) -> str:
    """Return a deterministic hash used as a vector ID.

    Args:
        data: The input data to hash (e.g., a DDL statement or question text).

    Returns:
        str: 64-character hexadecimal SHA-256 digest of *data*.
    """
    return hashlib.sha256(data.encode('utf-8')).hexdigest()
|
56 |
+
|
57 |
+
def add_ddl(self, ddl: str, **kwargs) -> str:
    """Store a DDL statement in the 'ddl' namespace; return its vector id.

    Idempotent: when the id already exists the upsert is skipped (and logged).
    """
    vector_id = self.generate_hash_id(ddl) + '_ddl'
    if self.check_embedding(vector_id, 'ddl'):
        print(f"DDL having id {vector_id} already exists")
        return vector_id

    embedding = self.embeddings.embed_query(ddl)
    self.index.upsert(
        vectors=[(vector_id, embedding, {'ddl': ddl})],
        namespace='ddl',
    )
    return vector_id
|
70 |
+
|
71 |
+
def add_documentation(self, doc: str, **kwargs) -> str:
    """Store a documentation snippet in the 'documentation' namespace.

    Idempotent: when the id already exists the upsert is skipped (and logged).
    Returns the vector id.
    """
    vector_id = self.generate_hash_id(doc) + '_doc'
    if self.check_embedding(vector_id, 'documentation'):
        print(f"Documentation having id {vector_id} already exists")
        return vector_id

    embedding = self.embeddings.embed_query(doc)
    self.index.upsert(
        vectors=[(vector_id, embedding, {'doc': doc})],
        namespace='documentation',
    )
    return vector_id
|
84 |
+
|
85 |
+
def add_question_sql(self, question: str, sql: str, **kwargs) -> str:
    """Store a question/SQL training pair in the 'question_sql' namespace.

    Idempotent on the question's hash; returns the vector id.
    """
    vector_id = self.generate_hash_id(question) + '_sql'
    if self.check_embedding(vector_id, 'question_sql'):
        print(f"Question-SQL pair having id {vector_id} already exists")
        return vector_id

    # Question and SQL are embedded together so retrieval can match either.
    embedding = self.embeddings.embed_query(question + sql)
    self.index.upsert(
        vectors=[(vector_id, embedding, {'question': question, 'sql': sql})],
        namespace='question_sql',
    )
    return vector_id
|
98 |
+
|
99 |
+
def get_related_ddl(self, question: str, **kwargs) -> list:
    """Return the top_k DDL statements most similar to *question*."""
    query_vector = self.embeddings.embed_query(question)
    response = self.index.query(
        vector=query_vector,
        top_k=self.top_k,
        namespace='ddl',
        include_metadata=True
    )
    return [hit['metadata']['ddl'] for hit in response['matches']]
|
108 |
+
|
109 |
+
def get_related_documentation(self, question: str, **kwargs) -> list:
    """Return the top_k documentation snippets most similar to *question*."""
    query_vector = self.embeddings.embed_query(question)
    response = self.index.query(
        vector=query_vector,
        top_k=self.top_k,
        namespace='documentation',
        include_metadata=True
    )
    return [hit['metadata']['doc'] for hit in response['matches']]
|
118 |
+
|
119 |
+
def get_similar_question_sql(self, question: str, **kwargs) -> list:
    """Return top_k (question, sql) training pairs most similar to *question*."""
    query_vector = self.embeddings.embed_query(question)
    response = self.index.query(
        vector=query_vector,
        top_k=self.top_k,
        namespace='question_sql',
        include_metadata=True
    )
    return [
        (hit['metadata']['question'], hit['metadata']['sql'])
        for hit in response['matches']
    ]
|
128 |
+
|
129 |
+
def get_training_data(self, **kwargs) -> pd.DataFrame:
    """Collect all stored training metadata across namespaces as a DataFrame.

    NOTE(review): queries each namespace without a query vector and with
    top_k=10000 — assumes the Pinecone client accepts vector-less queries and
    no namespace exceeds 10k entries; confirm against the client version.
    """
    rows = []
    for namespace in ('ddl', 'documentation', 'question_sql'):
        page = self.index.query(
            top_k=10000,
            namespace=namespace,
            include_metadata=True,
            include_values=False
        )
        rows.extend(hit['metadata'] for hit in page['matches'])
    return pd.DataFrame(rows)
|
148 |
+
|
149 |
+
|
150 |
+
|
151 |
+
def remove_training_data(self, id: str, **kwargs) -> bool:
    """Delete a training entry by its suffixed id; return True when handled.

    Fixes two defects in the previous version: the ddl/doc branches called
    ``self.Index`` (capital I — AttributeError at runtime), and all three
    branches deleted from "_ddl"/"_sql"/"_doc" namespaces that are never
    written — the add_* methods upsert into 'ddl', 'question_sql' and
    'documentation'.
    """
    if id.endswith("_ddl"):
        self.index.delete(ids=[id], namespace="ddl")
        return True
    if id.endswith("_sql"):
        self.index.delete(ids=[id], namespace="question_sql")
        return True
    if id.endswith("_doc"):
        self.index.delete(ids=[id], namespace="documentation")
        return True
    # Unknown suffix: nothing to delete.
    return False
|
164 |
+
|
165 |
+
def generate_embedding(self, text, **kwargs):
    # Unimplemented stub (returns None). Embeddings are produced through
    # self.embeddings.embed_query in the add_*/get_* methods instead, so this
    # hook is not exercised on the code paths visible here — presumably a
    # VannaBase abstract method that must exist; confirm before relying on it.
    pass
|
168 |
+
|
169 |
+
|
170 |
+
def get_sql_prompt(
|
171 |
+
self,
|
172 |
+
initial_prompt : str,
|
173 |
+
question: str,
|
174 |
+
question_sql_list: list,
|
175 |
+
ddl_list: list,
|
176 |
+
doc_list: list,
|
177 |
+
**kwargs,
|
178 |
+
):
|
179 |
+
"""
|
180 |
+
Example:
|
181 |
+
```python
|
182 |
+
vn.get_sql_prompt(
|
183 |
+
question="What are the top 10 customers by sales?",
|
184 |
+
question_sql_list=[{"question": "What are the top 10 customers by sales?", "sql": "SELECT * FROM customers ORDER BY sales DESC LIMIT 10"}],
|
185 |
+
ddl_list=["CREATE TABLE customers (id INT, name TEXT, sales DECIMAL)"],
|
186 |
+
doc_list=["The customers table contains information about customers and their sales."],
|
187 |
+
)
|
188 |
+
|
189 |
+
```
|
190 |
+
|
191 |
+
This method is used to generate a prompt for the LLM to generate SQL.
|
192 |
+
|
193 |
+
Args:
|
194 |
+
question (str): The question to generate SQL for.
|
195 |
+
question_sql_list (list): A list of questions and their corresponding SQL statements.
|
196 |
+
ddl_list (list): A list of DDL statements.
|
197 |
+
doc_list (list): A list of documentation.
|
198 |
+
|
199 |
+
Returns:
|
200 |
+
any: The prompt for the LLM to generate SQL.
|
201 |
+
"""
|
202 |
+
|
203 |
+
if initial_prompt is None:
|
204 |
+
initial_prompt = f"You are a {self.dialect} expert. " + \
|
205 |
+
"Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. "
|
206 |
+
|
207 |
+
initial_prompt = self.add_ddl_to_prompt(
|
208 |
+
initial_prompt, ddl_list, max_tokens=self.max_tokens
|
209 |
+
)
|
210 |
+
|
211 |
+
if self.static_documentation != "":
|
212 |
+
doc_list.append(self.static_documentation)
|
213 |
+
|
214 |
+
initial_prompt = self.add_documentation_to_prompt(
|
215 |
+
initial_prompt, doc_list, max_tokens=self.max_tokens
|
216 |
+
)
|
217 |
+
|
218 |
+
# initial_prompt = self.add_sql_to_prompt(
|
219 |
+
# initial_prompt, question_sql_list, max_tokens=self.max_tokens
|
220 |
+
# )
|
221 |
+
|
222 |
+
|
223 |
+
initial_prompt += (
|
224 |
+
"===Response Guidelines \n"
|
225 |
+
"1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
|
226 |
+
"2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
|
227 |
+
"3. If the provided context is insufficient, please give a sql query based on your knowledge and the context provided. \n"
|
228 |
+
"4. Please use the most relevant table(s). \n"
|
229 |
+
"5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
|
230 |
+
f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
|
231 |
+
f"7. Add a description of the table in the result of the sql query, if relevant. \n"
|
232 |
+
"8 Make sure to include the relevant KPI in the SQL query. The query should return impactfull data \n"
|
233 |
+
# f"8. If a set of latitude,longitude is provided, make a intermediate query to find the nearest value in the table and replace the coordinates in the sql query. \n"
|
234 |
+
# "7. Add a description of the table in the result of the sql query."
|
235 |
+
# "7. If the question is about a specific latitude, longitude, query an interval of 0.3 and keep only the first set of coordinate. \n"
|
236 |
+
# "7. Table names should be included in the result of the sql query. Use for example Mean_winter_temperature AS table_name in the query \n"
|
237 |
+
)
|
238 |
+
|
239 |
+
|
240 |
+
message_log = [self.system_message(initial_prompt)]
|
241 |
+
|
242 |
+
for example in question_sql_list:
|
243 |
+
if example is None:
|
244 |
+
print("example is None")
|
245 |
+
else:
|
246 |
+
if example is not None and "question" in example and "sql" in example:
|
247 |
+
message_log.append(self.user_message(example["question"]))
|
248 |
+
message_log.append(self.assistant_message(example["sql"]))
|
249 |
+
|
250 |
+
message_log.append(self.user_message(question))
|
251 |
+
|
252 |
+
return message_log
|
253 |
+
|
254 |
+
|
255 |
+
# def get_sql_prompt(
|
256 |
+
# self,
|
257 |
+
# initial_prompt : str,
|
258 |
+
# question: str,
|
259 |
+
# question_sql_list: list,
|
260 |
+
# ddl_list: list,
|
261 |
+
# doc_list: list,
|
262 |
+
# **kwargs,
|
263 |
+
# ):
|
264 |
+
# """
|
265 |
+
# Example:
|
266 |
+
# ```python
|
267 |
+
# vn.get_sql_prompt(
|
268 |
+
# question="What are the top 10 customers by sales?",
|
269 |
+
# question_sql_list=[{"question": "What are the top 10 customers by sales?", "sql": "SELECT * FROM customers ORDER BY sales DESC LIMIT 10"}],
|
270 |
+
# ddl_list=["CREATE TABLE customers (id INT, name TEXT, sales DECIMAL)"],
|
271 |
+
# doc_list=["The customers table contains information about customers and their sales."],
|
272 |
+
# )
|
273 |
+
|
274 |
+
# ```
|
275 |
+
|
276 |
+
# This method is used to generate a prompt for the LLM to generate SQL.
|
277 |
+
|
278 |
+
# Args:
|
279 |
+
# question (str): The question to generate SQL for.
|
280 |
+
# question_sql_list (list): A list of questions and their corresponding SQL statements.
|
281 |
+
# ddl_list (list): A list of DDL statements.
|
282 |
+
# doc_list (list): A list of documentation.
|
283 |
+
|
284 |
+
# Returns:
|
285 |
+
# any: The prompt for the LLM to generate SQL.
|
286 |
+
# """
|
287 |
+
|
288 |
+
# if initial_prompt is None:
|
289 |
+
# initial_prompt = f"You are a {self.dialect} expert. " + \
|
290 |
+
# "Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. "
|
291 |
+
|
292 |
+
# initial_prompt = self.add_ddl_to_prompt(
|
293 |
+
# initial_prompt, ddl_list, max_tokens=self.max_tokens
|
294 |
+
# )
|
295 |
+
|
296 |
+
# if self.static_documentation != "":
|
297 |
+
# doc_list.append(self.static_documentation)
|
298 |
+
|
299 |
+
# initial_prompt = self.add_documentation_to_prompt(
|
300 |
+
# initial_prompt, doc_list, max_tokens=self.max_tokens
|
301 |
+
# )
|
302 |
+
|
303 |
+
# initial_prompt += (
|
304 |
+
# "===Response Guidelines \n"
|
305 |
+
# "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
|
306 |
+
# "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
|
307 |
+
# "3. If the provided context is insufficient, please explain why it can't be generated. \n"
|
308 |
+
# "4. Please use the most relevant table(s). \n"
|
309 |
+
# "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
|
310 |
+
# f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
|
311 |
+
# )
|
312 |
+
|
313 |
+
# message_log = [self.system_message(initial_prompt)]
|
314 |
+
|
315 |
+
# for example in question_sql_list:
|
316 |
+
# if example is None:
|
317 |
+
# print("example is None")
|
318 |
+
# else:
|
319 |
+
# if example is not None and "question" in example and "sql" in example:
|
320 |
+
# message_log.append(self.user_message(example["question"]))
|
321 |
+
# message_log.append(self.assistant_message(example["sql"]))
|
322 |
+
|
323 |
+
# message_log.append(self.user_message(question))
|
324 |
+
|
325 |
+
# return message_log
|
front/tabs/tab_papers.py
CHANGED
@@ -3,6 +3,8 @@ from gradio_modal import Modal
|
|
3 |
|
4 |
|
5 |
def create_papers_tab():
|
|
|
|
|
6 |
with gr.Accordion(
|
7 |
visible=True,
|
8 |
elem_id="papers-summary-popup",
|
@@ -32,5 +34,5 @@ def create_papers_tab():
|
|
32 |
papers_modal
|
33 |
)
|
34 |
|
35 |
-
return papers_summary, papers_html, citations_network, papers_modal
|
36 |
|
|
|
3 |
|
4 |
|
5 |
def create_papers_tab():
|
6 |
+
direct_search_textbox = gr.Textbox(label="Direct search for papers", placeholder= "What is climate change ?", elem_id="papers-search")
|
7 |
+
|
8 |
with gr.Accordion(
|
9 |
visible=True,
|
10 |
elem_id="papers-summary-popup",
|
|
|
34 |
papers_modal
|
35 |
)
|
36 |
|
37 |
+
return direct_search_textbox, papers_summary, papers_html, citations_network, papers_modal
|
38 |
|
requirements.txt
CHANGED
@@ -19,3 +19,5 @@ langchain-community==0.2
|
|
19 |
msal==1.31
|
20 |
matplotlib==3.9.2
|
21 |
gradio-modal==0.0.4
|
|
|
|
|
|
19 |
msal==1.31
|
20 |
matplotlib==3.9.2
|
21 |
gradio-modal==0.0.4
|
22 |
+
vanna==0.7.5
|
23 |
+
geopy==2.4.1
|
sandbox/talk_to_data/20250306 - CQA - Drias.ipynb
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"## Import the function in main.py"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"import sys\n",
|
17 |
+
"import os\n",
|
18 |
+
"sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))\n",
|
19 |
+
"\n",
|
20 |
+
"%load_ext autoreload\n",
|
21 |
+
"%autoreload 2\n",
|
22 |
+
"\n",
|
23 |
+
"from climateqa.engine.talk_to_data.main import ask_vanna\n"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"cell_type": "markdown",
|
28 |
+
"metadata": {},
|
29 |
+
"source": [
|
30 |
+
"## Create a human query"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"cell_type": "code",
|
35 |
+
"execution_count": null,
|
36 |
+
"metadata": {},
|
37 |
+
"outputs": [],
|
38 |
+
"source": [
|
39 |
+
"query = \"Comment vont évoluer les températures à marseille ?\""
|
40 |
+
]
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"cell_type": "markdown",
|
44 |
+
"metadata": {},
|
45 |
+
"source": [
|
46 |
+
"## Call the function ask vanna, it gives an output of a the sql query and the dataframe of the result (tuple)"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": null,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"sql_query, df, fig = ask_vanna(query)\n",
|
56 |
+
"print(df.head())\n",
|
57 |
+
"fig.show()"
|
58 |
+
]
|
59 |
+
}
|
60 |
+
],
|
61 |
+
"metadata": {
|
62 |
+
"kernelspec": {
|
63 |
+
"display_name": "climateqa",
|
64 |
+
"language": "python",
|
65 |
+
"name": "python3"
|
66 |
+
},
|
67 |
+
"language_info": {
|
68 |
+
"codemirror_mode": {
|
69 |
+
"name": "ipython",
|
70 |
+
"version": 3
|
71 |
+
},
|
72 |
+
"file_extension": ".py",
|
73 |
+
"mimetype": "text/x-python",
|
74 |
+
"name": "python",
|
75 |
+
"nbconvert_exporter": "python",
|
76 |
+
"pygments_lexer": "ipython3",
|
77 |
+
"version": "3.11.9"
|
78 |
+
}
|
79 |
+
},
|
80 |
+
"nbformat": 4,
|
81 |
+
"nbformat_minor": 2
|
82 |
+
}
|
sandbox/talk_to_data/20250306 - CQA - Step_by_step_vanna.ipynb
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import sys\n",
|
10 |
+
"import os\n",
|
11 |
+
"sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))\n",
|
12 |
+
"\n",
|
13 |
+
"%load_ext autoreload\n",
|
14 |
+
"%autoreload 2\n",
|
15 |
+
"\n",
|
16 |
+
"from climateqa.engine.talk_to_data.main import ask_vanna\n",
|
17 |
+
"\n",
|
18 |
+
"import sqlite3\n",
|
19 |
+
"import os\n",
|
20 |
+
"import pandas as pd"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "markdown",
|
25 |
+
"metadata": {},
|
26 |
+
"source": [
|
27 |
+
"# Imports"
|
28 |
+
]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"cell_type": "code",
|
32 |
+
"execution_count": null,
|
33 |
+
"metadata": {},
|
34 |
+
"outputs": [],
|
35 |
+
"source": [
|
36 |
+
"from climateqa.engine.talk_to_data.myVanna import MyVanna\n",
|
37 |
+
"from climateqa.engine.talk_to_data.utils import loc2coords, detect_location_with_openai, detectTable, nearestNeighbourSQL, detect_relevant_tables, replace_coordonates#,nearestNeighbourPostgres\n",
|
38 |
+
"\n",
|
39 |
+
"from climateqa.engine.llm import get_llm"
|
40 |
+
]
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"cell_type": "markdown",
|
44 |
+
"metadata": {},
|
45 |
+
"source": [
|
46 |
+
"# Vanna Ask\n"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": null,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"from dotenv import load_dotenv\n",
|
56 |
+
"\n",
|
57 |
+
"load_dotenv()\n",
|
58 |
+
"\n",
|
59 |
+
"llm = get_llm(provider=\"openai\")\n",
|
60 |
+
"\n",
|
61 |
+
"OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n",
|
62 |
+
"PC_API_KEY = os.getenv('VANNA_PINECONE_API_KEY')\n",
|
63 |
+
"INDEX_NAME = os.getenv('VANNA_INDEX_NAME')\n",
|
64 |
+
"VANNA_MODEL = os.getenv('VANNA_MODEL')\n",
|
65 |
+
"\n",
|
66 |
+
"ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))\n",
|
67 |
+
"\n",
|
68 |
+
"#Vanna object\n",
|
69 |
+
"vn = MyVanna(config = {\"temperature\": 0, \"api_key\": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME, \"top_k\" : 4})\n",
|
70 |
+
"\n",
|
71 |
+
"db_vanna_path = ROOT_PATH + \"/data/drias/drias.db\"\n",
|
72 |
+
"vn.connect_to_sqlite(db_vanna_path)\n"
|
73 |
+
]
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"cell_type": "markdown",
|
77 |
+
"metadata": {},
|
78 |
+
"source": [
|
79 |
+
"# User query"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": null,
|
85 |
+
"metadata": {},
|
86 |
+
"outputs": [],
|
87 |
+
"source": [
|
88 |
+
"query = \"Quelle sera la température à Marseille sur les prochaines années ?\""
|
89 |
+
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"cell_type": "markdown",
|
93 |
+
"metadata": {},
|
94 |
+
"source": [
|
95 |
+
"## Detect location"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": null,
|
101 |
+
"metadata": {},
|
102 |
+
"outputs": [],
|
103 |
+
"source": [
|
104 |
+
"location = detect_location_with_openai(OPENAI_API_KEY, query)\n",
|
105 |
+
"print(location)"
|
106 |
+
]
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"cell_type": "markdown",
|
110 |
+
"metadata": {},
|
111 |
+
"source": [
|
112 |
+
"## Convert location to longitude, latitude coordonate"
|
113 |
+
]
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"cell_type": "code",
|
117 |
+
"execution_count": null,
|
118 |
+
"metadata": {},
|
119 |
+
"outputs": [],
|
120 |
+
"source": [
|
121 |
+
"coords = loc2coords(location)\n",
|
122 |
+
"user_input = query.lower().replace(location.lower(), f\"lat, long : {coords}\")\n",
|
123 |
+
"print(user_input)"
|
124 |
+
]
|
125 |
+
},
|
126 |
+
{
|
127 |
+
"cell_type": "markdown",
|
128 |
+
"metadata": {},
|
129 |
+
"source": [
|
130 |
+
"# Find closest coordonates and replace lat,lon\n"
|
131 |
+
]
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"cell_type": "code",
|
135 |
+
"execution_count": null,
|
136 |
+
"metadata": {},
|
137 |
+
"outputs": [],
|
138 |
+
"source": [
|
139 |
+
"relevant_tables = detect_relevant_tables(user_input, llm) \n",
|
140 |
+
"coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, relevant_tables[i]) for i in range(len(relevant_tables))]\n",
|
141 |
+
"user_input_with_coords = replace_coordonates(coords, user_input, coords_tables)\n",
|
142 |
+
"print(user_input_with_coords)"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"cell_type": "markdown",
|
147 |
+
"metadata": {},
|
148 |
+
"source": [
|
149 |
+
"# Ask Vanna with correct coordonates"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": null,
|
155 |
+
"metadata": {},
|
156 |
+
"outputs": [],
|
157 |
+
"source": [
|
158 |
+
"user_input_with_coords"
|
159 |
+
]
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"cell_type": "code",
|
163 |
+
"execution_count": null,
|
164 |
+
"metadata": {},
|
165 |
+
"outputs": [],
|
166 |
+
"source": [
|
167 |
+
"sql_query, result_dataframe, figure = vn.ask(user_input_with_coords, print_results=False, allow_llm_to_see_data=True, auto_train=False)\n",
|
168 |
+
"print(result_dataframe.head())"
|
169 |
+
]
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"cell_type": "code",
|
173 |
+
"execution_count": null,
|
174 |
+
"metadata": {},
|
175 |
+
"outputs": [],
|
176 |
+
"source": [
|
177 |
+
"result_dataframe"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"cell_type": "code",
|
182 |
+
"execution_count": null,
|
183 |
+
"metadata": {},
|
184 |
+
"outputs": [],
|
185 |
+
"source": [
|
186 |
+
"figure"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"cell_type": "code",
|
191 |
+
"execution_count": null,
|
192 |
+
"metadata": {},
|
193 |
+
"outputs": [],
|
194 |
+
"source": []
|
195 |
+
}
|
196 |
+
],
|
197 |
+
"metadata": {
|
198 |
+
"kernelspec": {
|
199 |
+
"display_name": "climateqa",
|
200 |
+
"language": "python",
|
201 |
+
"name": "python3"
|
202 |
+
},
|
203 |
+
"language_info": {
|
204 |
+
"codemirror_mode": {
|
205 |
+
"name": "ipython",
|
206 |
+
"version": 3
|
207 |
+
},
|
208 |
+
"file_extension": ".py",
|
209 |
+
"mimetype": "text/x-python",
|
210 |
+
"name": "python",
|
211 |
+
"nbconvert_exporter": "python",
|
212 |
+
"pygments_lexer": "ipython3",
|
213 |
+
"version": "3.11.9"
|
214 |
+
}
|
215 |
+
},
|
216 |
+
"nbformat": 4,
|
217 |
+
"nbformat_minor": 2
|
218 |
+
}
|
style.css
CHANGED
@@ -481,14 +481,13 @@ a {
|
|
481 |
max-height: calc(100vh - 190px) !important;
|
482 |
overflow: hidden;
|
483 |
}
|
484 |
-
|
485 |
div#tab-examples,
|
486 |
div#sources-textbox,
|
487 |
div#tab-config {
|
488 |
height: calc(100vh - 190px) !important;
|
489 |
overflow-y: scroll !important;
|
490 |
}
|
491 |
-
|
492 |
div#sources-figures,
|
493 |
div#graphs-container,
|
494 |
div#tab-citations {
|
@@ -606,3 +605,16 @@ a {
|
|
606 |
#checkbox-config:checked {
|
607 |
display: block;
|
608 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
481 |
max-height: calc(100vh - 190px) !important;
|
482 |
overflow: hidden;
|
483 |
}
|
|
|
484 |
div#tab-examples,
|
485 |
div#sources-textbox,
|
486 |
div#tab-config {
|
487 |
height: calc(100vh - 190px) !important;
|
488 |
overflow-y: scroll !important;
|
489 |
}
|
490 |
+
div#tab-vanna,
|
491 |
div#sources-figures,
|
492 |
div#graphs-container,
|
493 |
div#tab-citations {
|
|
|
605 |
#checkbox-config:checked {
|
606 |
display: block;
|
607 |
}
|
608 |
+
|
609 |
+
#vanna-display {
|
610 |
+
max-height: 300px;
|
611 |
+
/* overflow-y: scroll; */
|
612 |
+
}
|
613 |
+
#sql-query{
|
614 |
+
max-height: 100px;
|
615 |
+
overflow-y:scroll;
|
616 |
+
}
|
617 |
+
#vanna-details{
|
618 |
+
max-height: 500px;
|
619 |
+
overflow-y:scroll;
|
620 |
+
}
|