add_drias_talk_to_data

#19
by timeki - opened
.gitignore CHANGED
@@ -11,3 +11,10 @@ notebooks/
 
 data/
 sandbox/
+
+ climateqa/talk_to_data/database/
+ *.db
+
+ data_ingestion/
+ .vscode
+ *old/
app.py CHANGED
@@ -1,54 +1,32 @@
- from climateqa.engine.embeddings import get_embeddings_function
- embeddings_function = get_embeddings_function()
-
- from sentence_transformers import CrossEncoder
-
- # reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
-
- import gradio as gr
- from gradio_modal import Modal
- import pandas as pd
- import numpy as np
  import os
- import time
- import re
- import json
-
- from gradio import ChatMessage
-
- # from gradio_modal import Modal
-
- from io import BytesIO
- import base64
 
- from datetime import datetime
  from azure.storage.fileshare import ShareServiceClient
 
- from utils import create_user_id
 
  from gradio_modal import Modal
 
- from PIL import Image
 
- from langchain_core.runnables.schema import StreamEvent
 
- # ClimateQ&A imports
- from climateqa.engine.llm import get_llm
- from climateqa.engine.vectorstore import get_pinecone_vectorstore
- # from climateqa.knowledge.retriever import ClimateQARetriever
- from climateqa.engine.reranker import get_reranker
- from climateqa.engine.embeddings import get_embeddings_function
- from climateqa.engine.chains.prompts import audience_prompts
- from climateqa.sample_questions import QUESTIONS
- from climateqa.constants import POSSIBLE_REPORTS, OWID_CATEGORIES
- from climateqa.utils import get_image_from_azure_blob_storage
- from climateqa.engine.graph import make_graph_agent
- from climateqa.engine.embeddings import get_embeddings_function
- from climateqa.engine.chains.retrieve_papers import find_papers
 
- from front.utils import serialize_docs,process_figures
 
- from climateqa.event_handler import init_audience, handle_retrieved_documents, stream_answer,handle_retrieved_owid_graphs
 
  # Load environment variables in local mode
  try:
@@ -57,7 +35,6 @@ try:
  except Exception as e:
      pass
 
- import requests
 
  # Set up Gradio Theme
  theme = gr.themes.Base(
@@ -66,15 +43,7 @@ theme = gr.themes.Base(
      font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
  )
 
-
-
- init_prompt = ""
-
- system_template = {
-     "role": "system",
-     "content": init_prompt,
- }
-
  account_key = os.environ["BLOB_ACCOUNT_KEY"]
  if len(account_key) == 86:
      account_key += "=="
@@ -92,586 +61,262 @@ share_client = service.get_share_client(file_share_name)
  user_id = create_user_id()
 
 
- CITATION_LABEL = "BibTeX citation for ClimateQ&A"
- CITATION_TEXT = r"""@misc{climateqa,
-     author={Théo Alves Da Costa, Timothée Bohe},
-     title={ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
-     year={2024},
-     howpublished= {\url{https://climateqa.com}},
- }
- @software{climateqa,
-     author = {Théo Alves Da Costa, Timothée Bohe},
-     publisher = {ClimateQ&A},
-     title = {ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
- }
- """
-
-
 
  # Create vectorstore and retriever
- vectorstore = get_pinecone_vectorstore(embeddings_function, index_name = os.getenv("PINECONE_API_INDEX"))
- vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name = os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
 
  llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
- reranker = get_reranker("nano")
-
- agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
-
- def update_config_modal_visibility(config_open):
-     new_config_visibility_status = not config_open
-     return gr.update(visible=new_config_visibility_status), new_config_visibility_status
-
- async def chat(query, history, audience, sources, reports, relevant_content_sources, search_only):
-     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
-     (messages in gradio format, messages in langchain format, source documents)"""
-
-     date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-     print(f">> NEW QUESTION ({date_now}) : {query}")
-
-     audience_prompt = init_audience(audience)
-
-     # Prepare default values
-     if sources is None or len(sources) == 0:
-         sources = ["IPCC", "IPBES", "IPOS"]
-
-     if reports is None or len(reports) == 0:
-         reports = []
-
-     inputs = {"user_input": query,"audience": audience_prompt,"sources_input":sources, "relevant_content_sources" : relevant_content_sources, "search_only": search_only}
-     result = agent.astream_events(inputs,version = "v1")
-
-
-     docs = []
-     used_figures=[]
-     related_contents = []
-     docs_html = ""
-     output_query = ""
-     output_language = ""
-     output_keywords = ""
-     start_streaming = False
-     graphs_html = ""
-     figures = '<div class="figures-container"><p></p> </div>'
-
-     steps_display = {
-         "categorize_intent":("🔄️ Analyzing user message",True),
-         "transform_query":("🔄️ Thinking step by step to answer the question",True),
-         "retrieve_documents":("🔄️ Searching in the knowledge base",False),
-     }
-
-     used_documents = []
-     answer_message_content = ""
-     try:
-         async for event in result:
-             if "langgraph_node" in event["metadata"]:
-                 node = event["metadata"]["langgraph_node"]
-
-                 if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" :# when documents are retrieved
-                     docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(event, history, used_documents)
-
-                 elif event["event"] == "on_chain_end" and node == "categorize_intent" and event["name"] == "_write": # when the query is transformed
-
-                     intent = event["data"]["output"]["intent"]
-                     if "language" in event["data"]["output"]:
-                         output_language = event["data"]["output"]["language"]
-                     else :
-                         output_language = "English"
-                     history[-1].content = f"Language identified : {output_language} \n Intent identified : {intent}"
-
-
-                 elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": #display steps
-                     event_description, display_output = steps_display[node]
-                     if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
-                         history.append(ChatMessage(role="assistant", content = "", metadata={'title' :event_description}))
-
-                 elif event["name"] != "transform_query" and event["event"] == "on_chat_model_stream" and node in ["answer_rag", "answer_search","answer_chitchat"]:# if streaming answer
-                     history, start_streaming, answer_message_content = stream_answer(history, event, start_streaming, answer_message_content)
-
-                 elif event["name"] in ["retrieve_graphs", "retrieve_graphs_ai"] and event["event"] == "on_chain_end":
-                     graphs_html = handle_retrieved_owid_graphs(event, graphs_html)
-
-
-             if event["name"] == "transform_query" and event["event"] =="on_chain_end":
-                 if hasattr(history[-1],"content"):
-                     history[-1].content += "Decompose question into sub-questions: \n\n - " + "\n - ".join([q["question"] for q in event["data"]["output"]["remaining_questions"]])
-
-             if event["name"] == "categorize_intent" and event["event"] == "on_chain_start":
-                 print("X")
-
-             yield history, docs_html, output_query, output_language, related_contents , graphs_html, #,output_query,output_keywords
-
-     except Exception as e:
-         print(event, "has failed")
-         raise gr.Error(f"{e}")
-
-
-     try:
-         # Log answer on Azure Blob Storage
-         if os.getenv("GRADIO_ENV") != "local":
-             timestamp = str(datetime.now().timestamp())
-             file = timestamp + ".json"
-             prompt = history[1]["content"]
-             logs = {
-                 "user_id": str(user_id),
-                 "prompt": prompt,
-                 "query": prompt,
-                 "question":output_query,
-                 "sources":sources,
-                 "docs":serialize_docs(docs),
-                 "answer": history[-1].content,
-                 "time": timestamp,
-             }
-             log_on_azure(file, logs, share_client)
-     except Exception as e:
-         print(f"Error logging on Azure Blob Storage: {e}")
-         raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)")
-
-     yield history, docs_html, output_query, output_language, related_contents, graphs_html
-
-
- def save_feedback(feed: str, user_id):
-     if len(feed) > 1:
-         timestamp = str(datetime.now().timestamp())
-         file = user_id + timestamp + ".json"
-         logs = {
-             "user_id": user_id,
-             "feedback": feed,
-             "time": timestamp,
-         }
-         log_on_azure(file, logs, share_client)
-         return "Feedback submitted, thank you!"
-
-
-
-
- def log_on_azure(file, logs, share_client):
-     logs = json.dumps(logs)
-     file_client = share_client.get_file_client(file)
-     file_client.upload_file(logs)
 
 
  # --------------------------------------------------------------------
  # Gradio
  # --------------------------------------------------------------------
 
 
- init_prompt = """
- Hello, I am ClimateQ&A, a conversational assistant designed to help you understand climate change and biodiversity loss. I will answer your questions by **sifting through the IPCC and IPBES scientific reports**.
-
- How to use
- - **Language**: You can ask me your questions in any language.
- - **Audience**: You can specify your audience (children, general public, experts) to get a more adapted answer.
- - **Sources**: You can choose to search in the IPCC or IPBES reports, or both.
- - **Relevant content sources**: You can choose to search for figures, papers, or graphs that can be relevant for your question.
-
- ⚠️ Limitations
- *Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
-
- 🛈 Information
- Please note that we log your questions for meta-analysis purposes, so avoid sharing any sensitive or personal information.
-
-
- What do you want to learn ?
- """
-
-
- def vote(data: gr.LikeData):
-     if data.liked:
-         print(data.value)
-     else:
-         print(data)
-
- def save_graph(saved_graphs_state, embedding, category):
-     print(f"\nCategory:\n{saved_graphs_state}\n")
-     if category not in saved_graphs_state:
-         saved_graphs_state[category] = []
-     if embedding not in saved_graphs_state[category]:
-         saved_graphs_state[category].append(embedding)
-     return saved_graphs_state, gr.Button("Graph Saved")
 
 
- with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
-     chat_completed_state = gr.State(0)
      current_graphs = gr.State([])
-     saved_graphs = gr.State({})
-     config_open = gr.State(False)
-
-
-     with gr.Tab("ClimateQ&A"):
-
          with gr.Row(elem_id="chatbot-row"):
              with gr.Column(scale=2):
-                 chatbot = gr.Chatbot(
-                     value = [ChatMessage(role="assistant", content=init_prompt)],
-                     type = "messages",
-                     show_copy_button=True,
-                     show_label = False,
-                     elem_id="chatbot",
-                     layout = "panel",
-                     avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
-                     max_height="80vh",
-                     height="100vh"
-                 )
-
-                 # bot.like(vote,None,None)
-
-
-
-                 with gr.Row(elem_id = "input-message"):
-                     textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
-
-                     config_button = gr.Button("",elem_id="config-button")
-                     # config_checkbox_button = gr.Checkbox(label = '⚙️', value="show",visible=True, interactive=True, elem_id="checkbox-config")
-
-
-
-             with gr.Column(scale=2, variant="panel",elem_id = "right-panel"):
-
-
-                 with gr.Tabs(elem_id = "right_panel_tab") as tabs:
-                     with gr.TabItem("Examples",elem_id = "tab-examples",id = 0):
-
-                         examples_hidden = gr.Textbox(visible = False)
-                         first_key = list(QUESTIONS.keys())[0]
-                         dropdown_samples = gr.Dropdown(QUESTIONS.keys(),value = first_key,interactive = True,show_label = True,label = "Select a category of sample questions",elem_id = "dropdown-samples")
-
-                         samples = []
-                         for i,key in enumerate(QUESTIONS.keys()):
-
-                             examples_visible = True if i == 0 else False
-
-                             with gr.Row(visible = examples_visible) as group_examples:
-
-                                 examples_questions = gr.Examples(
-                                     QUESTIONS[key],
-                                     [examples_hidden],
-                                     examples_per_page=8,
-                                     run_on_click=False,
-                                     elem_id=f"examples{i}",
-                                     api_name=f"examples{i}",
-                                     # label = "Click on the example question or enter your own",
-                                     # cache_examples=True,
-                                 )
-
-                             samples.append(group_examples)
-
-                     # with gr.Tab("Configuration", id = 10, ) as tab_config:
-                     #     # gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
-
-                     #     pass
-
-                     #     with gr.Row():
-
-                     #         dropdown_sources = gr.CheckboxGroup(
-                     #             ["IPCC", "IPBES","IPOS"],
-                     #             label="Select source",
-                     #             value=["IPCC"],
-                     #             interactive=True,
-                     #         )
-                     #         dropdown_external_sources = gr.CheckboxGroup(
-                     #             ["IPCC figures","OpenAlex", "OurWorldInData"],
-                     #             label="Select database to search for relevant content",
-                     #             value=["IPCC figures"],
-                     #             interactive=True,
-                     #         )
-
-                     #         dropdown_reports = gr.Dropdown(
-                     #             POSSIBLE_REPORTS,
-                     #             label="Or select specific reports",
-                     #             multiselect=True,
-                     #             value=None,
-                     #             interactive=True,
-                     #         )
-
-                     #     search_only = gr.Checkbox(label="Search only without chating", value=False, interactive=True, elem_id="checkbox-chat")
-
-
-                     #     dropdown_audience = gr.Dropdown(
-                     #         ["Children","General public","Experts"],
-                     #         label="Select audience",
-                     #         value="Experts",
-                     #         interactive=True,
-                     #     )
-
-
-                     #     after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers", visible=False)
 
-                     #     output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False, visible= False)
-                     #     output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False, visible= False)
 
 
-                     #     dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
-                     #     # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
 
 
-                     with gr.Tab("Sources",elem_id = "tab-sources",id = 1) as tab_sources:
-                         sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
-
-
-
-                     with gr.Tab("Recommended content", elem_id="tab-recommended_content",id=2) as tab_recommended_content:
-                         with gr.Tabs(elem_id = "group-subtabs") as tabs_recommended_content:
-
-                             with gr.Tab("Figures",elem_id = "tab-figures",id = 3) as tab_figures:
-                                 sources_raw = gr.State()
-
-                                 with Modal(visible=False, elem_id="modal_figure_galery") as figure_modal:
-                                     gallery_component = gr.Gallery(object_fit='scale-down',elem_id="gallery-component", height="80vh")
-
-                                 show_full_size_figures = gr.Button("Show figures in full size",elem_id="show-figures",interactive=True)
-                                 show_full_size_figures.click(lambda : Modal(visible=True),None,figure_modal)
-
-                                 figures_cards = gr.HTML(show_label=False, elem_id="sources-figures")
-
-
-
-                             with gr.Tab("Papers",elem_id = "tab-citations",id = 4) as tab_papers:
-                                 # btn_summary = gr.Button("Summary")
-                                 # Fenêtre simulée pour le Summary
-                                 with gr.Accordion(visible=True, elem_id="papers-summary-popup", label= "See summary of relevant papers", open= False) as summary_popup:
-                                     papers_summary = gr.Markdown("", visible=True, elem_id="papers-summary")
-
-                                 # btn_relevant_papers = gr.Button("Relevant papers")
-                                 # Fenêtre simulée pour les Relevant Papers
-                                 with gr.Accordion(visible=True, elem_id="papers-relevant-popup",label= "See relevant papers", open= False) as relevant_popup:
-                                     papers_html = gr.HTML(show_label=False, elem_id="papers-textbox")
-
-                                 btn_citations_network = gr.Button("Explore papers citations network")
-                                 # Fenêtre simulée pour le Citations Network
-                                 with Modal(visible=False) as papers_modal:
-                                     citations_network = gr.HTML("<h3>Citations Network Graph</h3>", visible=True, elem_id="papers-citations-network")
-                                 btn_citations_network.click(lambda: Modal(visible=True), None, papers_modal)
-
-
-
                              with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
 
-                                 graphs_container = gr.HTML("<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",elem_id="graphs-container")
-                                 current_graphs.change(lambda x : x, inputs=[current_graphs], outputs=[graphs_container])
-
-     with Modal(visible=False,elem_id="modal-config") as config_modal:
-         gr.Markdown("Reminders: You can talk in any language, ClimateQ&A is multi-lingual!")
-
-
-         # with gr.Row():
-
-         dropdown_sources = gr.CheckboxGroup(
-             ["IPCC", "IPBES","IPOS"],
-             label="Select source (by default search in all sources)",
-             value=["IPCC"],
-             interactive=True,
-         )
-
-         dropdown_reports = gr.Dropdown(
-             POSSIBLE_REPORTS,
-             label="Or select specific reports",
-             multiselect=True,
-             value=None,
-             interactive=True,
-         )
 
-         dropdown_external_sources = gr.CheckboxGroup(
-             ["IPCC figures","OpenAlex", "OurWorldInData"],
-             label="Select database to search for relevant content",
-             value=["IPCC figures"],
-             interactive=True,
-         )
-
-         search_only = gr.Checkbox(label="Search only for recommended content without chating", value=False, interactive=True, elem_id="checkbox-chat")
-
-
-         dropdown_audience = gr.Dropdown(
-             ["Children","General public","Experts"],
-             label="Select audience",
-             value="Experts",
-             interactive=True,
-         )
-
-
-         after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers", visible=False)
-
-
-         output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False, visible= False)
-         output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False, visible= False)
-
-
-         dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after])
-
-         close_config_modal = gr.Button("Validate and Close",elem_id="close-config-modal")
-         close_config_modal.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
-         # dropdown_external_sources.change(lambda x: gr.update(visible = True ) if "OpenAlex" in x else gr.update(visible=False) , inputs=[dropdown_external_sources], outputs=[after], visible=True)
-
-
-
-     config_button.click(fn=update_config_modal_visibility, inputs=[config_open], outputs=[config_modal, config_open])
-
-     # with gr.Tab("OECD",elem_id = "tab-oecd",id = 6):
-     #     oecd_indicator = "RIVER_FLOOD_RP100_POP_SH"
-     #     oecd_topic = "climate"
-     #     oecd_latitude = "46.8332"
-     #     oecd_longitude = "5.3725"
-     #     oecd_zoom = "5.6442"
-     #     # Create the HTML content with the iframe
-     #     iframe_html = f"""
-     #     <iframe src="https://localdataportal.oecd.org/maps.html?indicator={oecd_indicator}&topic={oecd_topic}&latitude={oecd_latitude}&longitude={oecd_longitude}&zoom={oecd_zoom}"
-     #     width="100%" height="600" frameborder="0" style="border:0;" allowfullscreen></iframe>
-     #     """
-     #     oecd_textbox = gr.HTML(iframe_html, show_label=False, elem_id="oecd-textbox")
-
-
-
-
-     #---------------------------------------------------------------------------------------
-     # OTHER TABS
-     #---------------------------------------------------------------------------------------
-
-     # with gr.Tab("Settings",elem_id = "tab-config",id = 2):
-
-     #     gr.Markdown("Reminder: You can talk in any language, ClimateQ&A is multi-lingual!")
-
-
-     #     dropdown_sources = gr.CheckboxGroup(
-     #         ["IPCC", "IPBES","IPOS", "OpenAlex"],
-     #         label="Select source",
-     #         value=["IPCC"],
-     #         interactive=True,
-     #     )
-
-     #     dropdown_reports = gr.Dropdown(
-     #         POSSIBLE_REPORTS,
-     #         label="Or select specific reports",
-     #         multiselect=True,
-     #         value=None,
-     #         interactive=True,
-     #     )
-
-     #     dropdown_audience = gr.Dropdown(
-     #         ["Children","General public","Experts"],
-     #         label="Select audience",
-     #         value="Experts",
-     #         interactive=True,
-     #     )
-
-
-     #     output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
-     #     output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
-
-
-     with gr.Tab("About",elem_classes = "max-height other-tabs"):
-         with gr.Row():
-             with gr.Column(scale=1):
-
-
-
-
-                 gr.Markdown(
-                     """
-                     ### More info
-                     - See more info at [https://climateqa.com](https://climateqa.com/docs/intro/)
-                     - Feedbacks on this [form](https://forms.office.com/e/1Yzgxm6jbp)
-
-                     ### Citation
-                     """
-                 )
-                 with gr.Accordion(CITATION_LABEL,elem_id="citation", open = False,):
-                     # # Display citation label and text)
-                     gr.Textbox(
-                         value=CITATION_TEXT,
-                         label="",
-                         interactive=False,
-                         show_copy_button=True,
-                         lines=len(CITATION_TEXT.split('\n')),
-                     )
-
-
-
-     def start_chat(query,history,search_only):
-         history = history + [ChatMessage(role="user", content=query)]
-         if not search_only:
-             return (gr.update(interactive = False),gr.update(selected=1),history)
-         else:
-             return (gr.update(interactive = False),gr.update(selected=2),history)
 
-     def finish_chat():
-         return gr.update(interactive = True,value = "")
 
-     # Initialize visibility states
-     summary_visible = False
-     relevant_visible = False
-
-     # Functions to toggle visibility
-     def toggle_summary_visibility():
-         global summary_visible
-         summary_visible = not summary_visible
-         return gr.update(visible=summary_visible)
-
-     def toggle_relevant_visibility():
-         global relevant_visible
-         relevant_visible = not relevant_visible
-         return gr.update(visible=relevant_visible)
-
-
-     def change_completion_status(current_state):
-         current_state = 1 - current_state
-         return current_state
 
-     def update_sources_number_display(sources_textbox, figures_cards, current_graphs, papers_html):
-         sources_number = sources_textbox.count("<h2>")
-         figures_number = figures_cards.count("<h2>")
-         graphs_number = current_graphs.count("<iframe")
-         papers_number = papers_html.count("<h2>")
-         sources_notif_label = f"Sources ({sources_number})"
-         figures_notif_label = f"Figures ({figures_number})"
-         graphs_notif_label = f"Graphs ({graphs_number})"
-         papers_notif_label = f"Papers ({papers_number})"
-         recommended_content_notif_label = f"Recommended content ({figures_number + graphs_number + papers_number})"
-
-         return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
 
-     (textbox
-         .submit(start_chat, [textbox,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-         .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
-         .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
-         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
-     )
-
-     (examples_hidden
-         .change(start_chat, [examples_hidden,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-         .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
-         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
-         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
-     )
-
-
-     def change_sample_questions(key):
-         index = list(QUESTIONS.keys()).index(key)
-         visible_bools = [False] * len(samples)
-         visible_bools[index] = True
-         return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-
-
-     sources_raw.change(process_figures, inputs=[sources_raw], outputs=[figures_cards, gallery_component])
 
-     # update sources numbers
-     sources_textbox.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-     figures_cards.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-     current_graphs.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-     papers_html.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
-
-     # other questions examples
-     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-
-     # search for papers
-     textbox.submit(find_papers,[textbox,after, dropdown_external_sources], [papers_html,citations_network,papers_summary])
-     examples_hidden.change(find_papers,[examples_hidden,after,dropdown_external_sources], [papers_html,citations_network,papers_summary])
 
-     # btn_summary.click(toggle_summary_visibility, outputs=summary_popup)
-     # btn_relevant_papers.click(toggle_relevant_visibility, outputs=relevant_popup)
 
- demo.queue()
 
  demo.launch(ssr_mode=False)
 
+ # Import necessary libraries
  import os
+ import gradio as gr
 
  from azure.storage.fileshare import ShareServiceClient
 
+ # Import custom modules
+ from climateqa.engine.embeddings import get_embeddings_function
+ from climateqa.engine.llm import get_llm
+ from climateqa.engine.vectorstore import get_pinecone_vectorstore
+ from climateqa.engine.reranker import get_reranker
+ from climateqa.engine.graph import make_graph_agent,make_graph_agent_poc
+ from climateqa.engine.chains.retrieve_papers import find_papers
+ from climateqa.chat import start_chat, chat_stream, finish_chat
+ from climateqa.engine.talk_to_data.main import ask_vanna
 
+ from front.tabs import (create_config_modal, create_examples_tab, create_papers_tab, create_figures_tab, create_chat_interface, create_about_tab)
+ from front.utils import process_figures
  from gradio_modal import Modal
 
 
+ from utils import create_user_id
+ import logging
 
+ logging.basicConfig(level=logging.WARNING)
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppresses INFO and WARNING logs
+ logging.getLogger().setLevel(logging.WARNING)
 
 
 
  # Load environment variables in local mode
  try:
 
  except Exception as e:
      pass
 
 
  # Set up Gradio Theme
  theme = gr.themes.Base(
 
      font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
  )
 
+ # Azure Blob Storage credentials
  account_key = os.environ["BLOB_ACCOUNT_KEY"]
  if len(account_key) == 86:
      account_key += "=="
 
  user_id = create_user_id()
 
 
  # Create vectorstore and retriever
+ embeddings_function = get_embeddings_function()
+ vectorstore = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX"))
+ vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
+ vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_REGION"))
 
  llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
+ if os.environ["GRADIO_ENV"] == "local":
+     reranker = get_reranker("nano")
+ else :
+     reranker = get_reranker("large")
 
+ agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0.2)
+ agent_poc = make_graph_agent_poc(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0)#TODO put back default 0.2
 
 
+ async def chat(query, history, audience, sources, reports, relevant_content_sources_selection, search_only):
+     print("chat cqa - message received")
+     async for event in chat_stream(agent, query, history, audience, sources, reports, relevant_content_sources_selection, search_only, share_client, user_id):
+         yield event
+
+ async def chat_poc(query, history, audience, sources, reports, relevant_content_sources_selection, search_only):
+     print("chat poc - message received")
+     async for event in chat_stream(agent_poc, query, history, audience, sources, reports, relevant_content_sources_selection, search_only, share_client, user_id):
+         yield event
 
 
  # --------------------------------------------------------------------
  # Gradio
  # --------------------------------------------------------------------
 
+ # Function to update modal visibility
+ def update_config_modal_visibility(config_open):
+     new_config_visibility_status = not config_open
+     return Modal(visible=new_config_visibility_status), new_config_visibility_status
+
 
+ def update_sources_number_display(sources_textbox, figures_cards, current_graphs, papers_html):
+     sources_number = sources_textbox.count("<h2>")
+     figures_number = figures_cards.count("<h2>")
+     graphs_number = current_graphs.count("<iframe")
+     papers_number = papers_html.count("<h2>")
+     sources_notif_label = f"Sources ({sources_number})"
+     figures_notif_label = f"Figures ({figures_number})"
+     graphs_notif_label = f"Graphs ({graphs_number})"
+     papers_notif_label = f"Papers ({papers_number})"
+     recommended_content_notif_label = f"Recommended content ({figures_number + graphs_number + papers_number})"
 
+     return gr.update(label=recommended_content_notif_label), gr.update(label=sources_notif_label), gr.update(label=figures_notif_label), gr.update(label=graphs_notif_label), gr.update(label=papers_notif_label)
 
 
+ # # UI Layout Components
+ def cqa_tab(tab_name):
+     # State variables
      current_graphs = gr.State([])
+     with gr.Tab(tab_name):
          with gr.Row(elem_id="chatbot-row"):
+             # Left column - Chat interface
              with gr.Column(scale=2):
+                 chatbot, textbox, config_button = create_chat_interface(tab_name)
 
+             # Right column - Content panels
+             with gr.Column(scale=2, variant="panel", elem_id="right-panel"):
+                 with gr.Tabs(elem_id="right_panel_tab") as tabs:
+                     # Examples tab
+                     with gr.TabItem("Examples", elem_id="tab-examples", id=0):
+                         examples_hidden = create_examples_tab()
 
+                     # Sources tab
+                     with gr.Tab("Sources", elem_id="tab-sources", id=1) as tab_sources:
+                         sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
 
 
+                     # Recommended content tab
+                     with gr.Tab("Recommended content", elem_id="tab-recommended_content", id=2) as tab_recommended_content:
+                         with gr.Tabs(elem_id="group-subtabs") as tabs_recommended_content:
+                             # Figures subtab
+                             with gr.Tab("Figures", elem_id="tab-figures", id=3) as tab_figures:
+                                 sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal = create_figures_tab()
 
+                             # Papers subtab
+                             with gr.Tab("Papers", elem_id="tab-citations", id=4) as tab_papers:
+                                 papers_direct_search, papers_summary, papers_html, citations_network, papers_modal = create_papers_tab()
+
+                             # Graphs subtab
                              with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
+                                 graphs_container = gr.HTML(
+                                     "<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
+                                     elem_id="graphs-container"
+                                 )
+                             with gr.Tab("DRIAS", elem_id="tab-vanna", id=6) as tab_vanna:
+                                 vanna_direct_question = gr.Textbox(label="Direct Question", placeholder="You can write direct question here",elem_id="direct-question", interactive=True)
+                                 with gr.Accordion("Details",elem_id = 'vanna-details', open=False) as vanna_details :
+                                     vanna_sql_query = gr.Textbox(label="SQL Query Used", elem_id="sql-query", interactive=False)
+                                     show_vanna_table = gr.Button("Show Table", elem_id="show-table")
+                                     with Modal(visible=False) as vanna_table_modal:
+                                         vanna_table = gr.DataFrame([], elem_id="vanna-table")
+                                         close_vanna_modal = gr.Button("Close", elem_id="close-vanna-modal")
+                                         close_vanna_modal.click(lambda: Modal(visible=False),None, [vanna_table_modal])
+                                     show_vanna_table.click(lambda: Modal(visible=True),None ,[vanna_table_modal])
+
+                                 vanna_display = gr.Plot()
+                                 vanna_direct_question.submit(ask_vanna, [vanna_direct_question], [vanna_sql_query ,vanna_table, vanna_display])
 
+     return {
+         "chatbot": chatbot,
+         "textbox": textbox,
+         "tabs": tabs,
+         "sources_raw": sources_raw,
+         "new_figures": new_figures,
+         "current_graphs": current_graphs,
+         "examples_hidden": examples_hidden,
+         "sources_textbox": sources_textbox,
+         "figures_cards": figures_cards,
+         "gallery_component": gallery_component,
+         "config_button": config_button,
+         "papers_direct_search" : papers_direct_search,
+         "papers_html": papers_html,
+         "citations_network": citations_network,
+         "papers_summary": papers_summary,
+         "tab_recommended_content": tab_recommended_content,
+         "tab_sources": tab_sources,
+         "tab_figures": tab_figures,
+         "tab_graphs": tab_graphs,
+         "tab_papers": tab_papers,
+         "graph_container": graphs_container,
+         "vanna_sql_query": vanna_sql_query,
+         "vanna_table" : vanna_table,
+         "vanna_display": vanna_display
+     }
 
+
 
+ def event_handling(
+     main_tab_components,
+     config_components,
+     tab_name="ClimateQ&A"
+ ):
+     chatbot = main_tab_components["chatbot"]
+     textbox = main_tab_components["textbox"]
+     tabs = main_tab_components["tabs"]
+     sources_raw = main_tab_components["sources_raw"]
+     new_figures = main_tab_components["new_figures"]
+     current_graphs = main_tab_components["current_graphs"]
+     examples_hidden = main_tab_components["examples_hidden"]
+     sources_textbox = main_tab_components["sources_textbox"]
+     figures_cards = main_tab_components["figures_cards"]
+     gallery_component = main_tab_components["gallery_component"]
+     config_button = main_tab_components["config_button"]
+     papers_direct_search = main_tab_components["papers_direct_search"]
+     papers_html = main_tab_components["papers_html"]
+     citations_network = main_tab_components["citations_network"]
+     papers_summary = main_tab_components["papers_summary"]
+     tab_recommended_content = main_tab_components["tab_recommended_content"]
+     tab_sources = main_tab_components["tab_sources"]
+     tab_figures = main_tab_components["tab_figures"]
+     tab_graphs = main_tab_components["tab_graphs"]
+     tab_papers = main_tab_components["tab_papers"]
+     graphs_container = main_tab_components["graph_container"]
+     vanna_sql_query = main_tab_components["vanna_sql_query"]
+     vanna_table = main_tab_components["vanna_table"]
+     vanna_display = main_tab_components["vanna_display"]
 
+     config_open = config_components["config_open"]
+     config_modal = config_components["config_modal"]
+     dropdown_sources = config_components["dropdown_sources"]
+     dropdown_reports = config_components["dropdown_reports"]
+     dropdown_external_sources = config_components["dropdown_external_sources"]
+     search_only = config_components["search_only"]
+     dropdown_audience = config_components["dropdown_audience"]
+     after = config_components["after"]
+     output_query = config_components["output_query"]
+     output_language = config_components["output_language"]
+     close_config_modal = config_components["close_config_modal_button"]
 
+     new_sources_hmtl = gr.State([])
+     ttd_data = gr.State([])
+
+
+     for button in [config_button, close_config_modal]:
+         button.click(
+             fn=update_config_modal_visibility,
+             inputs=[config_open],
+             outputs=[config_modal, config_open]
+         )
 
+     if tab_name == "ClimateQ&A":
+         print("chat cqa - message sent")
+
+         # Event for textbox
+         (textbox
+             .submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{textbox.elem_id}")
+             .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{textbox.elem_id}")
+             .then(finish_chat, None, [textbox], api_name=f"finish_chat_{textbox.elem_id}")
+         )
+         # Event for examples_hidden
+         (examples_hidden
+             .change(start_chat, [examples_hidden, chatbot, search_only], [examples_hidden, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{examples_hidden.elem_id}")
+             .then(chat, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{examples_hidden.elem_id}")
+             .then(finish_chat, None, [textbox], api_name=f"finish_chat_{examples_hidden.elem_id}")
+         )
+
+     elif tab_name == "Beta - POC Adapt'Action":
+         print("chat poc - message sent")
+         # Event for textbox
+         (textbox
+             .submit(start_chat, [textbox, chatbot, search_only], [textbox, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{textbox.elem_id}")
+             .then(chat_poc, [textbox, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{textbox.elem_id}")
+             .then(finish_chat, None, [textbox], api_name=f"finish_chat_{textbox.elem_id}")
+         )
+         # Event for examples_hidden
+         (examples_hidden
+             .change(start_chat, [examples_hidden, chatbot, search_only], [examples_hidden, tabs, chatbot, sources_raw], queue=False, api_name=f"start_chat_{examples_hidden.elem_id}")
+             .then(chat_poc, [examples_hidden, chatbot, dropdown_audience, dropdown_sources, dropdown_reports, dropdown_external_sources, search_only], [chatbot, new_sources_hmtl, output_query, output_language, new_figures, current_graphs], concurrency_limit=8, api_name=f"chat_{examples_hidden.elem_id}")
+             .then(finish_chat, None, [textbox], api_name=f"finish_chat_{examples_hidden.elem_id}")
+         )
+
+
+     new_sources_hmtl.change(lambda x : x, inputs = [new_sources_hmtl], outputs = [sources_textbox])
+     current_graphs.change(lambda x: x, inputs=[current_graphs], outputs=[graphs_container])
+     new_figures.change(process_figures, inputs=[sources_raw, new_figures], outputs=[sources_raw, figures_cards, gallery_component])
 
+     # Update sources numbers
+     for component in [sources_textbox, figures_cards, current_graphs, papers_html]:
+         component.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs, papers_html], [tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
+
+     # Search for papers
+     for component in [textbox, examples_hidden, papers_direct_search]:
+         component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
+
+
 
+     if tab_name == "Beta - POC Adapt'Action":
+         # Drias search
+         textbox.submit(ask_vanna, [textbox], [vanna_sql_query ,vanna_table, vanna_display])
+
+ def main_ui():
+     # config_open = gr.State(True)
+     with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme, elem_id="main-component") as demo:
+         config_components = create_config_modal()
+
+         with gr.Tabs():
+             cqa_components = cqa_tab(tab_name = "ClimateQ&A")
+             local_cqa_components = cqa_tab(tab_name = "Beta - POC Adapt'Action")
+
+             create_about_tab()
+
+         event_handling(cqa_components, config_components, tab_name = 'ClimateQ&A')
+         event_handling(local_cqa_components, config_components, tab_name = "Beta - POC Adapt'Action")
+
+         demo.queue()
+
+     return demo
 
+
+ demo = main_ui()
  demo.launch(ssr_mode=False)
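For reference, a minimal sketch of how the new `ask_vanna` entry point behaves outside the Gradio UI, assuming it returns `(sql_query, dataframe, figure)` in that order, matching how its outputs are wired to `[vanna_sql_query, vanna_table, vanna_display]` above; the question string is only an example:

```python
# Standalone smoke test for the DRIAS "talk to data" flow (illustrative sketch).
from climateqa.engine.talk_to_data.main import ask_vanna

question = "What is the temperature trend in Marseille?"  # example query
sql_query, df, fig = ask_vanna(question)  # assumed return order, per drias_retriever.py

print(sql_query)  # SQL generated against the DRIAS database
print(df.head())  # resulting rows as a pandas DataFrame
fig.show()        # the plot that gr.Plot renders in the DRIAS tab
```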
climateqa/chat.py ADDED
@@ -0,0 +1,214 @@
+ import os
+ from datetime import datetime
+ import gradio as gr
+ # from .agent import agent
+ from gradio import ChatMessage
+ from langgraph.graph.state import CompiledStateGraph
+ import json
+
+ from .handle_stream_events import (
+     init_audience,
+     handle_retrieved_documents,
+     convert_to_docs_to_html,
+     stream_answer,
+     handle_retrieved_owid_graphs,
+     serialize_docs,
+ )
+
+ # Function to log data on Azure
+ def log_on_azure(file, logs, share_client):
+     logs = json.dumps(logs)
+     file_client = share_client.get_file_client(file)
+     file_client.upload_file(logs)
+
+ # Chat functions
+ def start_chat(query, history, search_only):
+     history = history + [ChatMessage(role="user", content=query)]
+     if not search_only:
+         return (gr.update(interactive=False), gr.update(selected=1), history, [])
+     else:
+         return (gr.update(interactive=False), gr.update(selected=2), history, [])
+
+ def finish_chat():
+     return gr.update(interactive=True, value="")
+
+ def log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id):
+     try:
+         # Log interaction to Azure if not in local environment
+         if os.getenv("GRADIO_ENV") != "local":
+             timestamp = str(datetime.now().timestamp())
+             prompt = history[1]["content"]
+             logs = {
+                 "user_id": str(user_id),
+                 "prompt": prompt,
+                 "query": prompt,
+                 "question": output_query,
+                 "sources": sources,
+                 "docs": serialize_docs(docs),
+                 "answer": history[-1].content,
+                 "time": timestamp,
+             }
+             log_on_azure(f"{timestamp}.json", logs, share_client)
+     except Exception as e:
+         print(f"Error logging on Azure Blob Storage: {e}")
+         error_msg = f"ClimateQ&A Error: {str(e)[:100]} - The error has been noted, try another question and if the error remains, you can contact us :)"
+         raise gr.Error(error_msg)
+
+ def handle_numerical_data(event):
+     if event["name"] == "retrieve_drias_data" and event["event"] == "on_chain_end":
+         numerical_data = event["data"]["output"]["drias_data"]
+         sql_query = event["data"]["output"]["drias_sql_query"]
+         return numerical_data, sql_query
+     return None, None
+
+ # Main chat function
+ async def chat_stream(
+     agent : CompiledStateGraph,
+     query: str,
+     history: list[ChatMessage],
+     audience: str,
+     sources: list[str],
+     reports: list[str],
+     relevant_content_sources_selection: list[str],
+     search_only: bool,
+     share_client,
+     user_id: str
+ ) -> tuple[list, str, str, str, list, str]:
+     """Process a chat query and return response with relevant sources and visualizations.
+
+     Args:
+         query (str): The user's question
+         history (list): Chat message history
+         audience (str): Target audience type
+         sources (list): Knowledge base sources to search
+         reports (list): Specific reports to search within sources
+         relevant_content_sources_selection (list): Types of content to retrieve (figures, papers, etc)
+         search_only (bool): Whether to only search without generating answer
+
+     Yields:
+         tuple: Contains:
+             - history: Updated chat history
+             - docs_html: HTML of retrieved documents
+             - output_query: Processed query
+             - output_language: Detected language
+             - related_contents: Related content
+             - graphs_html: HTML of relevant graphs
+     """
+     # Log incoming question
+     date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     print(f">> NEW QUESTION ({date_now}) : {query}")
+
+     audience_prompt = init_audience(audience)
+     sources = sources or ["IPCC", "IPBES"]
+     reports = reports or []
+
+     # Prepare inputs for agent
+     inputs = {
+         "user_input": query,
+         "audience": audience_prompt,
+         "sources_input": sources,
+         "relevant_content_sources_selection": relevant_content_sources_selection,
+         "search_only": search_only,
+         "reports": reports
+     }
+
+     # Get streaming events from agent
+     result = agent.astream_events(inputs, version="v1")
+
+     # Initialize state variables
+     docs = []
+     related_contents = []
+     docs_html = ""
+     new_docs_html = ""
+     output_query = ""
+     output_language = ""
+     output_keywords = ""
+     start_streaming = False
+     graphs_html = ""
+     used_documents = []
+     retrieved_contents = []
+     answer_message_content = ""
+     vanna_data = {}
+
+     # Define processing steps
+     steps_display = {
+         "categorize_intent": ("🔄️ Analyzing user message", True),
+         "transform_query": ("🔄️ Thinking step by step to answer the question", True),
+         "retrieve_documents": ("🔄️ Searching in the knowledge base", False),
+         "retrieve_local_data": ("🔄️ Searching in the knowledge base", False),
+     }
+
+     try:
+         # Process streaming events
+         async for event in result:
+
+             if "langgraph_node" in event["metadata"]:
+                 node = event["metadata"]["langgraph_node"]
+
+                 # Handle document retrieval
+                 if event["event"] == "on_chain_end" and event["name"] in ["retrieve_documents","retrieve_local_data"] and event["data"]["output"] != None:
+                     history, used_documents, retrieved_contents = handle_retrieved_documents(
+                         event, history, used_documents, retrieved_contents
+                     )
+                 # Handle Vanna retrieval
+                 # if event["event"] == "on_chain_end" and event["name"] in ["retrieve_documents","retrieve_local_data"] and event["data"]["output"] != None:
+                 #     df_output_vanna, sql_query = handle_numerical_data(
+                 #         event
+                 #     )
+                 #     vanna_data = {"df_output": df_output_vanna, "sql_query": sql_query}
+
+
+                 if event["event"] == "on_chain_end" and event["name"] == "answer_search" :
+                     docs = event["data"]["input"]["documents"]
+                     docs_html = convert_to_docs_to_html(docs)
+                     related_contents = event["data"]["input"]["related_contents"]
+
+                 # Handle intent categorization
+                 elif (event["event"] == "on_chain_end" and
+                     node == "categorize_intent" and
+                     event["name"] == "_write"):
+                     intent = event["data"]["output"]["intent"]
+                     output_language = event["data"]["output"].get("language", "English")
+                     history[-1].content = f"Language identified: {output_language}\nIntent identified: {intent}"
+
+                 # Handle processing steps display
+                 elif event["name"] in steps_display and event["event"] == "on_chain_start":
+                     event_description, display_output = steps_display[node]
+                     if (not hasattr(history[-1], 'metadata') or
+                         history[-1].metadata["title"] != event_description):
+                         history.append(ChatMessage(
+                             role="assistant",
+                             content="",
+                             metadata={'title': event_description}
+                         ))
+
+                 # Handle answer streaming
+                 elif (event["name"] != "transform_query" and
+                     event["event"] == "on_chat_model_stream" and
+                     node in ["answer_rag","answer_rag_no_docs", "answer_search", "answer_chitchat"]):
+                     history, start_streaming, answer_message_content = stream_answer(
+                         history, event, start_streaming, answer_message_content
+                     )
+
+                 # Handle graph retrieval
+                 elif event["name"] in ["retrieve_graphs", "retrieve_graphs_ai"] and event["event"] == "on_chain_end":
+                     graphs_html = handle_retrieved_owid_graphs(event, graphs_html)
+
+             # Handle query transformation
+             if event["name"] == "transform_query" and event["event"] == "on_chain_end":
+                 if hasattr(history[-1], "content"):
+                     sub_questions = [q["question"] + "-> relevant sources : " + str(q["sources"]) for q in event["data"]["output"]["questions_list"]]
+                     history[-1].content += "Decompose question into sub-questions:\n\n - " + "\n - ".join(sub_questions)
+
+             yield history, docs_html, output_query, output_language, related_contents, graphs_html#, vanna_data
+
+     except Exception as e:
+         print(f"Event {event} has failed")
+         raise gr.Error(str(e))
+
+
+     # Call the function to log interaction
+     log_interaction_to_azure(history, output_query, sources, docs, share_client, user_id)
+
+     yield history, docs_html, output_query, output_language, related_contents, graphs_html#, vanna_data
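Taken together, `chat_stream` is an async generator; a minimal illustrative consumer, assuming `agent`, `share_client` and `user_id` are constructed as in `app.py`, looks like this:

```python
# Illustrative driver for chat_stream; all argument values are examples.
import asyncio
from gradio import ChatMessage

async def run_once(agent, share_client, user_id):
    history = [ChatMessage(role="user", content="What is climate change?")]
    async for history, docs_html, query, language, related, graphs in chat_stream(
        agent, "What is climate change?", history,
        audience="Experts", sources=["IPCC", "IPBES"], reports=[],
        relevant_content_sources_selection=["IPCC figures"],
        search_only=False, share_client=share_client, user_id=user_id,
    ):
        print(language, history[-1].content[:80])  # streamed partial answer

# asyncio.run(run_once(agent, share_client, user_id))
```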
climateqa/constants.py CHANGED
@@ -1,4 +1,6 @@
  POSSIBLE_REPORTS = [
+     "IPBES IABWFH SPM",
+     "IPBES CBL SPM",
      "IPCC AR6 WGI SPM",
      "IPCC AR6 WGI FR",
      "IPCC AR6 WGI TS",
climateqa/engine/chains/answer_rag.py CHANGED
@@ -11,7 +11,7 @@ import time
  from ..utils import rename_chain, pass_values
 
 
- DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
+ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="Source : {source} - Content : {page_content}")
 
  def _combine_documents(
      docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
@@ -61,10 +61,11 @@ def make_rag_node(llm,with_docs = True):
          rag_chain = make_rag_chain(llm)
      else:
          rag_chain = make_rag_chain_without_docs(llm)
-
+
      async def answer_rag(state,config):
          print("---- Answer RAG ----")
          start_time = time.time()
+         print("Sources used : " + "\n".join([x.metadata["short_name"] + " - page " + str(x.metadata["page_number"]) for x in state["documents"]]))
 
          answer = await rag_chain.ainvoke(state,config)
 
climateqa/engine/chains/drias_retriever.py ADDED
@@ -0,0 +1,16 @@
+ import sys
+ import os
+ from climateqa.engine.talk_to_data.main import ask_vanna
+
+
+ def make_drias_retriever_node(llm):
+
+     def retrieve_drias_data(state):
+         print("---- Retrieving data from DRIAS ----")
+         query = state["query"]
+         sql_query, df, fig = ask_vanna(query)
+         state["drias_data"] = df
+         state["drias_sql_query"] = sql_query
+         return state
+
+     return retrieve_drias_data
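The factory returns a state-updating callable keyed on `state["query"]`. A sketch of registering it in the LangGraph workflow; the `StateGraph` wiring below is an assumption for illustration, since the real graph is assembled in `climateqa/engine/graph.py` and is not part of this diff:

```python
# Hypothetical registration of the DRIAS node in a LangGraph workflow.
from langgraph.graph import StateGraph

def add_drias_retriever(workflow: StateGraph, llm) -> StateGraph:
    # The returned callable reads state["query"] and writes
    # state["drias_data"] and state["drias_sql_query"].
    workflow.add_node("retrieve_drias_data", make_drias_retriever_node(llm))
    return workflow
```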
climateqa/engine/chains/graph_retriever.py CHANGED
@@ -50,7 +50,9 @@ def make_graph_retriever_node(vectorstore, reranker, rerank_by_question=True, k_
      print("---- Retrieving graphs ----")
 
      POSSIBLE_SOURCES = ["IEA", "OWID"]
-     questions = state["remaining_questions"] if state["remaining_questions"] is not None and state["remaining_questions"]!=[] else [state["query"]]
+     # questions = state["remaining_questions"] if state["remaining_questions"] is not None and state["remaining_questions"]!=[] else [state["query"]]
+     questions = state["questions_list"] if state["questions_list"] is not None and state["questions_list"]!=[] else [state["query"]]
+
      # sources_input = state["sources_input"]
      sources_input = ["auto"]
 
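In isolation, the new fallback behaves as follows: use the decomposed `questions_list` when it is present and non-empty, otherwise fall back to the raw query (example values are illustrative):

```python
# Example of the questions_list fallback (illustrative state dict).
state = {"query": "How is climate change affecting oceans?", "questions_list": []}
questions = state["questions_list"] if state["questions_list"] is not None and state["questions_list"] != [] else [state["query"]]
assert questions == ["How is climate change affecting oceans?"]
```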
climateqa/engine/chains/intent_categorization.py CHANGED
@@ -29,7 +29,7 @@ class IntentCategorizer(BaseModel):
         Examples:
         - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
         - search = Searching for any question about climate change, energy, biodiversity, nature, and everything we can find in the IPCC or IPBES reports or scientific papers,
-        - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
+        - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant. If it can be interpreted as a climate related question, please use the search intent.
         """,
         # - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
         # - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
@@ -57,6 +57,7 @@ def make_intent_categorization_node(llm):
     categorization_chain = make_intent_categorization_chain(llm)
 
     def categorize_message(state):
+        print("Input Message : ", state["user_input"])
        print("---- Categorize_message ----")
 
        output = categorization_chain.invoke({"input": state["user_input"]})
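The sentence added to the chitchat definition biases borderline inputs toward search. Illustrative invocations of the categorization chain (the actual outputs depend on the underlying LLM, so treat them as indicative only):

    chain = make_intent_categorization_chain(llm)

    chain.invoke({"input": "Hi, how are you doing today?"})
    # -> e.g. {"intent": "chitchat", ...}

    chain.invoke({"input": "Is it raining more often than it used to?"})
    # -> e.g. {"intent": "search", ...}  # borderline, but readable as climate-related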
climateqa/engine/chains/prompts.py CHANGED
@@ -36,13 +36,41 @@ You are given a question and extracted passages of the IPCC and/or IPBES reports
 """
 
 
+# answer_prompt_template_old = """
+# You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
+
+# Guidelines:
+# - If the passages have useful facts or numbers, use them in your answer.
+# - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
+# - Do not use the sentence 'Doc i says ...' to say where information came from.
+# - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
+# - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
+# - If it makes sense, use bullet points and lists to make your answers easier to understand.
+# - You do not need to use every passage. Only use the ones that help answer the question.
+# - If the documents do not have the information needed to answer the question, just say you do not have enough information.
+# - Consider by default that the question is about the past century unless it is specified otherwise.
+# - If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
+
+# -----------------------
+# Passages:
+# {context}
+
+# -----------------------
+# Question: {query} - Explained to {audience}
+# Answer in {language} with the passages citations:
+# """
+
 answer_prompt_template = """
-You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
+You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
 
 Guidelines:
 - If the passages have useful facts or numbers, use them in your answer.
 - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
-- Do not use the sentence 'Doc i says ...' to say where information came from.
+- You will receive passages from different reports, e.g., IPCC and PPCP. Make separate paragraphs and specify the source of the information in your answer, e.g., "According to IPCC, ...".
+- The different sources are IPCC, IPBES, PPCP (for Plan Climat Air Energie Territorial de Paris), PBDP (for Plan Biodiversité de Paris), Acclimaterra (Rapport scientifique de la région Nouvelle Aquitaine en France).
+- If the reports are local (like PPCP, PBDP, Acclimaterra), consider that the information is specific to the region and not global. If the document is about a nearby region (for example, an extract from Acclimaterra for a question about Britain), explicitly state the concerned region.
+- Do not mention that you are using specific extract documents, but mention only the source information. "According to IPCC, ..." rather than "According to the provided document from IPCC ..."
+- Make a clear distinction between information from IPCC, IPBES, Acclimaterra that are scientific reports and PPCP, PBDP that are strategic reports. Strategic reports should not be taken as verified facts, but as political or strategic decisions.
 - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
 - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
 - If it makes sense, use bullet points and lists to make your answers easier to understand.
@@ -51,6 +79,7 @@ Guidelines:
 - Consider by default that the question is about the past century unless it is specified otherwise.
 - If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
 
+
 -----------------------
 Passages:
 {context}
@@ -60,7 +89,6 @@ Question: {query} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 
-
 papers_prompt_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted abstracts of scientific papers. Provide a clear and structured answer based on the abstracts provided, the context and the guidelines.
 
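A sketch of how the updated answer template might be rendered at answer time; the context string, question and import path are illustrative, not taken from the repository:

    from langchain_core.prompts import ChatPromptTemplate

    prompt = ChatPromptTemplate.from_template(answer_prompt_template)
    messages = prompt.format_messages(
        context="Doc 1 - Source : PPCP - Content : Paris vise la neutralité carbone en 2050 ...",
        query="What is Paris doing about climate change?",
        audience="the general public",
        language="English",
    )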
climateqa/engine/chains/query_transformation.py CHANGED
@@ -7,43 +7,7 @@ from langchain.prompts import ChatPromptTemplate
 from langchain_core.utils.function_calling import convert_to_openai_function
 from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
 
-
-ROUTING_INDEX = {
-    "Vector":["IPCC","IPBES","IPOS"],
-    "OpenAlex":["OpenAlex"],
-}
-
-POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
-
-# Prompt from the original paper https://arxiv.org/pdf/2305.14283
-# Query Rewriting for Retrieval-Augmented Large Language Models
-class QueryDecomposition(BaseModel):
-    """
-    Decompose the user query into smaller parts to think step by step to answer this question
-    Act as a simple planning agent
-    """
-
-    questions: List[str] = Field(
-        description="""
-        Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
-        Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
-        - If it's already a standalone and explicit question, just return the reformulated question for the search engine
-        - If you need to decompose the question, output a list of maximum 2 to 3 questions
-        """
-    )
-
-
-class Location(BaseModel):
-    country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
-    location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
-
-class QueryAnalysis(BaseModel):
-    """
-    Analyzing the user query to extract topics, sources and date
-    Also do query expansion to get alternative search queries
-    Also provide simple keywords to feed a search engine
-    """
-
+# OLD QUERY ANALYSIS
 # keywords: List[str] = Field(
 #     description="""
 #     Extract the keywords from the user query to feed a search engine as a list
@@ -68,17 +32,10 @@ class QueryAnalysis(BaseModel):
 #     This questions should help you get more context and information about the user query
 #     """
 # )
-
-    sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field( #,"OpenAlex"]] = Field(
-        ...,
-        description="""
-        Given a user question choose which documents would be most relevant for answering their question,
-        - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
-        - IPBES is for questions about biodiversity and nature
-        - IPOS is for questions about the ocean and deep sea mining
-        """,
-        # - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
-    )
+# - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific literature
+#
+
+
 # topics: List[Literal[
 #     "Climate change",
 #     "Biodiversity",
@@ -101,7 +58,82 @@ class QueryAnalysis(BaseModel):
 # location:Location
 
 
+
+ROUTING_INDEX = {
+    "IPx":["IPCC", "IPBES", "IPOS"],
+    "POC": ["AcclimaTerra", "PCAET","Biodiv"],
+    "OpenAlex":["OpenAlex"],
+}
+
+POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
+
+# Prompt from the original paper https://arxiv.org/pdf/2305.14283
+# Query Rewriting for Retrieval-Augmented Large Language Models
+class QueryDecomposition(BaseModel):
+    """
+    Decompose the user query into smaller parts to think step by step to answer this question
+    Act as a simple planning agent
+    """
+
+    questions: List[str] = Field(
+        description="""
+        Think step by step to answer this question, and provide one or several search engine questions in the provided language for knowledge that you need.
+        Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find in the IPCC reports and scientific literature
+        - If it's already a standalone and explicit question, just return the reformulated question for the search engine
+        - If you need to decompose the question, output a list of maximum 2 to 3 questions
+        """
+    )
+
+
+class Location(BaseModel):
+    country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
+    location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+
+class QueryTranslation(BaseModel):
+    """Translate the query into a given language"""
+
+    question : str = Field(
+        description="""
+        Translate the questions into the given language
+        If the question is already in the given language, just return the same question
+        """,
+    )
+
+
+class QueryAnalysis(BaseModel):
+    """
+    Analyze the user query to extract the relevant sources
+
+    Deprecated:
+        Analyzing the user query to extract topics, sources and date
+        Also do query expansion to get alternative search queries
+        Also provide simple keywords to feed a search engine
+    """
+
+    sources: List[Literal["IPCC", "IPBES", "IPOS", "AcclimaTerra", "PCAET","Biodiv"]] = Field( #,"OpenAlex"]] = Field(
+        ...,
+        description="""
+        Given a user question choose which documents would be most relevant for answering their question,
+        - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
+        - IPBES is for questions about biodiversity and nature
+        - IPOS is for questions about the ocean and deep sea mining
+        - AcclimaTerra is for questions about any specific place in, or close to, the French region "Nouvelle-Aquitaine"
+        - PCAET is the Plan Climat Energie Territorial for the city of Paris
+        - Biodiv is the Biodiversity plan for the city of Paris
+        """,
+    )
+
+
+
 def make_query_decomposition_chain(llm):
+    """Chain to decompose a query into smaller parts to think step by step to answer this question
+
+    Args:
+        llm: The language model used to build the decomposition chain.
+
+    Returns:
+        A runnable chain mapping {"input": ...} to a dict with a "questions" list.
+    """
 
     openai_functions = [convert_to_openai_function(QueryDecomposition)]
     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryDecomposition"})
@@ -115,7 +147,8 @@ def make_query_decomposition_chain(llm):
     return chain
 
 
-def make_query_rewriter_chain(llm):
+def make_query_analysis_chain(llm):
+    """Analyze the user query to extract the relevant sources"""
 
     openai_functions = [convert_to_openai_function(QueryAnalysis)]
     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
@@ -123,7 +156,7 @@ def make_query_rewriter_chain(llm):
 
 
     prompt = ChatPromptTemplate.from_messages([
-        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("system", "You are a helpful assistant, you will analyze the user input message using the function provided"),
         ("user", "input: {input}")
     ])
 
@@ -132,22 +165,63 @@ def make_query_rewriter_chain(llm):
     return chain
 
 
+def make_query_translation_chain(llm):
+    """Translate the user query into a given language"""
+
+    openai_functions = [convert_to_openai_function(QueryTranslation)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryTranslation"})
+
+
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, translate the question into {language}"),
+        ("user", "input: {input}")
+    ])
+
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+def group_by_sources_types(sources):
+    sources_types = {}
+    IPx_sources = ["IPCC", "IPBES", "IPOS"]
+    local_sources = ["AcclimaTerra", "PCAET","Biodiv"]
+    if any(source in IPx_sources for source in sources):
+        sources_types["IPx"] = list(set(sources).intersection(IPx_sources))
+    if any(source in local_sources for source in sources):
+        sources_types["POC"] = list(set(sources).intersection(local_sources))
+    return sources_types
+
+
 def make_query_transform_node(llm,k_final=15):
+    """
+    Creates a query transformation node that processes and transforms a given query state.
+    Args:
+        llm: The language model to be used for query decomposition and rewriting.
+        k_final (int, optional): The final number of questions to be generated. Defaults to 15.
+    Returns:
+        function: A function that takes a query state and returns a transformed state.
+    The returned function performs the following steps:
+    1. Checks if the query should be processed in auto mode based on the state.
+    2. Retrieves the input sources from the state or defaults to a predefined routing index.
+    3. Decomposes the query using the decomposition chain.
+    4. Analyzes each decomposed question using the query analysis chain.
+    5. Ensures that the sources returned by the language model are valid.
+    6. Explodes the questions into multiple questions with different sources based on the mode.
+    7. Constructs a new state with the transformed questions and their respective sources.
+    """
+
 
     decomposition_chain = make_query_decomposition_chain(llm)
-    rewriter_chain = make_query_rewriter_chain(llm)
+    query_analysis_chain = make_query_analysis_chain(llm)
+    query_translation_chain = make_query_translation_chain(llm)
 
     def transform_query(state):
         print("---- Transform query ----")
 
-
-        if "sources_auto" not in state or state["sources_auto"] is None or state["sources_auto"] is False:
-            auto_mode = False
-        else:
-            auto_mode = True
-
-        sources_input = state.get("sources_input")
-        if sources_input is None: sources_input = ROUTING_INDEX["Vector"]
+        auto_mode = state.get("sources_auto", True)
+        sources_input = state.get("sources_input", ROUTING_INDEX["IPx"])
+
 
         new_state = {}
 
@@ -155,24 +229,41 @@ def make_query_transform_node(llm,k_final=15):
         decomposition_output = decomposition_chain.invoke({"input":state["query"]})
         new_state.update(decomposition_output)
 
+
         # Query Analysis
         questions = []
         for question in new_state["questions"]:
             question_state = {"question":question}
-            analysis_output = rewriter_chain.invoke({"input":question})
+            query_analysis_output = query_analysis_chain.invoke({"input":question})
 
             # TODO WARNING llm should always return smthg
-            # The case when the llm does not return any sources
-            if not analysis_output["sources"] or not all(source in ["IPCC", "IPBS", "IPOS"] for source in analysis_output["sources"]):
-                analysis_output["sources"] = ["IPCC", "IPBES", "IPOS"]
+            # The case when the llm does not return any sources or a wrong output
+            if not query_analysis_output["sources"] or not all(source in ["IPCC", "IPBES", "IPOS","AcclimaTerra", "PCAET","Biodiv"] for source in query_analysis_output["sources"]):
+                query_analysis_output["sources"] = ["IPCC", "IPBES", "IPOS"]
+
+            sources_types = group_by_sources_types(query_analysis_output["sources"])
+            for source_type,sources in sources_types.items():
+                question_state = {
+                    "question":question,
+                    "sources":sources,
+                    "source_type":source_type
+                }
 
-            question_state.update(analysis_output)
-            questions.append(question_state)
+                questions.append(question_state)
 
+        # Translate question into the document language
+        for q in questions:
+            if q["source_type"]=="IPx":
+                translation_output = query_translation_chain.invoke({"input":q["question"],"language":"English"})
+                q["question"] = translation_output["question"]
+            elif q["source_type"]=="POC":
+                translation_output = query_translation_chain.invoke({"input":q["question"],"language":"French"})
+                q["question"] = translation_output["question"]
 
         # Explode the questions into multiple questions with different sources
         new_questions = []
         for q in questions:
-            question,sources = q["question"],q["sources"]
+            question,sources,source_type = q["question"],q["sources"], q["source_type"]
 
             # If not auto mode we take the configuration
             if not auto_mode:
@@ -181,7 +272,7 @@ def make_query_transform_node(llm,k_final=15):
             for index,index_sources in ROUTING_INDEX.items():
                 selected_sources = list(set(sources).intersection(index_sources))
                 if len(selected_sources) > 0:
-                    new_questions.append({"question":question,"sources":selected_sources,"index":index})
+                    new_questions.append({"question":question,"sources":selected_sources,"index":index, "source_type":source_type})
 
         # # Add the number of questions to search
         # k_by_question = k_final // len(new_questions)
@@ -191,11 +282,19 @@ def make_query_transform_node(llm,k_final=15):
         # new_state["questions"] = new_questions
         # new_state["remaining_questions"] = new_questions
 
+        n_questions = {
+            "total":len(new_questions),
+            "IPx":len([q for q in new_questions if q["index"] == "IPx"]),
+            "POC":len([q for q in new_questions if q["index"] == "POC"]),
+        }
 
         new_state = {
-            "remaining_questions":new_questions,
-            "n_questions":len(new_questions),
+            "questions_list":new_questions,
+            "n_questions":n_questions,
+            "handled_questions_index":[],
        }
+        print("New questions")
+        print(new_questions)
         return new_state
 
     return transform_query
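group_by_sources_types is pure Python, so its behavior follows directly from the code above; the fan-out example after it is only indicative of the questions_list shape, since the questions themselves come from the LLM:

    group_by_sources_types(["IPCC", "PCAET", "Biodiv"])
    # -> {"IPx": ["IPCC"], "POC": ["PCAET", "Biodiv"]}  (order inside the lists may vary: set intersection)

    # A question about Paris would then typically explode into entries such as:
    # [{"question": "Quel est le plan climat de Paris ?", "sources": ["PCAET", "Biodiv"],
    #   "index": "POC", "source_type": "POC"},
    #  {"question": "What are city-level climate mitigation plans?", "sources": ["IPCC"],
    #   "index": "IPx", "source_type": "IPx"}]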
climateqa/engine/chains/retrieve_documents.py CHANGED
@@ -7,7 +7,7 @@ from langchain_core.runnables import chain
 from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 from langchain_core.runnables import RunnableLambda
 
-from ..reranker import rerank_docs
+from ..reranker import rerank_docs, rerank_and_sort_docs
 # from ...knowledge.retriever import ClimateQARetriever
 from ...knowledge.openalex import OpenAlexRetriever
 from .keywords_extraction import make_keywords_extraction_chain
@@ -15,7 +15,9 @@ from ..utils import log_event
 from langchain_core.vectorstores import VectorStore
 from typing import List
 from langchain_core.documents.base import Document
+import asyncio
 
+from typing import Any, Dict, List, Tuple
 
 def divide_into_parts(target, parts):
@@ -87,7 +89,7 @@ def _get_k_images_by_question(n_questions):
     elif n_questions == 2:
         return 5
     elif n_questions == 3:
-        return 2
+        return 3
     else:
         return 1
 
@@ -98,11 +100,77 @@ def _add_metadata_and_score(docs: List) -> Document:
         doc.page_content = doc.page_content.replace("\r\n"," ")
         doc.metadata["similarity_score"] = score
         doc.metadata["content"] = doc.page_content
-        doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+        if doc.metadata["page_number"] != "N/A":
+            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+        else:
+            doc.metadata["page_number"] = 1
         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
         docs_with_metadata.append(doc)
     return docs_with_metadata
 
+def remove_duplicates_chunks(docs):
+    # Remove duplicates or almost duplicates
+    docs = sorted(docs,key=lambda x: x[1],reverse=True)
+    seen = set()
+    result = []
+    for doc in docs:
+        if doc[0].page_content not in seen:
+            seen.add(doc[0].page_content)
+            result.append(doc)
+    return result
+
+async def get_POC_relevant_documents(
+    query: str,
+    vectorstore:VectorStore,
+    sources:list = ["Acclimaterra","PCAET","Plan Biodiversite"],
+    search_figures:bool = False,
+    search_only:bool = False,
+    k_documents:int = 10,
+    threshold:float = 0.6,
+    k_images: int = 5,
+    reports:list = [],
+    min_size:int = 200,
+):
+    # Prepare base search kwargs
+    filters = {}
+    docs_question = []
+    docs_images = []
+
+    # TODO add source selection
+    # if len(reports) > 0:
+    #     filters["short_name"] = {"$in":reports}
+    # else:
+    #     filters["source"] = { "$in": sources}
+
+    filters_text = {
+        **filters,
+        "chunk_type":"text",
+        # "report_type": {}, # TODO to be completed to choose the right documents / chapters according to the analysis of the question
+    }
+
+    docs_question = vectorstore.similarity_search_with_score(query=query,filter = filters_text,k = k_documents)
+    # remove duplicates or almost duplicates
+    docs_question = remove_duplicates_chunks(docs_question)
+    docs_question = [x for x in docs_question if x[1] > threshold]
+
+    if search_figures:
+        # Images
+        filters_image = {
+            **filters,
+            "chunk_type":"image"
+        }
+        docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
+
+    docs_question, docs_images = _add_metadata_and_score(docs_question), _add_metadata_and_score(docs_images)
+
+    docs_question = [x for x in docs_question if len(x.page_content) > min_size]
+
+    return {
+        "docs_question" : docs_question,
+        "docs_images" : docs_images
+    }
+
+
 async def get_IPCC_relevant_documents(
     query: str,
     vectorstore:VectorStore,
@@ -164,8 +232,7 @@ async def get_IPCC_relevant_documents(
         "chunk_type":"text",
         "report_type": { "$nin":["SPM"]},
     }
-    k_full = k_total - len(docs_summaries)
-    docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
+    docs_full = vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_total)
 
     if search_figures:
         # Images
@@ -188,15 +255,45 @@ async def get_IPCC_relevant_documents(
     }
 
 
+
+def concatenate_documents(index, source_type, docs_question_dict, k_by_question, k_summary_by_question, k_images_by_question):
+    # Keep the right number of documents - The k_summary documents from SPM are placed in front
+    if source_type == "IPx":
+        docs_question = docs_question_dict["docs_summaries"][:k_summary_by_question] + docs_question_dict["docs_full"][:(k_by_question - k_summary_by_question)]
+    elif source_type == "POC":
+        docs_question = docs_question_dict["docs_question"][:k_by_question]
+    else:
+        raise ValueError("source_type should be either IPx or POC")
+    # docs_question = [doc for key in docs_question_dict.keys() for doc in docs_question_dict[key]][:(k_by_question)]
+
+    images_question = docs_question_dict["docs_images"][:k_images_by_question]
+
+    return docs_question, images_question
+
 
 # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
 # @chain
-async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5, k_images=5):
+async def retrieve_documents(
+    current_question: Dict[str, Any],
+    config: Dict[str, Any],
+    source_type: str,
+    vectorstore: VectorStore,
+    reranker: Any,
+    search_figures: bool = False,
+    search_only: bool = False,
+    reports: list = [],
+    rerank_by_question: bool = True,
+    k_images_by_question: int = 5,
+    k_before_reranking: int = 100,
+    k_by_question: int = 5,
+    k_summary_by_question: int = 3
+) -> Tuple[List[Document], List[Document]]:
     """
-    Retrieve and rerank documents based on the current question in the state.
+    Retrieve and rerank the documents corresponding to one question, based on the question and its selected sources
 
     Args:
         state (dict): The current state containing documents, related content, relevant content sources, remaining questions and n_questions.
+        current_question (dict): The current question being processed.
         config (dict): Configuration settings for logging and other purposes.
         vectorstore (object): The vector store used to retrieve relevant documents.
         reranker (object): The reranker used to rerank the retrieved documents.
@@ -209,95 +306,194 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
     Returns:
         dict: The updated state containing the retrieved and reranked documents, related content, and remaining questions.
     """
-    print("---- Retrieve documents ----")
-
-    # Get the documents from the state
-    if "documents" in state and state["documents"] is not None:
-        docs = state["documents"]
-    else:
-        docs = []
-    # Get the related_content from the state
-    if "related_content" in state and state["related_content"] is not None:
-        related_content = state["related_content"]
-    else:
-        related_content = []
-
-    search_figures = "IPCC figures" in state["relevant_content_sources"]
-    search_only = state["search_only"]
-
-    # Get the current question
-    current_question = state["remaining_questions"][0]
-    remaining_questions = state["remaining_questions"][1:]
-
-    k_by_question = k_final // state["n_questions"]
-    k_summary_by_question = _get_k_summary_by_question(state["n_questions"])
-    k_images_by_question = _get_k_images_by_question(state["n_questions"])
-
     sources = current_question["sources"]
     question = current_question["question"]
     index = current_question["index"]
+    source_type = current_question["source_type"]
 
     print(f"Retrieve documents for question: {question}")
     await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
 
+    print(f"""---- Retrieve documents from {current_question["source_type"]}----""")
 
-    if index == "Vector":  # always true for now
+    if source_type == "IPx":
         docs_question_dict = await get_IPCC_relevant_documents(
             query = question,
             vectorstore=vectorstore,
             search_figures = search_figures,
             sources = sources,
             min_size = 200,
-            k_summary = k_summary_by_question,
+            k_summary = k_before_reranking-1,
             k_total = k_before_reranking,
             k_images = k_images_by_question,
             threshold = 0.5,
             search_only = search_only,
+            reports = reports,
         )
 
+    if source_type == "POC":
+        docs_question_dict = await get_POC_relevant_documents(
+            query = question,
+            vectorstore=vectorstore,
+            search_figures = search_figures,
+            sources = sources,
+            threshold = 0.5,
+            search_only = search_only,
+            reports = reports,
+            min_size= 200,
+            k_documents= k_before_reranking,
+            k_images= k_by_question
+        )
+
     # Rerank
-    if reranker is not None:
+    if reranker is not None and rerank_by_question:
         with suppress_output():
-            docs_question_summary_reranked = rerank_docs(reranker,docs_question_dict["docs_summaries"],question)
-            docs_question_fulltext_reranked = rerank_docs(reranker,docs_question_dict["docs_full"],question)
-            docs_question_images_reranked = rerank_docs(reranker,docs_question_dict["docs_images"],question)
-            if rerank_by_question:
-                docs_question_summary_reranked = sorted(docs_question_summary_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
-                docs_question_fulltext_reranked = sorted(docs_question_fulltext_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
-                docs_question_images_reranked = sorted(docs_question_images_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
+            for key in docs_question_dict.keys():
+                docs_question_dict[key] = rerank_and_sort_docs(reranker,docs_question_dict[key],question)
     else:
-        docs_question = docs_question_dict["docs_summaries"] + docs_question_dict["docs_full"]
         # Add a default reranking score
         for doc in docs_question:
             doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
 
-    docs_question = docs_question_summary_reranked + docs_question_fulltext_reranked
-    docs_question = docs_question[:k_by_question]
-    images_question = docs_question_images_reranked[:k_images]
-
+    # Keep the right number of documents
+    docs_question, images_question = concatenate_documents(index, source_type, docs_question_dict, k_by_question, k_summary_by_question, k_images_by_question)
+
+    # Rerank the documents to put the most relevant in front
     if reranker is not None and rerank_by_question:
-        docs_question = sorted(docs_question, key=lambda x: x.metadata["reranking_score"], reverse=True)
-
+        docs_question = rerank_and_sort_docs(reranker, docs_question, question)
+
     # Add sources used in the metadata
     docs_question = _add_sources_used_in_metadata(docs_question,sources,question,index)
     images_question = _add_sources_used_in_metadata(images_question,sources,question,index)
 
-    # Add to the list of docs
-    docs.extend(docs_question)
-    related_content.extend(images_question)
-    new_state = {"documents":docs, "related_contents": related_content,"remaining_questions":remaining_questions}
-    return new_state
+    return docs_question, images_question
+
+
+async def retrieve_documents_for_all_questions(
+    search_figures,
+    search_only,
+    reports,
+    questions_list,
+    n_questions,
+    config,
+    source_type,
+    to_handle_questions_index,
+    vectorstore,
+    reranker,
+    rerank_by_question=True,
+    k_final=15,
+    k_before_reranking=100
+):
+    """
+    Retrieve documents in parallel for all questions.
+    """
+    # to_handle_questions_index = [x for x in state["questions_list"] if x["source_type"] == "IPx"]
+
+    # TODO split the questions by source type in the question state + conditions on the number of questions handled per source type
+    # search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
+    # search_only = state["search_only"]
+    # reports = state["reports"]
+    # questions_list = state["questions_list"]
+
+    # k_by_question = k_final // state["n_questions"]["total"]
+    # k_summary_by_question = _get_k_summary_by_question(state["n_questions"]["total"])
+    # k_images_by_question = _get_k_images_by_question(state["n_questions"]["total"])
+    k_by_question = k_final // n_questions
+    k_summary_by_question = _get_k_summary_by_question(n_questions)
+    k_images_by_question = _get_k_images_by_question(n_questions)
+    k_before_reranking=100
+
+    tasks = [
+        retrieve_documents(
+            current_question=question,
+            config=config,
+            source_type=source_type,
+            vectorstore=vectorstore,
+            reranker=reranker,
+            search_figures=search_figures,
+            search_only=search_only,
+            reports=reports,
+            rerank_by_question=rerank_by_question,
+            k_images_by_question=k_images_by_question,
+            k_before_reranking=k_before_reranking,
+            k_by_question=k_by_question,
+            k_summary_by_question=k_summary_by_question
+        )
+        for i, question in enumerate(questions_list) if i in to_handle_questions_index
+    ]
+    results = await asyncio.gather(*tasks)
+    # Combine results
+    new_state = {"documents": [], "related_contents": [], "handled_questions_index": to_handle_questions_index}
+    for docs_question, images_question in results:
+        new_state["documents"].extend(docs_question)
+        new_state["related_contents"].extend(images_question)
+    return new_state
 
+def make_IPx_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
 
-def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
-    @chain
-    async def retrieve_docs(state, config):
-        state = await retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_question, k_final, k_before_reranking, k_summary)
+    async def retrieve_IPx_docs(state, config):
+        source_type = "IPx"
+        IPx_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
+
+        search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
+        search_only = state["search_only"]
+        reports = state["reports"]
+        questions_list = state["questions_list"]
+        n_questions = state["n_questions"]["total"]
+
+        state = await retrieve_documents_for_all_questions(
+            search_figures=search_figures,
+            search_only=search_only,
+            reports=reports,
+            questions_list=questions_list,
+            n_questions=n_questions,
+            config=config,
+            source_type=source_type,
+            to_handle_questions_index=IPx_questions_index,
+            vectorstore=vectorstore,
+            reranker=reranker,
+            rerank_by_question=rerank_by_question,
+            k_final=k_final,
+            k_before_reranking=k_before_reranking,
+        )
         return state
 
-    return retrieve_docs
+    return retrieve_IPx_docs
+
+
+def make_POC_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
+
+    async def retrieve_POC_docs_node(state, config):
+        if "POC region" not in state["relevant_content_sources_selection"]:
+            return {}
+
+        source_type = "POC"
+        POC_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
+
+        search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
+        search_only = state["search_only"]
+        reports = state["reports"]
+        questions_list = state["questions_list"]
+        n_questions = state["n_questions"]["total"]
+
+        state = await retrieve_documents_for_all_questions(
+            search_figures=search_figures,
+            search_only=search_only,
+            reports=reports,
+            questions_list=questions_list,
+            n_questions=n_questions,
+            config=config,
+            source_type=source_type,
+            to_handle_questions_index=POC_questions_index,
+            vectorstore=vectorstore,
+            reranker=reranker,
+            rerank_by_question=rerank_by_question,
+            k_final=k_final,
+            k_before_reranking=k_before_reranking,
+        )
        return state
 
+    return retrieve_POC_docs_node
 
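retrieve_documents_for_all_questions is a plain asyncio fan-out/fan-in. A self-contained sketch of the same pattern, with a hypothetical fetch function standing in for the vectorstore and reranker calls:

    import asyncio

    async def fetch(question: str) -> list[str]:
        await asyncio.sleep(0.1)  # stand-in for a similarity search + rerank
        return [f"doc for: {question}"]

    async def main():
        questions = ["q1", "q2", "q3"]
        to_handle = [0, 2]  # indices this retriever node is responsible for
        tasks = [fetch(q) for i, q in enumerate(questions) if i in to_handle]
        results = await asyncio.gather(*tasks)  # run the retrievals concurrently
        print([doc for r in results for doc in r])

    asyncio.run(main())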
climateqa/engine/chains/retrieve_papers.py CHANGED
@@ -32,8 +32,8 @@ def generate_keywords(query):
     return keywords
 
 
-async def find_papers(query,after, relevant_content_sources, reranker= reranker):
-    if "OpenAlex" in relevant_content_sources:
+async def find_papers(query,after, relevant_content_sources_selection, reranker= reranker):
+    if "Papers (OpenAlex)" in relevant_content_sources_selection:
         summary = ""
         keywords = generate_keywords(query)
         df_works = oa.search(keywords,after = after)
climateqa/engine/graph.py CHANGED
@@ -9,6 +9,9 @@ from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
 from typing_extensions import TypedDict
 from typing import List, Dict
 
+import operator
+from typing import Annotated
+import pandas as pd
 from IPython.display import display, HTML, Image
 
 from .chains.answer_chitchat import make_chitchat_node
@@ -16,10 +19,11 @@ from .chains.answer_ai_impact import make_ai_impact_node
 from .chains.query_transformation import make_query_transform_node
 from .chains.translation import make_translation_node
 from .chains.intent_categorization import make_intent_categorization_node
-from .chains.retrieve_documents import make_retriever_node
+from .chains.retrieve_documents import make_IPx_retriever_node, make_POC_retriever_node
 from .chains.answer_rag import make_rag_node
 from .chains.graph_retriever import make_graph_retriever_node
 from .chains.chitchat_categorization import make_chitchat_intent_categorization_node
+from .chains.drias_retriever import make_drias_retriever_node
 # from .chains.set_defaults import set_defaults
 
 class GraphState(TypedDict):
@@ -31,25 +35,32 @@ class GraphState(TypedDict):
     intent : str
     search_graphs_chitchat : bool
     query: str
-    remaining_questions : List[dict]
+    questions_list : List[dict]
+    handled_questions_index : Annotated[list[int], operator.add]
     n_questions : int
     answer: str
     audience: str = "experts"
-    sources_input: List[str] = ["IPCC","IPBES"]
-    relevant_content_sources: List[str] = ["IPCC figures"]
+    sources_input: List[str] = ["IPCC","IPBES"]  # Deprecated -> used only for graphs, which can only be OWID
+    relevant_content_sources_selection: List[str] = ["Figures (IPCC/IPBES)"]
     sources_auto: bool = True
     min_year: int = 1960
     max_year: int = None
-    documents: List[Document]
-    related_contents : Dict[str,Document]
-    recommended_content : List[Document]
+    documents: Annotated[List[Document], operator.add]
+    related_contents : Annotated[List[Document], operator.add]  # Images
+    recommended_content : List[Document]  # OWID Graphs # TODO merge with related_contents
     search_only : bool = False
+    reports : List[str] = []
+    drias_data: pd.DataFrame
+    drias_sql_query : str
+
+def dummy(state):
+    return
 
 def search(state): #TODO
-    return state
+    return
 
 def answer_search(state): #TODO
-    return state
+    return
 
 def route_intent(state):
     intent = state["intent"]
@@ -59,12 +70,12 @@ def route_intent(state):
     # return "answer_ai_impact"
     else:
         # Search route
-        return "search"
+        return "answer_climate"
 
 def chitchat_route_intent(state):
     intent = state["search_graphs_chitchat"]
     if intent is True:
-        return "retrieve_graphs_chitchat"
+        return END #TODO
     elif intent is False:
         return END
 
@@ -72,27 +83,50 @@ def route_translation(state):
     if state["language"].lower() == "english":
         return "transform_query"
     else:
-        return "translate_query"
+        return "transform_query"
+        # return "translate_query" #TODO : add translation
+
 
 def route_based_on_relevant_docs(state,threshold_docs=0.2):
     docs = [x for x in state["documents"] if x.metadata["reranking_score"] > threshold_docs]
+    print("Route : ", ["answer_rag" if len(docs) > 0 else "answer_rag_no_docs"])
     if len(docs) > 0:
         return "answer_rag"
     else:
         return "answer_rag_no_docs"
 
-def route_retrieve_documents(state):
-    if state["search_only"] :
-        return END
-    elif len(state["remaining_questions"]) > 0:
-        return "retrieve_documents"
+def route_continue_retrieve_documents(state):
+    index_question_ipx = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
+    questions_ipx_finished = all(elem in state["handled_questions_index"] for elem in index_question_ipx)
+    if questions_ipx_finished:
+        return "end_retrieve_IPx_documents"
     else:
-        return "answer_search"
+        return "retrieve_documents"
+
+def route_continue_retrieve_local_documents(state):
+    index_question_poc = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
+    questions_poc_finished = all(elem in state["handled_questions_index"] for elem in index_question_poc)
+    # if questions_poc_finished and state["search_only"]:
+    #     return END
+    if questions_poc_finished or ("POC region" not in state["relevant_content_sources_selection"]):
+        return "end_retrieve_local_documents"
+    else:
+        return "retrieve_local_data"
+
+def route_retrieve_documents(state):
+    sources_to_retrieve = []
+
+    if "Graphs (OurWorldInData)" in state["relevant_content_sources_selection"]:
+        sources_to_retrieve.append("retrieve_graphs")
+
+    if sources_to_retrieve == []:
+        return END
+    return sources_to_retrieve
 
 def make_id_dict(values):
     return {k:k for k in values}
 
-def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, threshold_docs=0.2):
+def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_region, reranker, threshold_docs=0.2):
 
     workflow = StateGraph(GraphState)
 
@@ -102,8 +136,9 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     translate_query = make_translation_node(llm)
     answer_chitchat = make_chitchat_node(llm)
     answer_ai_impact = make_ai_impact_node(llm)
-    retrieve_documents = make_retriever_node(vectorstore_ipcc, reranker, llm)
+    retrieve_documents = make_IPx_retriever_node(vectorstore_ipcc, reranker, llm)
     retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
+    # retrieve_local_data = make_POC_retriever_node(vectorstore_region, reranker, llm)
     answer_rag = make_rag_node(llm, with_docs=True)
     answer_rag_no_docs = make_rag_node(llm, with_docs=False)
     chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
@@ -111,13 +146,14 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     # Define the nodes
     # workflow.add_node("set_defaults", set_defaults)
     workflow.add_node("categorize_intent", categorize_intent)
-    workflow.add_node("search", search)
+    workflow.add_node("answer_climate", dummy)
     workflow.add_node("answer_search", answer_search)
     workflow.add_node("transform_query", transform_query)
     workflow.add_node("translate_query", translate_query)
     workflow.add_node("answer_chitchat", answer_chitchat)
     workflow.add_node("chitchat_categorize_intent", chitchat_categorize_intent)
    workflow.add_node("retrieve_graphs", retrieve_graphs)
+    # workflow.add_node("retrieve_local_data", retrieve_local_data)
     workflow.add_node("retrieve_graphs_chitchat", retrieve_graphs)
     workflow.add_node("retrieve_documents", retrieve_documents)
     workflow.add_node("answer_rag", answer_rag)
@@ -130,7 +166,7 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     workflow.add_conditional_edges(
         "categorize_intent",
         route_intent,
-        make_id_dict(["answer_chitchat","search"])
+        make_id_dict(["answer_chitchat","answer_climate"])
     )
 
     workflow.add_conditional_edges(
@@ -140,15 +176,98 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     )
 
     workflow.add_conditional_edges(
-        "search",
+        "answer_climate",
         route_translation,
         make_id_dict(["translate_query","transform_query"])
     )
+
     workflow.add_conditional_edges(
-        "retrieve_documents",
-        # lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
-        route_retrieve_documents,
-        make_id_dict([END,"retrieve_documents","answer_search"])
+        "answer_search",
+        lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
+        make_id_dict(["answer_rag","answer_rag_no_docs"])
     )
@@ -158,13 +277,15 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     )
     workflow.add_conditional_edges(
         "transform_query",
-        lambda state : "retrieve_graphs" if "OurWorldInData" in state["relevant_content_sources"] else END,
+        route_retrieve_documents,
         make_id_dict(["retrieve_graphs", END])
     )
 
     # Define the edges
     workflow.add_edge("translate_query", "transform_query")
-    workflow.add_edge("transform_query", "retrieve_documents")
+    workflow.add_edge("transform_query", "retrieve_documents") #TODO put back
+    # workflow.add_edge("transform_query", "retrieve_local_data")
+    # workflow.add_edge("transform_query", END) # TODO remove
 
     workflow.add_edge("retrieve_graphs", END)
     workflow.add_edge("answer_rag", END)
@@ -172,6 +293,12 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, reranker, thresh
     workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
     workflow.add_edge("retrieve_graphs_chitchat", END)
 
+    # workflow.add_edge("retrieve_local_data", "answer_search")
+    workflow.add_edge("retrieve_documents", "answer_search")
+
     # Compile
     app = workflow.compile()
+    return app
+
+def make_graph_agent_poc(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_region, reranker, threshold_docs=0.2):
+
+    workflow = StateGraph(GraphState)
+
+    # Define the node functions
219
+ categorize_intent = make_intent_categorization_node(llm)
220
+ transform_query = make_query_transform_node(llm)
221
+ translate_query = make_translation_node(llm)
222
+ answer_chitchat = make_chitchat_node(llm)
223
+ answer_ai_impact = make_ai_impact_node(llm)
224
+ retrieve_documents = make_IPx_retriever_node(vectorstore_ipcc, reranker, llm)
225
+ retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
226
+ retrieve_local_data = make_POC_retriever_node(vectorstore_region, reranker, llm)
227
+ answer_rag = make_rag_node(llm, with_docs=True)
228
+ answer_rag_no_docs = make_rag_node(llm, with_docs=False)
229
+ chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
230
+ # retrieve_drias_data = make_drias_retriever_node(llm) # WIP
231
+
232
+ # Define the nodes
233
+ # workflow.add_node("set_defaults", set_defaults)
234
+ workflow.add_node("categorize_intent", categorize_intent)
235
+ workflow.add_node("answer_climate", dummy)
236
+ workflow.add_node("answer_search", answer_search)
237
+ # workflow.add_node("end_retrieve_local_documents", dummy)
238
+ # workflow.add_node("end_retrieve_IPx_documents", dummy)
239
+ workflow.add_node("transform_query", transform_query)
240
+ workflow.add_node("translate_query", translate_query)
241
+ workflow.add_node("answer_chitchat", answer_chitchat)
242
+ workflow.add_node("chitchat_categorize_intent", chitchat_categorize_intent)
243
+ workflow.add_node("retrieve_graphs", retrieve_graphs)
244
+ workflow.add_node("retrieve_local_data", retrieve_local_data)
245
+ workflow.add_node("retrieve_graphs_chitchat", retrieve_graphs)
246
+ workflow.add_node("retrieve_documents", retrieve_documents)
247
+ workflow.add_node("answer_rag", answer_rag)
248
+ workflow.add_node("answer_rag_no_docs", answer_rag_no_docs)
249
+ # workflow.add_node("retrieve_drias_data", retrieve_drias_data)# WIP
250
+
251
+ # Entry point
252
+ workflow.set_entry_point("categorize_intent")
253
+
254
+ # CONDITIONAL EDGES
255
+ workflow.add_conditional_edges(
256
+ "categorize_intent",
257
+ route_intent,
258
+ make_id_dict(["answer_chitchat","answer_climate"])
259
+ )
260
+
261
+ workflow.add_conditional_edges(
262
+ "chitchat_categorize_intent",
263
+ chitchat_route_intent,
264
+ make_id_dict(["retrieve_graphs_chitchat", END])
265
+ )
266
+
267
+ workflow.add_conditional_edges(
268
+ "answer_climate",
269
+ route_translation,
270
+ make_id_dict(["translate_query","transform_query"])
271
  )
272
 
273
  workflow.add_conditional_edges(
 
277
  )
278
  workflow.add_conditional_edges(
279
  "transform_query",
280
+ route_retrieve_documents,
281
  make_id_dict(["retrieve_graphs", END])
282
  )
283
 
284
  # Define the edges
285
  workflow.add_edge("translate_query", "transform_query")
286
+ workflow.add_edge("transform_query", "retrieve_documents") #TODO put back
287
+ workflow.add_edge("transform_query", "retrieve_local_data")
288
+ # workflow.add_edge("transform_query", END) # TODO remove
289
 
290
  workflow.add_edge("retrieve_graphs", END)
291
  workflow.add_edge("answer_rag", END)
 
293
  workflow.add_edge("answer_chitchat", "chitchat_categorize_intent")
294
  workflow.add_edge("retrieve_graphs_chitchat", END)
295
 
296
+ workflow.add_edge("retrieve_local_data", "answer_search")
297
+ workflow.add_edge("retrieve_documents", "answer_search")
298
+
299
+ # workflow.add_edge("transform_query", "retrieve_drias_data")
300
+ # workflow.add_edge("retrieve_drias_data", END)
301
+
302
 
303
  # Compile
304
  app = workflow.compile()
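
Note on usage: both factories return a compiled LangGraph app, so callers drive it like any other graph. A minimal sketch, assuming the llm/vectorstore/reranker objects are built as in app.py; the exact GraphState input keys read by the entry node are an assumption here, not part of this diff:

    # Illustrative only: a compiled workflow is invoked with a GraphState-shaped dict.
    agent = make_graph_agent(llm, vectorstore, vectorstore_graphs, vectorstore_region, reranker)
    state = agent.invoke({
        "query": "How will sea level rise affect coastal cities?",  # assumed entry key
        "audience": "experts",
        "relevant_content_sources_selection": ["Figures (IPCC/IPBES)"],
        "search_only": False,
    })
    print(state["answer"])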
climateqa/engine/reranker.py CHANGED
@@ -47,4 +47,9 @@ def rerank_docs(reranker,docs,query):
          doc.metadata["reranking_score"] = result.score
          doc.metadata["query_used_for_retrieval"] = query
          docs_reranked.append(doc)
      return docs_reranked
+
+ def rerank_and_sort_docs(reranker, docs, query):
+     docs_reranked = rerank_docs(reranker, docs, query)
+     docs_reranked = sorted(docs_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
+     return docs_reranked
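
For clarity, the new helper just composes the existing rerank_docs with a sort, so callers receive documents in descending relevance order. A sketch, assuming reranker and docs come from the existing get_reranker/retriever setup:

    # Illustrative only: orders the reranked documents by descending reranking_score.
    docs_sorted = rerank_and_sort_docs(reranker, docs, query="heatwaves in Europe")
    for d in docs_sorted[:3]:
        print(d.metadata["reranking_score"], d.metadata["query_used_for_retrieval"])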
climateqa/engine/talk_to_data/main.py ADDED
@@ -0,0 +1,60 @@
+ from climateqa.engine.talk_to_data.myVanna import MyVanna
+ from climateqa.engine.talk_to_data.utils import loc2coords, detect_location_with_openai, detectTable, nearestNeighbourSQL, detect_relevant_tables, replace_coordonates
+ import sqlite3
+ import os
+ import pandas as pd
+ from climateqa.engine.llm import get_llm
+
+ from dotenv import load_dotenv
+ import ast
+
+ load_dotenv()
+
+ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+ PC_API_KEY = os.getenv('VANNA_PINECONE_API_KEY')
+ INDEX_NAME = os.getenv('VANNA_INDEX_NAME')
+ VANNA_MODEL = os.getenv('VANNA_MODEL')
+
+
+ # Vanna object
+ vn = MyVanna(config={"temperature": 0, "api_key": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME, "top_k": 4})
+ db_vanna_path = os.path.join(os.path.dirname(__file__), "database/drias.db")
+ vn.connect_to_sqlite(db_vanna_path)
+
+ llm = get_llm(provider="openai")
+
+ def ask_llm_to_add_table_names(sql_query, llm):
+     sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query}. Just answer the query. The answer should not include ```sql\n").content
+     return sql_with_table_names
+
+ def ask_llm_column_names(sql_query, llm):
+     columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query}").content
+     columns_list = ast.literal_eval(columns.strip("```python\n").strip())
+     return columns_list
+
+ def ask_vanna(query):
+     try:
+         location = detect_location_with_openai(OPENAI_API_KEY, query)
+         if location:
+
+             coords = loc2coords(location)
+             user_input = query.lower().replace(location.lower(), f"lat, long : {coords}")
+
+             relevant_tables = detect_relevant_tables(user_input, llm)
+             coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, relevant_tables[i]) for i in range(len(relevant_tables))]
+             user_input_with_coords = replace_coordonates(coords, user_input, coords_tables)
+
+             sql_query, result_dataframe, figure = vn.ask(user_input_with_coords, print_results=False, allow_llm_to_see_data=True, auto_train=False)
+
+             return sql_query, result_dataframe, figure
+
+         else:
+             empty_df = pd.DataFrame()
+             empty_fig = {}
+             return "", empty_df, empty_fig
+     except Exception as e:
+         print(f"Error: {e}")
+         empty_df = pd.DataFrame()
+         empty_fig = {}
+         return "", empty_df, empty_fig
climateqa/engine/talk_to_data/myVanna.py ADDED
@@ -0,0 +1,13 @@
+ from dotenv import load_dotenv
+ from climateqa.engine.talk_to_data.vanna_class import MyCustomVectorDB
+ from vanna.openai import OpenAI_Chat
+ import os
+
+ load_dotenv()
+
+ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+
+ class MyVanna(MyCustomVectorDB, OpenAI_Chat):
+     def __init__(self, config=None):
+         MyCustomVectorDB.__init__(self, config=config)
+         OpenAI_Chat.__init__(self, config=config)
climateqa/engine/talk_to_data/utils.py ADDED
@@ -0,0 +1,98 @@
+ import re
+ import openai
+ import pandas as pd
+ from geopy.geocoders import Nominatim
+ import sqlite3
+ import ast
+
+
+ def detect_location_with_openai(api_key, sentence):
+     """
+     Detects locations in a sentence using OpenAI's API.
+     """
+     openai.api_key = api_key
+
+     prompt = f"""
+     Extract all locations (cities, countries, states, or geographical areas) mentioned in the following sentence.
+     Return the result as a Python list. If no locations are mentioned, return an empty list.
+
+     Sentence: "{sentence}"
+     """
+
+     response = openai.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant skilled in identifying locations in text."},
+             {"role": "user", "content": prompt}
+         ],
+         max_tokens=100,
+         temperature=0
+     )
+
+     return response.choices[0].message.content.split("\n")[1][2:-2]
+
+
+ def detectTable(sql_query):
+     pattern = r'(?i)\bFROM\s+((?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+)(?:\.(?:`[^`]+`|"[^"]+"|\'[^\']+\'|\w+))*)'
+     matches = re.findall(pattern, sql_query)
+     return matches
+
+
+ def loc2coords(location : str):
+     geolocator = Nominatim(user_agent="city_to_latlong")
+     location = geolocator.geocode(location)
+     return (location.latitude, location.longitude)
+
+
+ def coords2loc(coords : tuple):
+     geolocator = Nominatim(user_agent="coords_to_city")
+     try:
+         location = geolocator.reverse(coords)
+         return location.address
+     except Exception as e:
+         print(f"Error: {e}")
+         return "Unknown Location"
+
+
+ def nearestNeighbourSQL(db: str, location: tuple, table : str):
+     conn = sqlite3.connect(db)
+     long = round(location[1], 3)
+     lat = round(location[0], 3)
+     cursor = conn.cursor()
+     cursor.execute(f"SELECT lat, lon FROM {table} WHERE lat BETWEEN {lat - 0.3} AND {lat + 0.3} AND lon BETWEEN {long - 0.3} AND {long + 0.3}")
+     results = cursor.fetchall()
+     return results[0]
+
+ def detect_relevant_tables(user_question, llm):
+     table_names_list = [
+         "Frequency_of_rainy_days_index",
+         "Winter_precipitation_total",
+         "Summer_precipitation_total",
+         "Annual_precipitation_total",
+         # "Remarkable_daily_precipitation_total_(Q99)",
+         "Frequency_of_remarkable_daily_precipitation",
+         "Extreme_precipitation_intensity",
+         "Mean_winter_temperature",
+         "Mean_summer_temperature",
+         "Number_of_tropical_nights",
+         "Maximum_summer_temperature",
+         "Number_of_days_with_Tx_above_30C",
+         "Number_of_days_with_Tx_above_35C",
+         "Drought_index"
+     ]
+     prompt = (
+         f"You are helping to build a sql query to retrieve relevant data for a user question. "
+         f"The different tables are {table_names_list}. "
+         f"The user question is {user_question}. Write the relevant tables to use. Answer only a python list of table names."
+     )
+     table_names = ast.literal_eval(llm.invoke(prompt).content.strip("```python\n").strip())
+     return table_names
+
+ def replace_coordonates(coords, query, coords_tables):
+     n = query.count(str(coords[0]))
+
+     for i in range(n):
+         query = query.replace(str(coords[0]), str(coords_tables[i][0]), 1)
+         query = query.replace(str(coords[1]), str(coords_tables[i][1]), 1)
+     return query
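
A small worked example of the coordinate-substitution step (the values are hypothetical):

    # Suppose loc2coords resolved the city to (43.297, 5.381), and nearestNeighbourSQL
    # found (43.25, 5.4) as the closest grid point in the one relevant table.
    coords = (43.297, 5.381)
    user_input = "mean summer temperature at lat, long : (43.297, 5.381)"
    coords_tables = [(43.25, 5.4)]
    print(replace_coordonates(coords, user_input, coords_tables))
    # -> "mean summer temperature at lat, long : (43.25, 5.4)"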
climateqa/engine/talk_to_data/vanna_class.py ADDED
@@ -0,0 +1,325 @@
+ from vanna.base import VannaBase
+ from pinecone import Pinecone
+ from climateqa.engine.embeddings import get_embeddings_function
+ import pandas as pd
+ import hashlib
+
+ class MyCustomVectorDB(VannaBase):
+
+     """
+     VectorDB class for storing and retrieving vectors from Pinecone.
+
+     args:
+         config (dict): Configuration dictionary containing the Pinecone API key and the index name:
+             - pc_api_key (str): Pinecone API key
+             - index_name (str): Pinecone index name
+             - top_k (int): Number of top results to return (default = 2)
+     """
+
+     def __init__(self, config):
+         super().__init__(config=config)
+         try:
+             self.api_key = config.get('pc_api_key')
+             self.index_name = config.get('index_name')
+         except:
+             raise Exception("Please provide the Pinecone API key and the index name")
+
+         self.pc = Pinecone(api_key=self.api_key)
+         self.index = self.pc.Index(self.index_name)
+         self.top_k = config.get('top_k', 2)
+         self.embeddings = get_embeddings_function()
+
+     def check_embedding(self, id, namespace):
+         fetched = self.index.fetch(ids=[id], namespace=namespace)
+         if fetched['vectors'] == {}:
+             return False
+         return True
+
+     def generate_hash_id(self, data: str) -> str:
+         """
+         Generate a unique hash ID for the given data.
+
+         Args:
+             data (str): The input data to hash (e.g., a concatenated string of user attributes).
+
+         Returns:
+             str: A unique hash ID as a hexadecimal string.
+         """
+         data_bytes = data.encode('utf-8')
+         hash_object = hashlib.sha256(data_bytes)
+         hash_id = hash_object.hexdigest()
+         return hash_id
+
+     def add_ddl(self, ddl: str, **kwargs) -> str:
+         id = self.generate_hash_id(ddl) + '_ddl'
+
+         if self.check_embedding(id, 'ddl'):
+             print(f"DDL having id {id} already exists")
+             return id
+
+         self.index.upsert(
+             vectors=[(id, self.embeddings.embed_query(ddl), {'ddl': ddl})],
+             namespace='ddl'
+         )
+
+         return id
+
+     def add_documentation(self, doc: str, **kwargs) -> str:
+         id = self.generate_hash_id(doc) + '_doc'
+
+         if self.check_embedding(id, 'documentation'):
+             print(f"Documentation having id {id} already exists")
+             return id
+
+         self.index.upsert(
+             vectors=[(id, self.embeddings.embed_query(doc), {'doc': doc})],
+             namespace='documentation'
+         )
+
+         return id
+
+     def add_question_sql(self, question: str, sql: str, **kwargs) -> str:
+         id = self.generate_hash_id(question) + '_sql'
+
+         if self.check_embedding(id, 'question_sql'):
+             print(f"Question-SQL pair having id {id} already exists")
+             return id
+
+         self.index.upsert(
+             vectors=[(id, self.embeddings.embed_query(question + sql), {'question': question, 'sql': sql})],
+             namespace='question_sql'
+         )
+
+         return id
+
+     def get_related_ddl(self, question: str, **kwargs) -> list:
+         res = self.index.query(
+             vector=self.embeddings.embed_query(question),
+             top_k=self.top_k,
+             namespace='ddl',
+             include_metadata=True
+         )
+
+         return [match['metadata']['ddl'] for match in res['matches']]
+
+     def get_related_documentation(self, question: str, **kwargs) -> list:
+         res = self.index.query(
+             vector=self.embeddings.embed_query(question),
+             top_k=self.top_k,
+             namespace='documentation',
+             include_metadata=True
+         )
+
+         return [match['metadata']['doc'] for match in res['matches']]
+
+     def get_similar_question_sql(self, question: str, **kwargs) -> list:
+         res = self.index.query(
+             vector=self.embeddings.embed_query(question),
+             top_k=self.top_k,
+             namespace='question_sql',
+             include_metadata=True
+         )
+
+         return [(match['metadata']['question'], match['metadata']['sql']) for match in res['matches']]
+
+     def get_training_data(self, **kwargs) -> pd.DataFrame:
+
+         list_of_data = []
+
+         namespaces = ['ddl', 'documentation', 'question_sql']
+
+         for namespace in namespaces:
+
+             data = self.index.query(
+                 top_k=10000,
+                 namespace=namespace,
+                 include_metadata=True,
+                 include_values=False
+             )
+
+             for match in data['matches']:
+                 list_of_data.append(match['metadata'])
+
+         return pd.DataFrame(list_of_data)
+
+     def remove_training_data(self, id: str, **kwargs) -> bool:
+         # Namespaces match the ones used by the add_* methods above.
+         if id.endswith("_ddl"):
+             self.index.delete(ids=[id], namespace="ddl")
+             return True
+         if id.endswith("_sql"):
+             self.index.delete(ids=[id], namespace="question_sql")
+             return True
+         if id.endswith("_doc"):
+             self.index.delete(ids=[id], namespace="documentation")
+             return True
+
+         return False
+
+     def generate_embedding(self, text, **kwargs):
+         # Implement the method here
+         pass
+
+     def get_sql_prompt(
+         self,
+         initial_prompt : str,
+         question: str,
+         question_sql_list: list,
+         ddl_list: list,
+         doc_list: list,
+         **kwargs,
+     ):
+         """
+         Example:
+         ```python
+         vn.get_sql_prompt(
+             question="What are the top 10 customers by sales?",
+             question_sql_list=[{"question": "What are the top 10 customers by sales?", "sql": "SELECT * FROM customers ORDER BY sales DESC LIMIT 10"}],
+             ddl_list=["CREATE TABLE customers (id INT, name TEXT, sales DECIMAL)"],
+             doc_list=["The customers table contains information about customers and their sales."],
+         )
+         ```
+
+         This method is used to generate a prompt for the LLM to generate SQL.
+
+         Args:
+             question (str): The question to generate SQL for.
+             question_sql_list (list): A list of questions and their corresponding SQL statements.
+             ddl_list (list): A list of DDL statements.
+             doc_list (list): A list of documentation.
+
+         Returns:
+             any: The prompt for the LLM to generate SQL.
+         """
+
+         if initial_prompt is None:
+             initial_prompt = f"You are a {self.dialect} expert. " + \
+                 "Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. "
+
+         initial_prompt = self.add_ddl_to_prompt(
+             initial_prompt, ddl_list, max_tokens=self.max_tokens
+         )
+
+         if self.static_documentation != "":
+             doc_list.append(self.static_documentation)
+
+         initial_prompt = self.add_documentation_to_prompt(
+             initial_prompt, doc_list, max_tokens=self.max_tokens
+         )
+
+         # initial_prompt = self.add_sql_to_prompt(
+         #     initial_prompt, question_sql_list, max_tokens=self.max_tokens
+         # )
+
+         initial_prompt += (
+             "===Response Guidelines \n"
+             "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
+             "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
+             "3. If the provided context is insufficient, please give a sql query based on your knowledge and the context provided. \n"
+             "4. Please use the most relevant table(s). \n"
+             "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
+             f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
+             f"7. Add a description of the table in the result of the sql query, if relevant. \n"
+             "8. Make sure to include the relevant KPI in the SQL query. The query should return impactful data. \n"
+             # f"8. If a set of latitude,longitude is provided, make an intermediate query to find the nearest value in the table and replace the coordinates in the sql query. \n"
+             # "7. Add a description of the table in the result of the sql query."
+             # "7. If the question is about a specific latitude, longitude, query an interval of 0.3 and keep only the first set of coordinates. \n"
+             # "7. Table names should be included in the result of the sql query. Use for example Mean_winter_temperature AS table_name in the query \n"
+         )
+
+         message_log = [self.system_message(initial_prompt)]
+
+         for example in question_sql_list:
+             if example is None:
+                 print("example is None")
+             else:
+                 if example is not None and "question" in example and "sql" in example:
+                     message_log.append(self.user_message(example["question"]))
+                     message_log.append(self.assistant_message(example["sql"]))
+
+         message_log.append(self.user_message(question))
+
+         return message_log
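
The add_* methods are idempotent thanks to the hash ids, so training can be re-run safely. An illustrative training sequence follows; the DDL and SQL shown are made-up examples (only the table name comes from the DRIAS table list in utils.py), and `vn` is a MyVanna instance as built in main.py:

    vn.add_ddl("CREATE TABLE Mean_summer_temperature (lat REAL, lon REAL, year INT, value REAL)")  # hypothetical schema
    vn.add_documentation("Temperatures are given in degrees Celsius.")
    vn.add_question_sql(
        question="What is the mean summer temperature near lat 43.25, lon 5.4?",
        sql="SELECT year, value FROM Mean_summer_temperature WHERE lat = 43.25 AND lon = 5.4",
    )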
climateqa/{event_handler.py → handle_stream_events.py} RENAMED
@@ -15,7 +15,14 @@ def init_audience(audience :str) -> str:
          audience_prompt = audience_prompts["experts"]
      return audience_prompt
 
+ def convert_to_docs_to_html(docs: list[dict]) -> str:
+     docs_html = []
+     for i, d in enumerate(docs, 1):
+         if d.metadata["chunk_type"] == "text":
+             docs_html.append(make_html_source(d, i))
+     return "".join(docs_html)
+
- def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage], used_documents : list[str]) -> tuple[str, list[ChatMessage], list[str]]:
+ def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage], used_documents : list[str], related_content : list[str]) -> tuple[str, list[ChatMessage], list[str]]:
      """
      Handles the retrieved documents and returns the HTML representation of the documents
 
@@ -27,26 +34,22 @@ def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage],
      Returns:
          tuple[str, list[ChatMessage], list[str]]: The updated HTML representation of the documents, the updated message history and the updated list of used documents
      """
+     if "documents" not in event["data"]["output"] or event["data"]["output"]["documents"] == []:
+         return history, used_documents, related_content
+
      try:
-         docs = event["data"]["output"]["documents"]
-         docs_html = []
-         textual_docs = [d for d in docs if d.metadata["chunk_type"] == "text"]
-         for i, d in enumerate(textual_docs, 1):
-             if d.metadata["chunk_type"] == "text":
-                 docs_html.append(make_html_source(d, i))
+         docs = event["data"]["output"]["documents"]
 
          used_documents = used_documents + [f"{d.metadata['short_name']} - {d.metadata['name']}" for d in docs]
          if used_documents != []:
              history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
-
-         docs_html = "".join(docs_html)
 
-         related_contents = event["data"]["output"]["related_contents"]
-
+         #TODO do the same for related contents
+
      except Exception as e:
          print(f"Error getting documents: {e}")
          print(event)
-     return docs, docs_html, history, used_documents, related_contents
+     return history, used_documents, related_content
 
  def stream_answer(history: list[ChatMessage], event : StreamEvent, start_streaming : bool, answer_message_content : str) -> tuple[list[ChatMessage], bool, str]:
      """
front/deprecated.py ADDED
@@ -0,0 +1,46 @@
+
+ # Functions to toggle visibility
+ def toggle_summary_visibility():
+     global summary_visible
+     summary_visible = not summary_visible
+     return gr.update(visible=summary_visible)
+
+ def toggle_relevant_visibility():
+     global relevant_visible
+     relevant_visible = not relevant_visible
+     return gr.update(visible=relevant_visible)
+
+ def change_completion_status(current_state):
+     current_state = 1 - current_state
+     return current_state
+
+
+
+ def vote(data: gr.LikeData):
+     if data.liked:
+         print(data.value)
+     else:
+         print(data)
+
+ def save_graph(saved_graphs_state, embedding, category):
+     print(f"\nCategory:\n{saved_graphs_state}\n")
+     if category not in saved_graphs_state:
+         saved_graphs_state[category] = []
+     if embedding not in saved_graphs_state[category]:
+         saved_graphs_state[category].append(embedding)
+     return saved_graphs_state, gr.Button("Graph Saved")
+
+
+ # Function to save feedback
+ def save_feedback(feed: str, user_id):
+     if len(feed) > 1:
+         timestamp = str(datetime.now().timestamp())
+         file = user_id + timestamp + ".json"
+         logs = {
+             "user_id": user_id,
+             "feedback": feed,
+             "time": timestamp,
+         }
+         log_on_azure(file, logs, share_client)
+     return "Feedback submitted, thank you!"
front/event_listeners.py ADDED
File without changes
front/tabs/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .tab_config import create_config_modal
+ from .tab_examples import create_examples_tab
+ from .tab_papers import create_papers_tab
+ from .tab_figures import create_figures_tab
+ from .chat_interface import create_chat_interface
+ from .tab_about import create_about_tab
front/tabs/chat_interface.py ADDED
@@ -0,0 +1,74 @@
+ import gradio as gr
+ from gradio.components import ChatMessage
+
+ # Initialize prompt and system template
+ init_prompt = """
+ Hello, I am ClimateQ&A, a conversational assistant designed to help you understand climate change and biodiversity loss. I will answer your questions by **sifting through the IPCC and IPBES scientific reports**.
+
+ ❓ How to use
+ - **Language**: You can ask me your questions in any language.
+ - **Audience**: You can specify your audience (children, general public, experts) to get a better-suited answer.
+ - **Sources**: You can choose to search in the IPCC or IPBES reports, or both.
+ - **Relevant content sources**: You can choose to search for figures, papers, or graphs that can be relevant for your question.
+
+ ⚠️ Limitations
+ *Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
+
+ 🛈 Information
+ Please note that we log your questions for meta-analysis purposes, so avoid sharing any sensitive or personal information.
+
+ What do you want to learn?
+ """
+
+ init_prompt_poc = """
+ Hello, I am ClimateQ&A, a conversational assistant designed to help you understand climate change and biodiversity loss. I will answer your questions by **sifting through the IPCC and IPBES scientific reports, the PCAET of Paris, the Plan Biodiversité 2018-2024, and the Acclimaterra reports from la Région Nouvelle-Aquitaine**.
+
+ ❓ How to use
+ - **Language**: You can ask me your questions in any language.
+ - **Audience**: You can specify your audience (children, general public, experts) to get a better-suited answer.
+ - **Sources**: You can choose to search in the IPCC or IPBES reports, and POC sources for local documents (PCAET, Plan Biodiversité, Acclimaterra).
+ - **Relevant content sources**: You can choose to search for figures, papers, or graphs that can be relevant for your question.
+
+ ⚠️ Limitations
+ *Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
+
+ 🛈 Information
+ Please note that we log your questions for meta-analysis purposes, so avoid sharing any sensitive or personal information.
+
+ What do you want to learn?
+ """
+
+
+
+ # UI Layout Components
+ def create_chat_interface(tab):
+     init_prompt_message = init_prompt_poc if tab == "Beta - POC Adapt'Action" else init_prompt
+     chatbot = gr.Chatbot(
+         value=[ChatMessage(role="assistant", content=init_prompt_message)],
+         type="messages",
+         show_copy_button=True,
+         show_label=False,
+         elem_id="chatbot",
+         layout="panel",
+         avatar_images=(None, "https://i.ibb.co/YNyd5W2/logo4.png"),
+         max_height="80vh",
+         height="100vh"
+     )
+
+     with gr.Row(elem_id="input-message"):
+
+         textbox = gr.Textbox(
+             placeholder="Ask me anything here!",
+             show_label=False,
+             scale=12,
+             lines=1,
+             interactive=True,
+             elem_id="input-textbox"
+         )
+
+         config_button = gr.Button("", elem_id="config-button")
+
+     return chatbot, textbox, config_button
front/tabs/main_tab.py ADDED
@@ -0,0 +1,68 @@
+ import gradio as gr
+ from .chat_interface import create_chat_interface
+ from .tab_examples import create_examples_tab
+ from .tab_papers import create_papers_tab
+ from .tab_figures import create_figures_tab
+
+ def cqa_tab(tab_name):
+     # State variables
+     current_graphs = gr.State([])
+     with gr.Tab(tab_name):
+         with gr.Row(elem_id="chatbot-row"):
+             # Left column - Chat interface
+             with gr.Column(scale=2):
+                 chatbot, textbox, config_button = create_chat_interface(tab_name)
+
+             # Right column - Content panels
+             with gr.Column(scale=2, variant="panel", elem_id="right-panel"):
+                 with gr.Tabs(elem_id="right_panel_tab") as tabs:
+                     # Examples tab
+                     with gr.TabItem("Examples", elem_id="tab-examples", id=0):
+                         examples_hidden, dropdown_samples, samples = create_examples_tab()
+
+                     # Sources tab
+                     with gr.Tab("Sources", elem_id="tab-sources", id=1) as tab_sources:
+                         sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
+
+
+                     # Recommended content tab
+                     with gr.Tab("Recommended content", elem_id="tab-recommended_content", id=2) as tab_recommended_content:
+                         with gr.Tabs(elem_id="group-subtabs") as tabs_recommended_content:
+                             # Figures subtab
+                             with gr.Tab("Figures", elem_id="tab-figures", id=3) as tab_figures:
+                                 sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal = create_figures_tab()
+
+                             # Papers subtab
+                             with gr.Tab("Papers", elem_id="tab-citations", id=4) as tab_papers:
+                                 direct_search_textbox, papers_summary, papers_html, citations_network, papers_modal = create_papers_tab()
+
+                             # Graphs subtab
+                             with gr.Tab("Graphs", elem_id="tab-graphs", id=5) as tab_graphs:
+                                 graphs_container = gr.HTML(
+                                     "<h2>There are no graphs to be displayed at the moment. Try asking another question.</h2>",
+                                     elem_id="graphs-container"
+                                 )
+     return {
+         "chatbot": chatbot,
+         "textbox": textbox,
+         "tabs": tabs,
+         "sources_raw": sources_raw,
+         "new_figures": new_figures,
+         "current_graphs": current_graphs,
+         "examples_hidden": examples_hidden,
+         "dropdown_samples": dropdown_samples,
+         "samples": samples,
+         "sources_textbox": sources_textbox,
+         "figures_cards": figures_cards,
+         "gallery_component": gallery_component,
+         "config_button": config_button,
+         "papers_html": papers_html,
+         "citations_network": citations_network,
+         "papers_summary": papers_summary,
+         "tab_recommended_content": tab_recommended_content,
+         "tab_sources": tab_sources,
+         "tab_figures": tab_figures,
+         "tab_graphs": tab_graphs,
+         "tab_papers": tab_papers,
+         "graph_container": graphs_container
+     }
front/tabs/tab_about.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+
+ # Citation information
+ CITATION_LABEL = "BibTeX citation for ClimateQ&A"
+ CITATION_TEXT = r"""@misc{climateqa,
+     author={Théo Alves Da Costa, Timothée Bohe},
+     title={ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
+     year={2024},
+     howpublished= {\url{https://climateqa.com}},
+ }
+ @software{climateqa,
+     author = {Théo Alves Da Costa, Timothée Bohe},
+     publisher = {ClimateQ&A},
+     title = {ClimateQ&A, AI-powered conversational assistant for climate change and biodiversity loss},
+ }
+ """
+
+ def create_about_tab():
+     with gr.Tab("About", elem_classes="max-height other-tabs"):
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown(
+                     """
+                     ### More info
+                     - See more info at [https://climateqa.com](https://climateqa.com/docs/intro/)
+                     - Give feedback via this [form](https://forms.office.com/e/1Yzgxm6jbp)
+
+                     ### Citation
+                     """
+                 )
+                 with gr.Accordion(CITATION_LABEL, elem_id="citation", open=False):
+                     gr.Textbox(
+                         value=CITATION_TEXT,
+                         label="",
+                         interactive=False,
+                         show_copy_button=True,
+                         lines=len(CITATION_TEXT.split('\n')),
+                     )
front/tabs/tab_config.py ADDED
@@ -0,0 +1,123 @@
+ import gradio as gr
+ from gradio_modal import Modal
+ from climateqa.constants import POSSIBLE_REPORTS
+ from typing import TypedDict
+
+ class ConfigPanel(TypedDict):
+     config_open: gr.State
+     config_modal: Modal
+     dropdown_sources: gr.CheckboxGroup
+     dropdown_reports: gr.Dropdown
+     dropdown_external_sources: gr.CheckboxGroup
+     search_only: gr.Checkbox
+     dropdown_audience: gr.Dropdown
+     after: gr.Slider
+     output_query: gr.Textbox
+     output_language: gr.Textbox
+
+
+ def create_config_modal():
+     config_open = gr.State(value=True)
+     with Modal(visible=False, elem_id="modal-config") as config_modal:
+         gr.Markdown("Reminder: you can talk in any language, ClimateQ&A is multi-lingual!")
+
+         dropdown_sources = gr.CheckboxGroup(
+             choices=["IPCC", "IPBES", "IPOS"],
+             label="Select source (by default search in all sources)",
+             value=["IPCC"],
+             interactive=True
+         )
+
+         dropdown_reports = gr.Dropdown(
+             choices=POSSIBLE_REPORTS,
+             label="Or select specific reports",
+             multiselect=True,
+             value=None,
+             interactive=True
+         )
+
+         dropdown_external_sources = gr.CheckboxGroup(
+             choices=["Figures (IPCC/IPBES)", "Papers (OpenAlex)", "Graphs (OurWorldInData)", "POC region"],
+             label="Select database to search for relevant content",
+             value=["Figures (IPCC/IPBES)", "POC region"],
+             interactive=True
+         )
+
+         search_only = gr.Checkbox(
+             label="Search only for recommended content without chatting",
+             value=False,
+             interactive=True,
+             elem_id="checkbox-chat"
+         )
+
+         dropdown_audience = gr.Dropdown(
+             choices=["Children", "General public", "Experts"],
+             label="Select audience",
+             value="Experts",
+             interactive=True
+         )
+
+         after = gr.Slider(
+             minimum=1950,
+             maximum=2023,
+             step=1,
+             value=1960,
+             label="Publication date",
+             show_label=True,
+             interactive=True,
+             elem_id="date-papers",
+             visible=False
+         )
+
+         output_query = gr.Textbox(
+             label="Query used for retrieval",
+             show_label=True,
+             elem_id="reformulated-query",
+             lines=2,
+             interactive=False,
+             visible=False
+         )
+
+         output_language = gr.Textbox(
+             label="Language",
+             show_label=True,
+             elem_id="language",
+             lines=1,
+             interactive=False,
+             visible=False
+         )
+
+         dropdown_external_sources.change(
+             lambda x: gr.update(visible="Papers (OpenAlex)" in x),
+             inputs=[dropdown_external_sources],
+             outputs=[after]
+         )
+
+         close_config_modal_button = gr.Button("Validate and Close", elem_id="close-config-modal")
+
+
+     # return ConfigPanel(
+     #     config_open=config_open,
+     #     config_modal=config_modal,
+     #     dropdown_sources=dropdown_sources,
+     #     dropdown_reports=dropdown_reports,
+     #     dropdown_external_sources=dropdown_external_sources,
+     #     search_only=search_only,
+     #     dropdown_audience=dropdown_audience,
+     #     after=after,
+     #     output_query=output_query,
+     #     output_language=output_language
+     # )
+     return {
+         "config_open": config_open,
+         "config_modal": config_modal,
+         "dropdown_sources": dropdown_sources,
+         "dropdown_reports": dropdown_reports,
+         "dropdown_external_sources": dropdown_external_sources,
+         "search_only": search_only,
+         "dropdown_audience": dropdown_audience,
+         "after": after,
+         "output_query": output_query,
+         "output_language": output_language,
+         "close_config_modal_button": close_config_modal_button
+     }
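
create_config_modal only builds the components; opening and closing is expected to be wired by the caller. A sketch following the same Modal(visible=...) pattern used in tab_figures.py (the config_button comes from create_chat_interface; this wiring is an assumption, not part of this diff):

    config = create_config_modal()
    config_button.click(lambda: Modal(visible=True), None, config["config_modal"])
    config["close_config_modal_button"].click(lambda: Modal(visible=False), None, config["config_modal"])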
front/tabs/tab_examples.py ADDED
@@ -0,0 +1,40 @@
+ import gradio as gr
+ from climateqa.sample_questions import QUESTIONS
+
+
+ def create_examples_tab():
+     examples_hidden = gr.Textbox(visible=False, elem_id="examples-hidden")
+     first_key = list(QUESTIONS.keys())[0]
+     dropdown_samples = gr.Dropdown(
+         choices=QUESTIONS.keys(),
+         value=first_key,
+         interactive=True,
+         label="Select a category of sample questions",
+         elem_id="dropdown-samples"
+     )
+
+     samples = []
+     for i, key in enumerate(QUESTIONS.keys()):
+         examples_visible = (i == 0)
+         with gr.Row(visible=examples_visible) as group_examples:
+             examples_questions = gr.Examples(
+                 examples=QUESTIONS[key],
+                 inputs=[examples_hidden],
+                 examples_per_page=8,
+                 run_on_click=False,
+                 elem_id=f"examples{i}",
+                 api_name=f"examples{i}"
+             )
+         samples.append(group_examples)
+
+
+     def change_sample_questions(key):
+         index = list(QUESTIONS.keys()).index(key)
+         visible_bools = [False] * len(samples)
+         visible_bools[index] = True
+         return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
+
+     # Event listener
+     dropdown_samples.change(change_sample_questions, dropdown_samples, samples)
+
+     # Return all three components so callers (see main_tab.py) can unpack them
+     return examples_hidden, dropdown_samples, samples
front/tabs/tab_figures.py ADDED
@@ -0,0 +1,31 @@
+ import gradio as gr
+ from gradio_modal import Modal
+
+
+ def create_figures_tab():
+     sources_raw = gr.State()
+     new_figures = gr.State([])
+     used_figures = gr.State([])
+
+     with Modal(visible=False, elem_id="modal_figure_galery") as figure_modal:
+         gallery_component = gr.Gallery(
+             object_fit='scale-down',
+             elem_id="gallery-component",
+             height="80vh"
+         )
+
+     show_full_size_figures = gr.Button(
+         "Show figures in full size",
+         elem_id="show-figures",
+         interactive=True
+     )
+     show_full_size_figures.click(
+         lambda: Modal(visible=True),
+         None,
+         figure_modal
+     )
+
+     figures_cards = gr.HTML(show_label=False, elem_id="sources-figures")
+
+     return sources_raw, new_figures, used_figures, gallery_component, figures_cards, figure_modal
front/tabs/tab_papers.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ from gradio_modal import Modal
+
+
+ def create_papers_tab():
+     direct_search_textbox = gr.Textbox(label="Direct search for papers", placeholder="What is climate change?", elem_id="papers-search")
+
+     with gr.Accordion(
+         visible=True,
+         elem_id="papers-summary-popup",
+         label="See summary of relevant papers",
+         open=False
+     ) as summary_popup:
+         papers_summary = gr.Markdown("", visible=True, elem_id="papers-summary")
+
+     with gr.Accordion(
+         visible=True,
+         elem_id="papers-relevant-popup",
+         label="See relevant papers",
+         open=False
+     ) as relevant_popup:
+         papers_html = gr.HTML(show_label=False, elem_id="papers-textbox")
+
+     btn_citations_network = gr.Button("Explore papers citations network")
+     with Modal(visible=False) as papers_modal:
+         citations_network = gr.HTML(
+             "<h3>Citations Network Graph</h3>",
+             visible=True,
+             elem_id="papers-citations-network"
+         )
+     btn_citations_network.click(
+         lambda: Modal(visible=True),
+         None,
+         papers_modal
+     )
+
+     return direct_search_textbox, papers_summary, papers_html, citations_network, papers_modal
front/tabs/tab_recommended_content.py ADDED
File without changes
front/utils.py CHANGED
@@ -39,23 +39,33 @@ def parse_output_llm_with_sources(output:str)->str:
      content_parts = "".join(parts)
      return content_parts
 
- def process_figures(docs:list)->tuple:
-     gallery=[]
-     used_figures =[]
+
+
+ def process_figures(docs:list, new_figures:list)->tuple:
+     if new_figures == []:
+         return docs, "", []
+     docs = docs + new_figures
+
      figures = '<div class="figures-container"><p></p> </div>'
+     gallery = []
+     used_figures = []
+
+     if docs == []:
+         return docs, figures, gallery
+
+
      docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
-     for i, doc in enumerate(docs_figures):
-         if doc.metadata["chunk_type"] == "image":
-             if doc.metadata["figure_code"] != "N/A":
-                 title = f"{doc.metadata['figure_code']} - {doc.metadata['short_name']}"
-             else:
-                 title = f"{doc.metadata['short_name']}"
+     for i_doc, doc in enumerate(docs_figures):
+         if doc.metadata["chunk_type"] == "image":
+             path = doc.metadata["image_path"]
 
-             if title not in used_figures:
-                 used_figures.append(title)
+             if path not in used_figures:
+                 used_figures.append(path)
+                 figure_number = len(used_figures)
+
                  try:
-                     key = f"Image {i+1}"
+                     key = f"Image {figure_number}"
 
                      image_path = doc.metadata["image_path"].split("documents/")[1]
                      img = get_image_from_azure_blob_storage(image_path)
@@ -68,12 +78,12 @@ def process_figures(docs:list)->tuple:
 
                      img_str = base64.b64encode(buffered.getvalue()).decode()
 
-                     figures = figures + make_html_figure_sources(doc, i, img_str)
+                     figures = figures + make_html_figure_sources(doc, figure_number, img_str)
                      gallery.append(img)
                  except Exception as e:
-                     print(f"Skipped adding image {i} because of {e}")
+                     print(f"Skipped adding image {figure_number} because of {e}")
 
-     return figures, gallery
+     return docs, figures, gallery
 
 
  def generate_html_graphs(graphs:list)->str:
requirements.txt CHANGED
@@ -4,7 +4,7 @@ azure-storage-blob
  python-dotenv==1.0.0
  langchain==0.2.1
  langchain_openai==0.1.7
- langgraph==0.0.55
+ langgraph==0.2.70
  pinecone-client==4.1.0
  sentence-transformers==2.6.0
  huggingface-hub
@@ -19,3 +19,5 @@ langchain-community==0.2
  msal==1.31
  matplotlib==3.9.2
  gradio-modal==0.0.4
+ vanna==0.7.5
+ geopy==2.4.1
sandbox/20241104 - CQA - StepByStep CQA.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
sandbox/talk_to_data/20250306 - CQA - Drias.ipynb ADDED
@@ -0,0 +1,94 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Import the function from main.py"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "import os\n",
+     "sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))\n",
+     "\n",
+     "%load_ext autoreload\n",
+     "%autoreload 2\n",
+     "\n",
+     "from climateqa.engine.talk_to_data.main import ask_vanna\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Create a human query"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# query = \"Compare the winter and summer precipitation in 2050 in Marseille\"\n",
+     "# query = \"What is the impact of climate in Bordeaux?\"\n",
+     "# query = \"what is the number of days where the temperature above 35 in 2050 in Marseille\"\n",
+     "# query = \"Quelle sera la température à Marseille sur les prochaines années ?\"\n",
+     "# query = \"Comment vont évoluer les températures à Marseille ?\"\n",
+     "query = \"Comment vont évoluer les températures à marseille ?\""
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Call ask_vanna: it returns a tuple of the SQL query, the result dataframe, and the figure"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "sql_query, df, fig = ask_vanna(query)\n",
+     "print(df.head())\n",
+     "fig.show()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "climateqa",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
sandbox/talk_to_data/20250306 - CQA - Step_by_step_vanna.ipynb ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import os\n",
+ "sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))\n",
+ "\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "from climateqa.engine.talk_to_data.main import ask_vanna\n",
+ "\n",
+ "import sqlite3\n",
+ "import os\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from climateqa.engine.talk_to_data.myVanna import MyVanna\n",
+ "from climateqa.engine.talk_to_data.utils import loc2coords, detect_location_with_openai, detectTable, nearestNeighbourSQL, detect_relevant_tables, replace_coordonates\n",
+ "\n",
+ "from climateqa.engine.llm import get_llm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Vanna Ask\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dotenv import load_dotenv\n",
+ "\n",
+ "load_dotenv()\n",
+ "\n",
+ "llm = get_llm(provider=\"openai\")\n",
+ "\n",
+ "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n",
+ "PC_API_KEY = os.getenv('VANNA_PINECONE_API_KEY')\n",
+ "INDEX_NAME = os.getenv('VANNA_INDEX_NAME')\n",
+ "VANNA_MODEL = os.getenv('VANNA_MODEL')\n",
+ "\n",
+ "ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))\n",
+ "\n",
+ "#Vanna object\n",
+ "vn = MyVanna(config = {\"temperature\": 0, \"api_key\": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME, \"top_k\" : 4})\n",
+ "db_vanna_path = ROOT_PATH + \"/data/drias/drias.db\"\n",
+ "vn.connect_to_sqlite(db_vanna_path)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# User query"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# query = \"Quelle sera la température à Marseille sur les prochaines années ?\"\n",
+ "query = \"Comment vont évoluer les températures à marseille ?\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Detect location"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "location = detect_location_with_openai(OPENAI_API_KEY, query)\n",
+ "print(location)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Convert location to longitude, latitude coordinates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coords = loc2coords(location)\n",
+ "user_input = query.lower().replace(location.lower(), f\"lat, long : {coords}\")\n",
+ "print(user_input)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Find closest coordinates and replace lat,lon\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "relevant_tables = detect_relevant_tables(user_input, llm) \n",
+ "coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, relevant_tables[i]) for i in range(len(relevant_tables))]\n",
+ "user_input_with_coords = replace_coordonates(coords, user_input, coords_tables)\n",
+ "print(user_input_with_coords)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ask Vanna with correct coordinates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql_query, result_dataframe, figure = vn.ask(user_input_with_coords, print_results=False, allow_llm_to_see_data=True, auto_train=False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "result_dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "figure"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "climateqa",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
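
The cells above chain five helpers into a single flow: extract the place name, geocode it, splice the raw coordinates into the question, snap them to the nearest grid point stored in each relevant table, and only then hand the rewritten question to Vanna. Below is a condensed sketch of that flow as one function, reusing the notebook's imports; the helper signatures are inferred from the calls above rather than from a documented API:

```python
# Condensed sketch of the notebook's step-by-step flow. All helper
# signatures are inferred from the cells above (assumptions, not a
# documented API).
from climateqa.engine.talk_to_data.utils import (
    loc2coords, detect_location_with_openai,
    nearestNeighbourSQL, detect_relevant_tables, replace_coordonates,
)

def ask_drias(vn, db_vanna_path, llm, openai_api_key, query):
    # 1. Extract the place name mentioned in the user question
    location = detect_location_with_openai(openai_api_key, query)
    # 2. Geocode it and splice the raw (lat, lon) pair into the question
    coords = loc2coords(location)
    user_input = query.lower().replace(location.lower(), f"lat, long : {coords}")
    # 3. Snap to the nearest coordinates actually present in each relevant table
    tables = detect_relevant_tables(user_input, llm)
    coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, t) for t in tables]
    user_input = replace_coordonates(coords, user_input, coords_tables)
    # 4. Vanna generates and runs the SQL, returning (sql, dataframe, figure)
    return vn.ask(user_input, print_results=False,
                  allow_llm_to_see_data=True, auto_train=False)
```

Packaging the steps as one function changes nothing about the logic; it only makes the coordinate-snapping order explicit and lets the same flow be rerun with a different `query` in one line.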
style.css CHANGED
@@ -1,89 +1,127 @@
-
 /* :root {
 --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
- } */
 
- #tab-recommended_content{
- padding-top: 0px;
- padding-left : 0px;
- padding-right: 0px;
 }
 #group-subtabs {
 /* display: block; */
- width: 100%; /* Ensures the parent uses the full width */
 position : sticky;
 }
 
- #group-subtabs .tab-container {
- display: flex;
- text-align: center;
- width: 100%; /* Ensures the tabs span the full width */
- }
 
- #group-subtabs .tab-container button {
- flex: 1; /* Makes each button take equal width */
 }
 
 
- #papers-summary-popup button span{
- /* make label of accordio in bold, center, and bigger */
- font-size: 16px;
 font-weight: bold;
- text-align: center;
 
 }
 
- #papers-relevant-popup span{
- /* make label of accordio in bold, center, and bigger */
- font-size: 16px;
- font-weight: bold;
- text-align: center;
 }
 
 
 
- #tab-citations .button{
- padding: 12px 16px;
- font-size: 16px;
 font-weight: bold;
- cursor: pointer;
- border: none;
- outline: none;
 text-align: left;
- transition: background-color 0.3s ease;
 }
 
 
- .gradio-container {
- width: 100%!important;
- max-width: 100% !important;
 }
 
- /* fix for huggingface infinite growth*/
- main.flex.flex-1.flex-col {
- max-height: 95vh !important;
 }
 
- button#show-figures{
- /* Base styles */
- background-color: #f5f5f5;
- border: 1px solid #e0e0e0;
- border-radius: 4px;
- color: #333333;
- cursor: pointer;
- width: 100%;
- text-align: center;
 }
 
- .avatar-container.svelte-1x5p6hu:not(.thumbnail-item) img {
- width: 100%;
- height: 100%;
- object-fit: cover;
- border-radius: 50%;
- padding: 0px;
- margin: 0px;
 }
 
 .warning-box {
 background-color: #fff3cd;
 border: 1px solid #ffeeba;
@@ -93,32 +131,20 @@ button#show-figures{
 color: #856404;
 display: inline-block;
 margin-bottom: 15px;
- }
-
 
 .tip-box {
 background-color: #f0f9ff;
 border: 1px solid #80d4fa;
 border-radius: 4px;
- margin-top:20px;
 padding: 15px 20px;
 font-size: 14px;
 display: inline-block;
- margin-bottom: 15px;
 width: auto;
- color:black !important;
- }
-
- body.dark .warning-box * {
- color:black !important;
- }
-
-
- body.dark .tip-box * {
- color:black !important;
 }
 
-
 .tip-box-title {
 font-weight: bold;
 font-size: 14px;
@@ -130,116 +156,128 @@ body.dark .tip-box * {
 margin-right: 5px;
 }
 
- .gr-box {border-color: #d6c37c}
-
- #hidden-message{
- display:none;
 }
 
- .message{
- font-size:14px !important;
-
- }
- .card-content img {
- display: block;
- margin: auto;
- max-width: 100%; /* Ensures the image is responsive */
- height: auto;
 }
 
- a {
- text-decoration: none;
- color: inherit;
 }
 
- .doc-ref sup{
- color:#dc2626!important;
- /* margin-right:1px; */
 }
 
 
- .card {
- background-color: white;
- border-radius: 10px;
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
- overflow: hidden;
- display: flex;
- flex-direction: column;
- margin:20px;
 }
 
- .card-content {
- padding: 20px;
 }
 
- .card-content h2 {
- font-size: 14px !important;
- font-weight: bold;
- margin-bottom: 10px;
- margin-top:0px !important;
- color:#dc2626!important;;
 }
 
- .card-content p {
- font-size: 12px;
- margin-bottom: 0;
 }
 
- .card-footer {
- background-color: #f4f4f4;
- font-size: 10px;
 padding: 10px;
 display: flex;
- justify-content: space-between;
 align-items: center;
 }
 
- .card-footer span {
- flex-grow: 1;
- text-align: left;
- color: #999 !important;
 }
 
- .pdf-link {
- display: inline-flex;
- align-items: center;
- margin-left: auto;
- text-decoration: none!important;
- font-size: 14px;
 }
 
-
-
- .message.user{
- /* background-color:#7494b0 !important; */
- border:none;
- /* color:white!important; */
 }
 
- .message.bot{
- /* background-color:#f2f2f7 !important; */
- border:none;
 }
 
-
- label.selected{
- background: #93c5fd !important;
 }
 
- #submit-button{
- padding:0px !important;
 }
 
- #modal-config .block.modal-block.padded {
- padding-top: 25px;
- height: 100vh;
-
- }
- #modal-config .modal-container{
- margin: 0px;
- padding: 0px;
 }
- /* Modal styles */
 #modal-config {
 position: fixed;
 top: 0;
@@ -252,28 +290,23 @@ label.selected{
 padding: 15px;
 transform: none;
 }
- #modal-config .close{
- display: none;
 }
 
- /* Push main content to the right when modal is open */
- /* .modal ~ * {
- margin-left: 300px;
- transition: margin-left 0.3s ease;
- } */
 
- #modal-config .modal .wrap ul{
- position:static;
- top: 100%;
- left: 0;
- /* min-height: 100px; */
- height: 100%;
- /* margin-top: 0; */
- z-index: 9999;
- pointer-events: auto;
- height: 200px;
 }
- #config-button{
 background: none;
 border: none;
 padding: 8px;
@@ -296,155 +329,230 @@ label.selected{
 background-color: rgba(0, 0, 0, 0.1);
 }
 
- #checkbox-config{
- display: block;
- position: absolute;
- background: none;
 border: none;
- padding: 8px;
 cursor: pointer;
- width: 40px;
- height: 40px;
- display: flex;
- align-items: center;
- justify-content: center;
- border-radius: 50%;
- transition: background-color 0.2s;
- font-size: 20px;
 text-align: center;
 }
- #checkbox-config:checked{
- display: block;
 }
 
 
 
- @media screen and (min-width: 1024px) {
- /* Additional style for scrollable tab content */
- /* div#tab-recommended_content {
- overflow-y: auto;
- max-height: 80vh;
- } */
 
- .gradio-container {
- max-height: calc(100vh - 190px) !important;
- overflow: hidden;
- }
- /* div#chatbot{
- height:calc(100vh - 170px) !important;
- max-height:calc(100vh - 170px) !important;
 
- } */
 
 
-
- div#tab-examples{
- height:calc(100vh - 190px) !important;
- overflow-y: scroll !important;
- /* overflow-y: auto; */
- }
 
- div#sources-textbox{
- height:calc(100vh - 190px) !important;
- overflow-y: scroll !important;
- /* overflow-y: auto !important; */
- }
- div#graphs-container{
- height:calc(100vh - 210px) !important;
- overflow-y: scroll !important;
- }
 
- div#sources-figures{
- height:calc(100vh - 300px) !important;
- max-height: 90vh !important;
- overflow-y: scroll !important;
- }
 
- div#graphs-container{
- height:calc(100vh - 300px) !important;
- max-height: 90vh !important;
- overflow-y: scroll !important;
- }
 
- div#tab-citations{
- height:calc(100vh - 300px) !important;
- max-height: 90vh !important;
 overflow-y: scroll !important;
 }
-
- div#tab-config{
- height:calc(100vh - 190px) !important;
 overflow-y: scroll !important;
- /* overflow-y: auto !important; */
 }
 
- /* Force container to respect height limits */
- .main-component{
- contain: size layout;
- overflow: hidden;
 }
 
-
- div#chatbot-row{
- max-height:calc(100vh - 90px) !important;
 }
- /*
-
 
- .max-height{
- height:calc(100vh - 90px) !important;
- max-height:calc(100vh - 90px) !important;
 overflow-y: auto;
 }
- */
-
- }
-
- footer {
- visibility: hidden;
- display:none !important;
 }
 
-
 @media screen and (max-width: 767px) {
- /* Your mobile-specific styles go here */
-
- div#chatbot{
- height:500px !important;
 }
 
- #submit-button{
- padding:0px !important;
 min-width: 80px;
 }
 
- /* This will hide all list items */
 div.tab-nav button {
 display: none !important;
 }
 
- /* This will show only the first list item */
- div.tab-nav button:first-child {
- display: block !important;
- }
-
- /* This will show only the first list item */
 div.tab-nav button:nth-child(2) {
 display: block !important;
 }
-
- #right-panel button{
 display: block !important;
 }
 
- /* ... add other mobile-specific styles ... */
 }
 
 @media (prefers-color-scheme: dark) {
- .card{
 background-color: #374151;
 }
- .card-image > .card-content{
 background-color: rgb(55, 65, 81) !important;
 }
 
@@ -452,251 +560,61 @@ footer {
 background-color: #404652;
 }
 
- .container > .wrap{
 background-color: #374151 !important;
- color:white !important;
 }
- .card-content h2{
- color:#e7754f !important;
- }
- .doc-ref sup{
- color:rgb(235 109 35)!important;
- /* margin-right:1px; */
 }
 .card-footer span {
- color:white !important;
 }
-
- }
-
-
- .doc-ref{
- color:#dc2626!important;
- margin-right:1px;
- }
-
- .tabitem{
- border:none !important;
- }
-
- .other-tabs > div{
- padding-left:40px;
- padding-right:40px;
- padding-top:10px;
- }
-
- .gallery-item > div{
- white-space: normal !important; /* Allow the text to wrap */
- word-break: break-word !important; /* Break words to prevent overflow */
- overflow-wrap: break-word !important; /* Break long words if necessary */
- }
-
- span.chatbot > p > img{
- margin-top:40px !important;
- max-height: none !important;
- max-width: 80% !important;
- border-radius:0px !important;
- }
-
-
- .chatbot-caption{
- font-size:11px;
- font-style:italic;
- color:#508094;
- }
-
- .ai-generated{
- font-size:11px!important;
- font-style:italic;
- color:#73b8d4 !important;
- }
-
- .card-image > .card-content{
- background-color:#f1f7fa;
- }
-
-
-
- .tab-nav > button.selected{
- color:#4b8ec3;
- font-weight:bold;
- border:none;
- }
-
- .tab-nav{
- border:none !important;
- }
-
- #input-textbox > label > textarea{
- border-radius:40px;
- padding-left:30px;
- resize:none;
- }
-
- #input-message > div{
- border:none;
- }
-
- #dropdown-samples{
-
- background:none !important;
-
- }
-
- #dropdown-samples > .container > .wrap{
- background-color:white;
- }
-
 
- #tab-examples > div > .form{
- border:none;
- background:none !important;
- }
 
- .a-doc-ref{
- text-decoration: none !important;
 }
 
-
- .dropdown {
- position: relative;
- display:inline-block;
- margin-bottom: 10px;
- }
-
- .dropdown-toggle {
- background-color: #f2f2f2;
- color: black;
- padding: 10px;
- font-size: 16px;
- cursor: pointer;
- display: block;
- width: 400px; /* Adjust width as needed */
- position: relative;
- display: flex;
- align-items: center; /* Vertically center the contents */
- justify-content: left;
- }
-
- .dropdown-toggle .caret {
- content: "";
- position: absolute;
- right: 10px;
- top: 50%;
- border-left: 5px solid transparent;
- border-right: 5px solid transparent;
- border-top: 5px solid black;
- transform: translateY(-50%);
- }
-
- input[type="checkbox"] {
- display: none !important;
- }
-
- input[type="checkbox"]:checked + .dropdown-content {
 display: block;
- }
-
- #checkbox-chat input[type="checkbox"] {
- display: flex !important;
- }
-
- .dropdown-content {
- display: none;
 position: absolute;
- background-color: #f9f9f9;
- min-width: 300px;
- box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
- z-index: 1;
- padding: 12px;
- border: 1px solid #ccc;
- }
-
- input[type="checkbox"]:checked + .dropdown-toggle + .dropdown-content {
- display: block;
- }
-
- input[type="checkbox"]:checked + .dropdown-toggle .caret {
- border-top: 0;
- border-bottom: 5px solid black;
- }
-
- .loader {
- border: 1px solid #d0d0d0 !important; /* Light grey background */
- border-top: 1px solid #db3434 !important; /* Blue color */
- border-right: 1px solid #3498db !important; /* Blue color */
 border-radius: 50%;
- width: 20px;
- height: 20px;
- animation: spin 2s linear infinite;
- display:inline-block;
- margin-right:10px !important;
- }
-
- .checkmark{
- color:green !important;
- font-size:18px;
- margin-right:10px !important;
- }
-
- @keyframes spin {
- 0% { transform: rotate(0deg); }
- 100% { transform: rotate(360deg); }
- }
-
-
- .relevancy-score{
- margin-top:10px !important;
- font-size:10px !important;
- font-style:italic;
- }
-
- .score-green{
- color:green !important;
- }
-
- .score-orange{
- color:orange !important;
- }
-
- .score-red{
- color:red !important;
- }
-
- /* Mobile specific adjustments */
- @media screen and (max-width: 767px) {
- div#tab-recommended_content {
- max-height: 50vh; /* Reduce height for smaller screens */
- overflow-y: auto;
- }
 }
 
- /* Additional style for scrollable tab content */
- div#tab-saved-graphs {
- overflow-y: auto; /* Enable vertical scrolling */
- max-height: 80vh; /* Adjust height as needed */
 }
 
- /* Mobile specific adjustments */
- @media screen and (max-width: 767px) {
- div#tab-saved-graphs {
- max-height: 50vh; /* Reduce height for smaller screens */
- overflow-y: auto;
- }
 }
- .message-buttons-left.panel.message-buttons.with-avatar {
- display: none;
 }
-
-
- /* Specific fixes for Hugging Face Space iframe */
- .h-full {
- height: auto !important;
- min-height: 0 !important;
- }
-
- .space-content {
- height: auto !important;
- max-height: 100vh !important;
- overflow: hidden;
 }
@@ -1,89 +1,127 @@
+ /* Root Variables */
 /* :root {
 --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
+ } */
+
+ /* Layout & Container Styles */
+ .gradio-container {
+ width: 100% !important;
+ max-width: 100% !important;
+ }
+
+ main.flex.flex-1.flex-col {
+ max-height: 95vh !important;
+ }
+
+ .main-component {
+ contain: size layout;
+ overflow: hidden;
+ }
 
+ /* Tab Styles */
+ #tab-recommended_content {
+ padding: 0;
 }
+
 #group-subtabs {
 /* display: block; */
 position : sticky;
 }
 
 
 }
 
+ .tab-nav {
+ border: none !important;
+ }
 
+ .tab-nav > button.selected {
+ color: #4b8ec3;
 font-weight: bold;
+ border: none;
+ }
 
+ .tabitem {
+ border: none !important;
 }
 
+ .other-tabs > div {
+ padding: 40px 40px 10px;
 }
 
+ /* Card Styles */
+ .card {
+ background-color: white;
+ border-radius: 10px;
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+ overflow: hidden;
+ display: flex;
+ flex-direction: column;
+ margin: 20px;
+ }
 
+ .card-content {
+ padding: 20px;
+ }
 
+ .card-content h2 {
+ font-size: 14px !important;
 font-weight: bold;
+ margin: 0 0 10px !important;
+ color: #dc2626 !important;
+ }
+
+ .card-content p {
+ font-size: 12px;
+ margin-bottom: 0;
+ }
+
+ .card-content img {
+ display: block;
+ margin: auto;
+ max-width: 100%;
+ height: auto;
+ }
+
+ .card-footer {
+ background-color: #f4f4f4;
+ font-size: 10px;
+ padding: 10px;
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ }
+
+ .card-footer span {
+ flex-grow: 1;
 text-align: left;
+ color: #999 !important;
 }
 
+ .card-image > .card-content {
+ background-color: #f1f7fa;
+ }
 
+ /* Message & Chat Styles */
+ .message {
+ font-size: 14px !important;
 }
 
+ .message.user, .message.bot {
+ border: none;
 }
 
+ #input-textbox > label > textarea {
+ border-radius: 40px;
+ padding-left: 30px;
+ resize: none;
 }
 
+ #input-message > div {
+ border: none;
 }
 
+ /* Alert Boxes */
 .warning-box {
 background-color: #fff3cd;
 border: 1px solid #ffeeba;
@@ -93,32 +131,20 @@ button#show-figures{
 color: #856404;
 display: inline-block;
 margin-bottom: 15px;
+ }
 
 .tip-box {
 background-color: #f0f9ff;
 border: 1px solid #80d4fa;
 border-radius: 4px;
+ margin: 20px 0 15px;
 padding: 15px 20px;
 font-size: 14px;
 display: inline-block;
 width: auto;
+ color: black !important;
 }
 
 .tip-box-title {
 font-weight: bold;
 font-size: 14px;
@@ -130,116 +156,128 @@ body.dark .tip-box * {
 margin-right: 5px;
 }
 
+ /* Loader Animation */
+ .loader {
+ border: 1px solid #d0d0d0 !important;
+ border-top: 1px solid #db3434 !important;
+ border-right: 1px solid #3498db !important;
+ border-radius: 50%;
+ width: 20px;
+ height: 20px;
+ animation: spin 2s linear infinite;
+ display: inline-block;
+ margin-right: 10px !important;
 }
 
+ @keyframes spin {
+ 0% { transform: rotate(0deg); }
+ 100% { transform: rotate(360deg); }
 }
 
+ /* PDF Link Styles */
+ .pdf-link {
+ display: inline-flex;
+ align-items: center;
+ margin-left: auto;
+ text-decoration: none!important;
+ font-size: 14px;
 }
 
+ /* Document Reference Styles */
+ .doc-ref sup {
+ color: #dc2626!important;
 }
 
+ .doc-ref {
+ color: #dc2626!important;
+ margin-right: 1px;
+ }
 
+ /* Chatbot & Image Styles */
+ span.chatbot > p > img {
+ margin-top: 40px !important;
+ max-height: none !important;
+ max-width: 80% !important;
+ border-radius: 0px !important;
 }
 
+ .chatbot-caption {
+ font-size: 11px;
+ font-style: italic;
+ color: #508094;
 }
 
+ .ai-generated {
+ font-size: 11px!important;
+ font-style: italic;
+ color: #73b8d4 !important;
 }
 
+ /* Dropdown Styles */
+ .dropdown {
+ position: relative;
+ display: inline-block;
+ margin-bottom: 10px;
 }
 
+ .dropdown-toggle {
+ background-color: #f2f2f2;
+ color: black;
 padding: 10px;
+ font-size: 16px;
+ cursor: pointer;
 display: flex;
+ width: 400px;
 align-items: center;
+ justify-content: left;
+ position: relative;
 }
 
+ .dropdown-toggle .caret {
+ content: "";
+ position: absolute;
+ right: 10px;
+ top: 50%;
+ border-left: 5px solid transparent;
+ border-right: 5px solid transparent;
+ border-top: 5px solid black;
+ transform: translateY(-50%);
 }
 
+ .dropdown-content {
+ display: none;
+ position: absolute;
+ background-color: #f9f9f9;
+ min-width: 300px;
+ box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
+ z-index: 1;
+ padding: 12px;
+ border: 1px solid #ccc;
 }
 
+ /* Checkbox Styles */
+ input[type="checkbox"] {
+ display: none !important;
 }
 
+ #checkbox-chat input[type="checkbox"] {
+ display: flex !important;
 }
 
+ input[type="checkbox"]:checked + .dropdown-content {
+ display: block;
 }
 
+ input[type="checkbox"]:checked + .dropdown-toggle + .dropdown-content {
+ display: block;
 }
 
+ input[type="checkbox"]:checked + .dropdown-toggle .caret {
+ border-top: 0;
+ border-bottom: 5px solid black;
 }
+
+ /* Modal Styles */
 #modal-config {
 position: fixed;
 top: 0;
@@ -252,28 +290,23 @@ label.selected{
 padding: 15px;
 transform: none;
 }
+
+ #modal-config .block.modal-block.padded {
+ padding-top: 25px;
+ height: 100vh;
 }
 
+ #modal-config .modal-container {
+ margin: 0px;
+ padding: 0px;
+ }
 
+ #modal-config .close {
+ display: none;
 }
+
+ /* Config Button Styles */
+ #config-button {
 background: none;
 border: none;
 padding: 8px;
@@ -296,155 +329,230 @@ label.selected{
 background-color: rgba(0, 0, 0, 0.1);
 }
 
+ /* Relevancy Score Styles */
+ .relevancy-score {
+ margin-top: 10px !important;
+ font-size: 10px !important;
+ font-style: italic;
+ }
+
+ .score-green {
+ color: green !important;
+ }
+
+ .score-orange {
+ color: orange !important;
+ }
+
+ .score-red {
+ color: red !important;
+ }
+
+ /* Gallery Styles */
+ .gallery-item > div {
+ white-space: normal !important;
+ word-break: break-word !important;
+ overflow-wrap: break-word !important;
+ }
+
+ /* Avatar Styles */
+ .avatar-container.svelte-1x5p6hu:not(.thumbnail-item) img {
+ width: 100%;
+ height: 100%;
+ object-fit: cover;
+ border-radius: 50%;
+ padding: 0px;
+ margin: 0px;
+ }
+
+ /* Message Button Styles */
+ .message-buttons-left.panel.message-buttons.with-avatar {
+ display: none;
+ }
+
+ /* Checkmark Styles */
+ .checkmark {
+ color: green !important;
+ font-size: 18px;
+ margin-right: 10px !important;
+ }
+
+ /* Papers Summary & Relevant Popup Styles */
+ #papers-summary-popup button span,
+ #papers-relevant-popup span {
+ font-size: 16px;
+ font-weight: bold;
+ text-align: center;
+ }
+
+ /* Citations Tab Button Style */
+ #tab-citations .button {
+ padding: 12px 16px;
+ font-size: 16px;
+ font-weight: bold;
+ cursor: pointer;
 border: none;
+ outline: none;
+ text-align: left;
+ transition: background-color 0.3s ease;
+ }
+
+ /* Show Figures Button Style */
+ button#show-figures {
+ background-color: #f5f5f5;
+ border: 1px solid #e0e0e0;
+ border-radius: 4px;
+ color: #333333;
 cursor: pointer;
+ width: 100%;
 text-align: center;
 }
+
+ /* Gradio Box Style */
+ .gr-box {
+ border-color: #d6c37c;
 }
 
+ /* Hidden Message Style */
+ #hidden-message {
+ display: none;
+ }
 
+ /* Label Selected Style */
+ label.selected {
+ background: #93c5fd !important;
+ }
 
+ /* Submit Button Style */
+ #submit-button {
+ padding: 0px !important;
+ }
 
+ /* Hugging Face Space Fixes */
+ .h-full {
+ height: auto !important;
+ min-height: 0 !important;
+ }
 
+ .space-content {
+ height: auto !important;
+ max-height: 100vh !important;
+ overflow: hidden;
+ }
 
+ /* Dropdown Samples Style */
+ #dropdown-samples {
+ background: none !important;
+ }
 
+ #dropdown-samples > .container > .wrap {
+ background-color: white;
+ }
 
+ /* Tab Examples Form Style */
+ #tab-examples > div > .form {
+ border: none;
+ background: none !important;
+ }
 
+ /* Utility Classes */
+ .hidden {
+ display: none !important;
+ }
 
+ footer {
+ display: none !important;
+ visibility: hidden;
+ }
 
+ a {
+ text-decoration: none;
+ color: inherit;
+ }
+
+ .a-doc-ref {
+ text-decoration: none !important;
+ }
+
+ /* Media Queries */
+ /* Desktop Media Query */
+ @media screen and (min-width: 1024px) {
+ .gradio-container {
+ max-height: calc(100vh - 190px) !important;
+ overflow: hidden;
+ }
+ div#tab-examples,
+ div#sources-textbox,
+ div#tab-config {
+ height: calc(100vh - 190px) !important;
 overflow-y: scroll !important;
 }
+ div#tab-vanna,
+ div#sources-figures,
+ div#graphs-container,
+ div#tab-citations {
+ height: calc(100vh - 300px) !important;
+ max-height: 90vh !important;
 overflow-y: scroll !important;
 }
 
+ div#chatbot-row {
+ max-height: calc(100vh - 90px) !important;
 }
 
+ div#graphs-container {
+ height: calc(100vh - 210px) !important;
+ overflow-y: scroll !important;
 }
 
+ div#tab-saved-graphs {
 overflow-y: auto;
+ max-height: 80vh;
 }
 }
 
+ /* Mobile Media Query */
 @media screen and (max-width: 767px) {
+ div#chatbot {
+ height: 500px !important;
 }
 
+ #submit-button {
+ padding: 0 !important;
 min-width: 80px;
 }
 
 div.tab-nav button {
 display: none !important;
 }
 
+ div.tab-nav button:first-child,
 div.tab-nav button:nth-child(2) {
 display: block !important;
 }
+
+ #right-panel button {
 display: block !important;
 }
 
+ div#tab-recommended_content {
+ max-height: 50vh;
+ overflow-y: auto;
+ }
+
+ div#tab-saved-graphs {
+ max-height: 50vh;
+ overflow-y: auto;
+ }
 }
 
+ /* Dark Mode */
 @media (prefers-color-scheme: dark) {
+ .card {
 background-color: #374151;
 }
+
+ .card-image > .card-content {
 background-color: rgb(55, 65, 81) !important;
 }
 
@@ -452,251 +560,61 @@ footer {
 background-color: #404652;
 }
 
+ .container > .wrap {
 background-color: #374151 !important;
+ color: white !important;
 }
+
+ .card-content h2 {
+ color: #e7754f !important;
 }
+
 .card-footer span {
+ color: white !important;
 }
 
+ body.dark .warning-box *,
+ body.dark .tip-box * {
+ color: black !important;
+ }
 
+ .doc-ref sup {
+ color: rgb(235 109 35)!important;
+ }
 }
 
+ /* Checkbox Config Style */
+ #checkbox-config {
 display: block;
 position: absolute;
+ background: none;
+ border: none;
+ padding: 8px;
+ cursor: pointer;
+ width: 40px;
+ height: 40px;
+ display: flex;
+ align-items: center;
+ justify-content: center;
 border-radius: 50%;
+ transition: background-color 0.2s;
+ font-size: 20px;
+ text-align: center;
 }
 
+ #checkbox-config:checked {
+ display: block;
 }
 
+ #vanna-display {
+ max-height: 300px;
+ /* overflow-y: scroll; */
 }
+ #sql-query{
+ max-height: 100px;
+ overflow-y:scroll;
 }
+ #vanna-details{
+ max-height: 500px;
+ overflow-y:scroll;
 }