{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# Put the parent of the working directory on sys.path before the project imports below.\n",
    "sys.path.append(os.path.dirname(os.getcwd()))\n",
    "\n",
    "# Load variables from a .env file; later cells read index names with os.getenv.\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.llm import get_llm\n",
    "from climateqa.engine.vectorstore import get_pinecone_vectorstore\n",
    "from climateqa.engine.embeddings import get_embeddings_function\n",
    "from climateqa.engine.reranker import get_reranker\n",
    "from climateqa.engine.graph import make_graph_agent, display_graph\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.llm import get_llm\n",
    "\n",
    "# Smoke test: build the LLM client and send it a trivial prompt.\n",
    "llm = get_llm(provider=\"openai\")\n",
    "llm.invoke(\"Say Hello !\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.vectorstore import get_pinecone_vectorstore\n",
    "from climateqa.engine.embeddings import get_embeddings_function\n",
    "\n",
    "question = \"What is the impact of climate change on the environment?\"\n",
    "\n",
    "# Similarity search on the default vector store with the test question.\n",
    "embeddings_function = get_embeddings_function()\n",
    "vectorstore_ipcc = get_pinecone_vectorstore(embeddings_function)\n",
    "docs_question = vectorstore_ipcc.search(query=question, search_type=\"similarity\")\n",
    "docs_question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional metadata filter: keep only entries whose source is in sources_owid.\n",
    "sources_owid = [\"OWID\"]\n",
    "filters = {\"source\": {\"$in\": sources_owid}}\n",
    "\n",
    "# Vector store for graphs: index name comes from the environment, text is read from the \"title\" key.\n",
    "vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv(\"PINECONE_API_INDEX_OWID\"), text_key=\"title\")\n",
    "\n",
    "# Single filtered, scored query (the former unfiltered search was overwritten before being used).\n",
    "owid_graphs = vectorstore_graphs.similarity_search_with_score(query=question, filter=filters, k=5)\n",
    "owid_graphs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv(\"PINECONE_API_INDEX_REGION\"))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reranker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.reranker import get_reranker, rerank_docs\n",
    "\n",
    "# Rerank the documents retrieved above against the same question.\n",
    "reranker = get_reranker(\"nano\")\n",
    "reranked_docs_question = rerank_docs(reranker, docs_question, question)\n",
    "reranked_docs_question"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.graph import make_graph_agent, display_graph, make_graph_agent_poc\n",
    "\n",
    "# Rebuild every dependency here so this section can run without the exploratory cells above.\n",
    "llm = get_llm(provider=\"openai\")\n",
    "embeddings_function = get_embeddings_function()\n",
    "vectorstore_ipcc = get_pinecone_vectorstore(embeddings_function)\n",
    "vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv(\"PINECONE_API_INDEX_OWID\"), text_key=\"title\")\n",
    "vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv(\"PINECONE_API_INDEX_REGION\"))\n",
    "reranker = get_reranker(\"nano\")\n",
    "\n",
    "# Build both graph variants and render them.\n",
    "app = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore_ipcc, vectorstore_graphs=vectorstore_graphs, vectorstore_region=vectorstore_region, reranker=reranker)\n",
    "app2 = make_graph_agent_poc(llm=llm, vectorstore_ipcc=vectorstore_ipcc, vectorstore_graphs=vectorstore_graphs, vectorstore_region=vectorstore_region, reranker=reranker)\n",
    "display_graph(app)\n",
    "display_graph(app2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Node factories and routing helpers used to run the graph one step at a time below.\n",
    "from climateqa.engine.graph import search\n",
    "\n",
    "from climateqa.engine.chains.intent_categorization import make_intent_categorization_node\n",
    "\n",
    "from climateqa.engine.chains.answer_chitchat import make_chitchat_node\n",
    "from climateqa.engine.chains.answer_ai_impact import make_ai_impact_node\n",
    "from climateqa.engine.chains.query_transformation import make_query_transform_node\n",
    "from climateqa.engine.chains.translation import make_translation_node\n",
    "from climateqa.engine.chains.retrieve_documents import make_IPx_retriever_node, make_POC_retriever_node\n",
    "from climateqa.engine.chains.answer_rag import make_rag_node\n",
    "from climateqa.engine.chains.graph_retriever import make_graph_retriever_node\n",
    "from climateqa.engine.chains.chitchat_categorization import make_chitchat_intent_categorization_node\n",
    "from climateqa.engine.chains.prompts import audience_prompts\n",
    "from climateqa.engine.graph import route_intent\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input state for the manual walk-through; `state` is the working copy updated by each node.\n",
    "initial_state = {\n",
    "    \"user_input\": \"What is the impact of climate change on the environment?\",\n",
    "    # \"user_input\": \"Quel est l'impact du changement climatique sur Bordeaux ?\",\n",
    "    \"audience\": audience_prompts[\"general\"],\n",
    "    # \"sources_input\":[\"IPCC\"],\n",
    "    \"relevant_content_sources_selection\": [\"Figures (IPCC/IPBES)\",\"POC region\"],\n",
    "    \"search_only\": False,\n",
    "    \"reports\": [],\n",
    "}\n",
    "state = initial_state.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run intent categorization on the untouched input and merge its output into the working state.\n",
    "cat_node = make_intent_categorization_node(llm)\n",
    "state.update(cat_node(initial_state))\n",
    "state"
   ]
  },
  {
   "cell_type": "code",
"execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# state.update(search(state))\n",
    "# state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Route once and reuse the decision.\n",
    "intent = route_intent(state)\n",
    "\n",
    "if intent == \"translate_query\":\n",
    "    # Merge the node output into the state, as done for the other nodes in this walk-through.\n",
    "    # NOTE(review): assumes the translation node returns a dict of state updates - confirm.\n",
    "    state.update(make_translation_node(llm)(state))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query transformation: later cells read state[\"questions_list\"] and state[\"n_questions\"].\n",
    "state.update(make_query_transform_node(llm)(state))\n",
    "state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.chains.retrieve_documents import retrieve_documents\n",
    "\n",
    "# Retrieval for the first transformed question only, with an empty config.\n",
    "first_question_docs = await retrieve_documents(state[\"questions_list\"][0], {}, \"IPx\", vectorstore_ipcc, reranker)\n",
    "first_question_docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.chains.retrieve_documents import retrieve_documents_for_all_questions\n",
    "\n",
    "source_type = \"IPx\"\n",
    "# Indices of the questions whose source_type matches the one handled here.\n",
    "to_handle_questions_index = [i for i, x in enumerate(state[\"questions_list\"]) if x[\"source_type\"] == source_type]\n",
    "\n",
    "search_figures = \"Figures (IPCC/IPBES)\" in state[\"relevant_content_sources_selection\"]\n",
    "search_only = state[\"search_only\"]\n",
    "reports = state[\"reports\"]\n",
    "questions_list = state[\"questions_list\"]\n",
    "n_questions = state[\"n_questions\"][\"total\"]\n",
    "k_final = 15\n",
    "k_before_reranking = 100\n",
    "\n",
    "retrieval_update = await retrieve_documents_for_all_questions(\n",
    "    search_figures=search_figures,\n",
    "    search_only=search_only,\n",
    "    reports=reports,\n",
    "    questions_list=questions_list,\n",
    "    n_questions=n_questions,\n",
    "    config={},\n",
    "    source_type=source_type,\n",
    "    to_handle_questions_index=to_handle_questions_index,\n",
    "    vectorstore=vectorstore_ipcc,\n",
    "    reranker=reranker,\n",
    "    rerank_by_question=True,\n",
    "    k_final=k_final,\n",
    "    k_before_reranking=k_before_reranking,\n",
    ")\n",
    "state.update(retrieval_update)\n",
    "state"
   ]
  },
  {
   "cell_type":
"code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Graph retrieval step: merge the node output into the working state.\n",
    "state.update(await make_graph_retriever_node(vectorstore_graphs, reranker)(state))\n",
    "state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# RAG answer step, called with an empty config.\n",
    "answer_rag = await make_rag_node(llm)(state, {})\n",
    "state.update(answer_rag)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stream events of the whole chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from climateqa.engine.graph import make_graph_agent, make_graph_agent_poc, display_graph\n",
    "from climateqa.engine.chains.prompts import audience_prompts\n",
    "\n",
    "# Names of the graph nodes whose events are treated as answer output in this section.\n",
    "ANSWER_NODES = [\"answer_rag\", \"answer_rag_no_docs\", \"answer_search\", \"answer_chitchat\"]\n",
    "\n",
    "initial_state = {\n",
    "    \"user_input\": \"Comment le changement climatique m'affectera à Paris?\",\n",
    "    \"audience\": audience_prompts[\"general\"],\n",
    "    \"sources_input\": [\"IPCC\"],\n",
    "    \"relevant_content_sources_selection\": [\"Figures (IPCC/IPBES)\",\"POC region\"],\n",
    "    \"search_only\": False,\n",
    "    \"reports\": [],\n",
    "}\n",
    "# From here on `app` is the POC graph variant.\n",
    "app = make_graph_agent_poc(llm=llm, vectorstore_ipcc=vectorstore_ipcc, vectorstore_graphs=vectorstore_graphs, vectorstore_region=vectorstore_region, reranker=reranker)\n",
    "\n",
    "initial_state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drain the async event stream into a list so it can be inspected repeatedly.\n",
    "event_list = app.astream_events(initial_state, version=\"v1\")\n",
    "static_event_list = []\n",
    "async for event in event_list:\n",
    "    static_event_list.append(event)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_events = pd.DataFrame(static_event_list)\n",
    "# Events whose metadata has no langgraph_node get the string \"None\".\n",
    "df_events[\"node\"] = df_events[\"metadata\"].apply(lambda x: x.get(\"langgraph_node\", \"None\"))\n",
    "# df_events_chat = df_events[(df_events[\"event\"] ==\"on_chat_model_stream\")]\n",
    "df_events_chat = df_events[df_events[\"node\"].isin(ANSWER_NODES)]\n",
    "node_end_answer = df_events_chat[(df_events_chat[\"event\"] == \"on_chain_end\") & (df_events_chat[\"name\"] == \"answer_rag\")]\n",
    "node_end_answer[\"data\"].iloc[0][\"output\"]\n",
    "# df_events_chat[\"data\"].apply(lambda x: x[\"content\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Last on_chain_end event whose output carries a \"related_contents\" entry.\n",
    "chain_end_outputs = df_events[df_events[\"event\"] == \"on_chain_end\"][\"data\"].apply(lambda x: x[\"output\"]).dropna()\n",
    "related_contents_by_event = chain_end_outputs.apply(lambda x: x[\"related_contents\"] if \"related_contents\" in x else None).dropna()\n",
    "# The surviving index holds labels of df_events, hence .loc rather than positional .iloc.\n",
    "ev_rel_doc = df_events.loc[related_contents_by_event.index].iloc[-1]\n",
    "related_content = ev_rel_doc[\"data\"][\"output\"][\"related_contents\"]\n",
    "# [f\"{d.metadata['short_name']} - {d.metadata['name']}\" for d in related_content]\n",
    "related_content[0].metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_events[(df_events[\"event\"] == \"on_chain_end\") & (df_events[\"name\"] == \"transform_query\")][\"data\"].values[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_events[(df_events[\"event\"] == \"on_chain_end\") & (df_events[\"name\"] == \"answer_search\")][\"data\"].values[0][\"input\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "node_end_answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the answer at the end\n",
    "from climateqa.handle_stream_events import stream_answer\n",
    "\n",
    "event_list = app.astream_events(initial_state, version=\"v1\")\n",
    "history = []\n",
    "start_streaming = False\n",
    "answer_message_content = \"\"\n",
    "async for event in event_list:\n",
    "    # Only events tagged with a graph node are considered, so `node` is always bound and current.\n",
    "    if \"langgraph_node\" in event[\"metadata\"]:\n",
    "        node = event[\"metadata\"][\"langgraph_node\"]\n",
    "\n",
    "        if (event[\"name\"] != \"transform_query\" and\n",
    "            event[\"event\"] == \"on_chat_model_stream\" and\n",
    "            node in ANSWER_NODES):\n",
    "            history, start_streaming, answer_message_content = stream_answer(\n",
    "                history, event, start_streaming, answer_message_content\n",
    "            )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test event logs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "initial_state = {\n",
    "    'user_input': 'What is the impact of climate in Bordeaux',\n",
    "    'audience': 'the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.',\n",
    "    'sources_input': ['IPCC'],\n",
    "    'relevant_content_sources_selection': ['Figures (IPCC/IPBES)', 'POC region'],\n",
    "    'search_only': False,\n",
    "    'reports': [],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the non-POC graph and keep every emitted event for offline inspection.\n",
    "app = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore_ipcc, vectorstore_graphs=vectorstore_graphs, vectorstore_region=vectorstore_region, reranker=reranker)\n",
    "\n",
    "event_list = app.astream_events(initial_state, version=\"v1\")\n",
    "static_event_list = []\n",
    "async for event in event_list:\n",
    "    static_event_list.append(event)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_static_events = pd.DataFrame(static_event_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_static_events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_static_events[\"name\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# End-of-chain events emitted under the retrieval node names.\n",
    "selected_events = df_static_events[\n",
    "    (df_static_events[\"event\"] == \"on_chain_end\") &\n",
    "    (df_static_events[\"name\"].isin([\"retrieve_documents\", \"retrieve_local_data\", \"retrieve_POC_docs_node\",\"retrieve_IPx_docs\"]))\n",
    "    # (df_static_events[\"data\"].apply(lambda x: x[\"output\"] is not None))\n",
    "]\n",
    "selected_events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# selected_events[selected_events[\"data\"].apply(lambda x : \"output\" in x and x[\"output\"] is not None)]\n",
    "selected_events[\"data\"].apply(lambda x: x[\"output\"][\"documents\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From here on `selected_events` holds the answer_search end events; the cells below read it.\n",
    "selected_events = df_static_events[\n",
    "    (df_static_events[\"event\"] == \"on_chain_end\") &\n",
    "    (df_static_events[\"name\"].isin([\"answer_search\"]))\n",
    "    # (df_static_events[\"data\"].apply(lambda x: x[\"output\"] is not None))\n",
    "]\n",
    "selected_events[\"metadata\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_events[\"data\"].iloc[0][\"input\"][\"related_contents\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_events[\"data\"].apply(lambda x: x[\"output\"]).iloc[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_events.iloc[0][\"data\"].values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_events.iloc[1][\"data\"].values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(selected_events.iloc[0][\"data\"].values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(selected_events.iloc[1][\"data\"].values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"list(selected_events.iloc[2][\"data\"].values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(selected_events.iloc[3][\"data\"].values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import json\n",
    "\n",
    "# print(json.dumps(list(selected_events.iloc[1][\"data\"].values()), indent=4))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imported here because this is the first cell that uses json; it previously ran before the import.\n",
    "import json\n",
    "\n",
    "data_values = selected_events.iloc[1][\"data\"].values()\n",
    "# default=str keeps the dump from failing on values json cannot serialise natively.\n",
    "formatted_data = json.dumps(list(data_values)[0], indent=4, default=str)\n",
    "print(formatted_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pprint import pprint\n",
    "import json\n",
    "selected_events.iloc[2][\"data\"].values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_events.iloc[3][\"data\"].values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First logged event named retrieve_POC_docs_node, of any event type.\n",
    "df_static_events[df_static_events[\"name\"] == \"retrieve_POC_docs_node\"].iloc[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "climateqa",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}