Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

timeki committed 8 days ago

Commit

bbfd1ce

1 Parent(s): 7287f1d

wip

Browse files

Files changed (2) hide show

climateqa/engine/talk_to_data/main.py +32 -48
climateqa/engine/talk_to_data/test_vanna.ipynb +174 -0

climateqa/engine/talk_to_data/main.py CHANGED Viewed

@@ -24,29 +24,7 @@ vn.connect_to_sqlite(db_vanna_path)
 llm = get_llm(provider="openai")
-# def ask_vanna(query):
-#     location = detect_location_with_openai(OPENAI_API_KEY, query)
-#     if location:
-#         coords = loc2coords(location)
-#         user_input = query.replace(location, f"lat, long : {coords}")
-#         answer = vn.ask(user_input, print_results=False, allow_llm_to_see_data=True)
-#         table = detectTable(answer[0])
-#         coords2 = nearestNeighbourSQL(db_vanna_path, coords, table[0])
-#         query  = answer[0].replace(f"{coords[0]}", f"{coords2[0]}")
-#         sql_query = query.replace(f"{coords[1]}", f"{coords2[1]}")
-#         db = sqlite3.connect(db_vanna_path)
-#         result = db.cursor().execute(sql_query).fetchall()
-#         print(result)
-#         df = pd.DataFrame(result, columns=answer[1].columns)
-#     else:
-#         answer = vn.ask(query, visualize=True, print_results=False, allow_llm_to_see_data=True)
-#         sql_query = answer[0]
-#         df = answer[1]
-#     return (sql_query, df)
 def replace_coordonates(coords, sql_query, coords_tables):
     n = sql_query.count(str(coords[0]))
     sql_query_new_coords = sql_query
@@ -57,34 +35,40 @@ def replace_coordonates(coords, sql_query, coords_tables):
     return sql_query_new_coords
 def ask_vanna(query):
-    location = detect_location_with_openai(OPENAI_API_KEY, query)
-    if location:
-        coords = loc2coords(location)
-        user_input = query.replace(location, f"lat, long : {coords}")
-        sql_query, result_dataframe, figure = vn.ask(user_input, print_results=False, allow_llm_to_see_data=True)
-        table = detectTable(sql_query)
-        coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, table[i]) for i in range(len(table))]
-        sql_query_new_coords = replace_coordonates(coords, sql_query, coords_tables)
-        sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query_new_coords}. Just answer the query. The answer should not include ```sql\n").content
-        print("execute sql query : ", sql_with_table_names)
-        db = sqlite3.connect(db_vanna_path)
-        result = db.cursor().execute(sql_query_new_coords).fetchall()
-        columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query_new_coords}").content
-        columns_list = ast.literal_eval(columns.strip("```python\n").strip())
-        print("column list : ",columns_list)
-        df = pd.DataFrame(result, columns=columns_list)
-        plotly_code = vn.generate_plotly_code(
-                            question="query",
-                            sql="sql_with_table_names",
-                            df_metadata=f"Running df.dtypes gives:\n {df.dtypes}",
-                        )
-        fig = vn.get_plotly_figure(plotly_code=plotly_code, df=df)
-        return df, fig
-    else :
         empty_df = pd.DataFrame()
         empty_fig = {}
-        return empty_df, empty_fig

 llm = get_llm(provider="openai")
 def replace_coordonates(coords, sql_query, coords_tables):
     n = sql_query.count(str(coords[0]))
     sql_query_new_coords = sql_query
     return sql_query_new_coords
 def ask_vanna(query):
     """Answer a location-based question with a result table and a chart.

     The location detected in ``query`` is replaced by its coordinates, ``vn``
     generates SQL for the rewritten question, ``replace_coordonates`` rewrites
     the coordinates inside that SQL, and the resulting statement is executed
     against the SQLite database at ``db_vanna_path``.

     Args:
         query: The user's question as plain text.

     Returns:
         A ``(df, fig)`` tuple: the query result as a ``pandas.DataFrame`` and
         the figure returned by ``vn.get_plotly_figure``. When no location is
         detected, or when any step raises, an empty ``DataFrame`` and an
         empty dict are returned instead (the error is printed, not raised).
     """
     try:
         location = detect_location_with_openai(OPENAI_API_KEY, query)
         if not location:
             # Nothing to anchor the query on: same empty result as a failure.
             return pd.DataFrame(), {}

         coords = loc2coords(location)
         user_input = query.replace(location, f"lat, long : {coords}")
         # Only the generated SQL is used; the dataframe and figure that
         # vn.ask also returns are discarded because the SQL is rewritten
         # and re-run below.
         sql_query, _, _ = vn.ask(user_input, print_results=False, allow_llm_to_see_data=True)

         tables = detectTable(sql_query)
         # One lookup per detected table.
         # NOTE(review): presumably returns the closest coordinates stored in
         # each table -- confirm against nearestNeighbourSQL.
         coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, name) for name in tables]
         sql_query_new_coords = replace_coordonates(coords, sql_query, coords_tables)

         # NOTE(review): this variant is only logged; the statement that is
         # actually executed below is sql_query_new_coords. Confirm which one
         # is meant to run.
         sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query_new_coords}. Just answer the query. The answer should not include ```sql\n").content
         print("execute sql query : ", sql_with_table_names)

         db = sqlite3.connect(db_vanna_path)
         try:
             result = db.cursor().execute(sql_query_new_coords).fetchall()
         finally:
             # Release the connection even when the generated SQL fails.
             db.close()

         # NOTE(review): sqlite3's cursor.description also exposes the column
         # names and would avoid this extra LLM round-trip.
         columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query_new_coords}").content
         # str.strip(chars) removes a *set* of characters from both ends, which
         # is enough to peel a Markdown code fence off a bracketed list.
         columns_list = ast.literal_eval(columns.strip("```python\n").strip())
         print("column list : ", columns_list)
         df = pd.DataFrame(result, columns=columns_list)

         # Pass the real question and the SQL that produced df; the previous
         # code passed the literal strings "query" / "sql_with_table_names".
         plotly_code = vn.generate_plotly_code(
             question=query,
             sql=sql_query_new_coords,
             df_metadata=f"Running df.dtypes gives:\n {df.dtypes}",
         )
         fig = vn.get_plotly_figure(plotly_code=plotly_code, df=df)
         return df, fig
     except Exception as e:
         # Deliberate best-effort boundary: callers always get a (df, fig) pair.
         print(f"Error: {e}")
         return pd.DataFrame(), {}

climateqa/engine/talk_to_data/test_vanna.ipynb ADDED Viewed

	@@ -0,0 +1,174 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "from main import ask_vanna\n",
+    "import sqlite3\n",
+    "import os\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "table_names_list = [\n",
+    "    \"Frequency_of_rainy_days_index\",\n",
+    "    \"Winter_precipitation_total\",\n",
+    "    \"Summer_precipitation_total\",\n",
+    "    \"Annual_precipitation_total\",\n",
+    "    \"Remarkable_daily_precipitation_total_(Q99)\",\n",
+    "    \"Frequency_of_remarkable_daily_precipitation\",\n",
+    "    \"Extreme_precipitation_intensity\",\n",
+    "    \"Mean_winter_temperature\",\n",
+    "    \"Mean_summer_temperature\",\n",
+    "    \"Number_of_tropical_nights\",\n",
+    "    \"Maximum_summer_temperature\",\n",
+    "    \"Number_of_days_with_Tx_above_30C\",\n",
+    "    \"Number_of_days_with_Tx_above_35C\",\n",
+    "    \"Drought_index\"\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from climateqa.engine.llm import get_llm\n",
+    "\n",
+    "llm = get_llm(provider=\"openai\")\n",
+    "user_question = \"Quel sera la température à Marseille dans les prochaines années ?\"\n",
+    "prompt = f\"You are helping to build a sql query to retrieve relevant data for a user question. The different tables are {table_names_list}. The user question is {user_question}. Write the relevant table to query. Answer only the table name.\"\n",
+    "table_name = llm.invoke(prompt).content\n",
+    "# llm.invoke(f\"Make the following sql query display the source table in the rows {sql_query_new_coords}. Just answer the query. The answer should not include ```sql\\n\").content\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = {\"Mean_summer_temperature\": {\n",
+    "            \"description\": (\n",
+    "                \"The Mean summer temperature table contains information on the average summer temperature in the past and the future. \"\n",
+    "                \"The variables are as follows:\\n\"\n",
+    "                \"- 'y' and 'x': Lambert Paris II coordinates for the location.\\n\"\n",
+    "                \"- year: Year of the observation.\\n\"\n",
+    "                \"- month : Month of the observation.\\n\"\n",
+    "                \"- day: Day of the observation.\\n\"\n",
+    "                \"- 'LambertParisII': Indicates that the x and y coordinates are in Lambert Paris II projection.\\n\"\n",
+    "                \"- 'lat' and 'lon': Latitude and longitude of the location.\\n\"\n",
+    "                \"- 'TMm': Average summer temperature.\\n\"\n",
+    "            ),\n",
+    "            \"sql_query\": \"\"\"\n",
+    "                CREATE TABLE Mean_summer_temperature (\n",
+    "                    y FLOAT,\n",
+    "                    x FLOAT,\n",
+    "                    year INT,\n",
+    "                    month INT, \n",
+    "                    day INT,\n",
+    "                    LambertParisII VARCHAR(255),\n",
+    "                    lat FLOAT,\n",
+    "                    lon FLOAT,\n",
+    "                    TMm FLOAT, -- Température moyenne en été\n",
+    "                );\n",
+    "            \"\"\"}\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from climateqa.engine.talk_to_data.utils import loc2coords\n",
+    "location = \"Marseille\"\n",
+    "coords = loc2coords(location)\n",
+    "user_input = user_question.replace(location, f\"lat, long : {coords}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "initial_prompt = f\"You are a mysql expert. \" + \\\n",
+    "                \"Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \"\n",
+    "initial_prompt += f\"\\n===Tables \\n + {docs[table_name]['sql_query']}\"\n",
+    "initial_prompt += f\"\\n===Additional Context \\n\\n {docs[table_name]['description']}\"\n",
+    "initial_prompt += (\n",
+    "                \"===Response Guidelines \\n\"\n",
+    "                \"1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \\n\"\n",
+    "                \"2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \\n\"\n",
+    "                \"3. If the provided context is insufficient, please give a sql query based on your knowledge and the context provided. \\n\"\n",
+    "                \"4. Please use the most relevant table(s). \\n\"\n",
+    "                \"5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \\n\"\n",
+    "                f\"6. Ensure that the output SQL is mysql-compliant and executable, and free of syntax errors. \\n\"\n",
+    "            )\n",
+    "initial_prompt += f\"\\n===Question \\n\\n {user_input}\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sql_query = llm.invoke(initial_prompt).content\n",
+    "sql_query"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Vanna ask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from climateqa.engine.llm import get_llm\n",
+    "import ast\n",
+    "\n",
+    "llm = get_llm(provider=\"openai\")\n",
+    "columns = llm.invoke(f\"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query SELECT 'Mean_winter_temperature' AS source_table, year, month, day, TMm FROM Mean_winter_temperature WHERE lat = 43.166954040527344 AND lon = 5.430534839630127;\").content\n",
+    "columns_list = ast.literal_eval(columns.strip(\"```python\\n\").strip())\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "climateqa",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}