{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import langchain\n",
    "from langchain.agents import OpenAIFunctionsAgent, AgentExecutor\n",
    "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain.tools import PythonAstREPLTool\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from pydantic import BaseModel, Field\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "langchain.debug = True\n",
    "data_dir_path = os.path.join(os.getcwd())\n",
    "pd.set_option('display.max_rows', 20)\n",
    "pd.set_option('display.max_columns', 20)\n",
    "\n",
    "NUM_ROWS_IN_HEAD = 5\n",
    "\n",
    "# {dataframe_heads_str}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT_TEMPLATE = \"\"\"You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\n",
    "The ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\n",
    "\n",
    "This is the result of running `df.head().to_markdown()` on each of the dataframes:\n",
    "\n",
    "{dataframe_heads_str}\n",
    "You can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\n",
    "\n",
    "Please work step by step through this process. You can make intermediate queries, validate your logic, and then move on to the next step.\n",
    "\n",
    "Be precise, analytical, thorough.\n",
    "\n",
    "Here is a history of the conversation with the user so far:\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PythonInputs(BaseModel):\n",
    "    query: str = Field(description=\"code snippet to run\")\n",
    "\n",
    "format_df_for_prompt = lambda df: f'<df>\\n{df.head(NUM_ROWS_IN_HEAD).to_markdown()}\\n</df>'\n",
    "\n",
    "entries_a_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))\n",
    "entries_b_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))\n",
    "template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))\n",
    "\n",
    "df_name_to_df_map = {\"source_df_1\": entries_a_df, \"source_df_2\": entries_b_df, \"template_df\": template_df}\n",
    "\n",
    "dataframe_heads_str_list: str = []\n",
    "for df_name, df in df_name_to_df_map.items():\n",
    "    dataframe_heads_str_list.append(f'<{df_name}>\\n{df.head(NUM_ROWS_IN_HEAD).to_markdown()}\\n</{df_name}>')\n",
    "\n",
    "prompt_template = PROMPT_TEMPLATE.format(dataframe_heads_str=\"\\n\\n\".join(dataframe_heads_str_list))\n",
    "\n",
    "prompt = ChatPromptTemplate.from_messages([\n",
    "    (\"system\", prompt_template),\n",
    "    MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
    "    (\"human\", \"{input}\")\n",
    "])\n",
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n",
    "\n",
    "repl = PythonAstREPLTool(locals=df_name_to_df_map, name=\"python_repl\",\n",
    "                            description=\"Runs code and returns the output of the final line\",\n",
    "                            args_schema=PythonInputs)\n",
    "tools = [repl]\n",
    "agent = OpenAIFunctionsAgent(llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\"), prompt=prompt, tools=tools, memory=memory, handle_parsing_errors=True)\n",
    "agent_executor = AgentExecutor(agent=agent, tools=tools, max_iterations=5, early_stopping_method=\"generate\", handle_parsing_errors=True)"
   ]
  },
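  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check, not part of the original run: confirm the REPL tool can see the\n",
    "# dataframes passed in via `locals`. This assumes the standard LangChain tool\n",
    "# interface, where `repl.run(...)` executes the snippet and returns the value of\n",
    "# the final expression as a string. Column names only, so it stays within the\n",
    "# prompt's 5-row guideline.\n",
    "for df_name in df_name_to_df_map:\n",
    "    print(df_name, repl.run(f\"list({df_name}.columns)\"))"
   ]
  },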
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor] Entering Chain run with input:\n",
      "\u001b[0m{\n",
      "  \"input\": \"What are the key differences between the dataframe schemas?\",\n",
      "  \"chat_history\": []\n",
      "}\n",
      "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:\n",
      "\u001b[0m{\n",
      "  \"prompts\": [\n",
      "    \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\n</source_df_1>\\n\\n<source_df_2>\\n|    | Date_of_Case   |   Fee | FullName     | CaseNumber   | CaseKind   | Location   |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n|  0 | 2023/05/12     |   100 | Miguel Kim   | CASE-8206    | Civil      | BOST       |\\n|  1 | 2023/04/20     |   150 | John Lee     | CASE-4328    | Criminl    | HOUST      |\\n|  2 | 2023/02/10     |   200 | Dmitri Smith | CASE-1915    | Criminal   | CHIC       |\\n|  3 | 2023/03/16     |   100 | Dmitri Patel | CASE-4283    | Criminal   | BOSTO      |\\n|  4 | 2023/06/15     |   200 | Jane Ivanov  | CASE-7732    | Family     | HOUST      |\\n</source_df_2>\\n\\n<template_df>\\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n|  0 | 2023-05-12 | Miguel Kim   | Civil      | CASE-6761 |   100 | Boston         |\\n|  1 | 2023-04-20 | John Lee     | Criminl    | CASE-6089 |   150 | Houston        |\\n|  2 | 2023-02-10 | Dmitri Smith | Criminal   | CASE-9565 |   200 | Chicago        |\\n|  3 | 2023-03-16 | Dmitri Patel | Criminal   | CASE-6222 |   100 | Boston         |\\n|  4 | 2023-06-15 | Jane Ivanov  | Family     | CASE-2702 |   200 | Houston        |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nHuman: What are the key differences between the dataframe schemas?\"\n",
      "  ]\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [16.60s] Exiting LLM run with output:\n",
      "\u001b[0m{\n",
      "  \"generations\": [\n",
      "    [\n",
      "      {\n",
      "        \"text\": \"\",\n",
      "        \"generation_info\": {\n",
      "          \"finish_reason\": \"function_call\"\n",
      "        },\n",
      "        \"message\": {\n",
      "          \"lc\": 1,\n",
      "          \"type\": \"constructor\",\n",
      "          \"id\": [\n",
      "            \"langchain\",\n",
      "            \"schema\",\n",
      "            \"messages\",\n",
      "            \"AIMessage\"\n",
      "          ],\n",
      "          \"kwargs\": {\n",
      "            \"content\": \"\",\n",
      "            \"additional_kwargs\": {\n",
      "              \"function_call\": {\n",
      "                \"name\": \"python_repl\",\n",
      "                \"arguments\": \"{\\n  \\\"query\\\": \\\"import pandas as pd\\\\n\\\\nsource_df_1 = pd.DataFrame({'case_date': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'lastname': ['Kim', 'Lee', 'Smith', 'Patel', 'Ivanov'], 'firstname': ['Miguel', 'John', 'Dmitri', 'Dmitri', 'Jane'], 'case_type': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'case_id': ['CR-1095', 'CR-8597', 'CR-6833', 'CR-2899', 'CR-5997'], 'court_fee': [100, 150, 200, 100, 200], 'jurisdiction': ['BOSTON', 'houston', 'chicago', 'BOSTON', 'houston']})\\\\n\\\\nsource_df_2 = pd.DataFrame({'Date_of_Case': ['2023/05/12', '2023/04/20', '2023/02/10', '2023/03/16', '2023/06/15'], 'Fee': [100, 150, 200, 100, 200], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseNumber': ['CASE-8206', 'CASE-4328', 'CASE-1915', 'CASE-4283', 'CASE-7732'], 'CaseKind': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'Location': ['BOST', 'HOUST', 'CHIC', 'BOSTO', 'HOUST']})\\\\n\\\\ntemplate_df = pd.DataFrame({'CaseDate': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseType': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'CaseID': ['CASE-6761', 'CASE-6089', 'CASE-9565', 'CASE-6222', 'CASE-2702'], 'Fee': [100, 150, 200, 100, 200], 'Jurisdiction': ['Boston', 'Houston', 'Chicago', 'Boston', 'Houston']})\\\\n\\\\nsource_df_1.head().to_markdown()\\\"\\n}\"\n",
      "              }\n",
      "            }\n",
      "          }\n",
      "        }\n",
      "      }\n",
      "    ]\n",
      "  ],\n",
      "  \"llm_output\": {\n",
      "    \"token_usage\": {\n",
      "      \"prompt_tokens\": 932,\n",
      "      \"completion_tokens\": 599,\n",
      "      \"total_tokens\": 1531\n",
      "    },\n",
      "    \"model_name\": \"gpt-3.5-turbo-0613\"\n",
      "  },\n",
      "  \"run\": null\n",
      "}\n",
      "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl] Entering Tool run with input:\n",
      "\u001b[0m\"{'query': \"import pandas as pd\\n\\nsource_df_1 = pd.DataFrame({'case_date': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'lastname': ['Kim', 'Lee', 'Smith', 'Patel', 'Ivanov'], 'firstname': ['Miguel', 'John', 'Dmitri', 'Dmitri', 'Jane'], 'case_type': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'case_id': ['CR-1095', 'CR-8597', 'CR-6833', 'CR-2899', 'CR-5997'], 'court_fee': [100, 150, 200, 100, 200], 'jurisdiction': ['BOSTON', 'houston', 'chicago', 'BOSTON', 'houston']})\\n\\nsource_df_2 = pd.DataFrame({'Date_of_Case': ['2023/05/12', '2023/04/20', '2023/02/10', '2023/03/16', '2023/06/15'], 'Fee': [100, 150, 200, 100, 200], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseNumber': ['CASE-8206', 'CASE-4328', 'CASE-1915', 'CASE-4283', 'CASE-7732'], 'CaseKind': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'Location': ['BOST', 'HOUST', 'CHIC', 'BOSTO', 'HOUST']})\\n\\ntemplate_df = pd.DataFrame({'CaseDate': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseType': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'CaseID': ['CASE-6761', 'CASE-6089', 'CASE-9565', 'CASE-6222', 'CASE-2702'], 'Fee': [100, 150, 200, 100, 200], 'Jurisdiction': ['Boston', 'Houston', 'Chicago', 'Boston', 'Houston']})\\n\\nsource_df_1.head().to_markdown()\"}\"\n",
      "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl] [7ms] Exiting Tool run with output:\n",
      "\u001b[0m\"|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\n",
      "|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n",
      "|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\n",
      "|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\n",
      "|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\n",
      "|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\n",
      "|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\"\n",
      "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] Entering LLM run with input:\n",
      "\u001b[0m{\n",
      "  \"prompts\": [\n",
      "    \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\n</source_df_1>\\n\\n<source_df_2>\\n|    | Date_of_Case   |   Fee | FullName     | CaseNumber   | CaseKind   | Location   |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n|  0 | 2023/05/12     |   100 | Miguel Kim   | CASE-8206    | Civil      | BOST       |\\n|  1 | 2023/04/20     |   150 | John Lee     | CASE-4328    | Criminl    | HOUST      |\\n|  2 | 2023/02/10     |   200 | Dmitri Smith | CASE-1915    | Criminal   | CHIC       |\\n|  3 | 2023/03/16     |   100 | Dmitri Patel | CASE-4283    | Criminal   | BOSTO      |\\n|  4 | 2023/06/15     |   200 | Jane Ivanov  | CASE-7732    | Family     | HOUST      |\\n</source_df_2>\\n\\n<template_df>\\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n|  0 | 2023-05-12 | Miguel Kim   | Civil      | CASE-6761 |   100 | Boston         |\\n|  1 | 2023-04-20 | John Lee     | Criminl    | CASE-6089 |   150 | Houston        |\\n|  2 | 2023-02-10 | Dmitri Smith | Criminal   | CASE-9565 |   200 | Chicago        |\\n|  3 | 2023-03-16 | Dmitri Patel | Criminal   | CASE-6222 |   100 | Boston         |\\n|  4 | 2023-06-15 | Jane Ivanov  | Family     | CASE-2702 |   200 | Houston        |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. 
You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n  \\\"query\\\": \\\"import pandas as pd\\\\\\\\n\\\\\\\\nsource_df_1 = pd.DataFrame({\\\\'case_date\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'lastname\\\\': [\\\\'Kim\\\\', \\\\'Lee\\\\', \\\\'Smith\\\\', \\\\'Patel\\\\', \\\\'Ivanov\\\\'], \\\\'firstname\\\\': [\\\\'Miguel\\\\', \\\\'John\\\\', \\\\'Dmitri\\\\', \\\\'Dmitri\\\\', \\\\'Jane\\\\'], \\\\'case_type\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'case_id\\\\': [\\\\'CR-1095\\\\', \\\\'CR-8597\\\\', \\\\'CR-6833\\\\', \\\\'CR-2899\\\\', \\\\'CR-5997\\\\'], \\\\'court_fee\\\\': [100, 150, 200, 100, 200], \\\\'jurisdiction\\\\': [\\\\'BOSTON\\\\', \\\\'houston\\\\', \\\\'chicago\\\\', \\\\'BOSTON\\\\', \\\\'houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_2 = pd.DataFrame({\\\\'Date_of_Case\\\\': [\\\\'2023/05/12\\\\', \\\\'2023/04/20\\\\', \\\\'2023/02/10\\\\', \\\\'2023/03/16\\\\', \\\\'2023/06/15\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseNumber\\\\': [\\\\'CASE-8206\\\\', \\\\'CASE-4328\\\\', \\\\'CASE-1915\\\\', \\\\'CASE-4283\\\\', \\\\'CASE-7732\\\\'], \\\\'CaseKind\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'Location\\\\': [\\\\'BOST\\\\', \\\\'HOUST\\\\', \\\\'CHIC\\\\', \\\\'BOSTO\\\\', \\\\'HOUST\\\\']})\\\\\\\\n\\\\\\\\ntemplate_df = pd.DataFrame({\\\\'CaseDate\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseType\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'CaseID\\\\': [\\\\'CASE-6761\\\\', \\\\'CASE-6089\\\\', \\\\'CASE-9565\\\\', \\\\'CASE-6222\\\\', \\\\'CASE-2702\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'Jurisdiction\\\\': [\\\\'Boston\\\\', \\\\'Houston\\\\', \\\\'Chicago\\\\', \\\\'Boston\\\\', \\\\'Houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_1.head().to_markdown()\\\"\\\\n}'}\\nFunction: |    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\nHuman: What are the key differences between the dataframe schemas?\"\n",
      "  ]\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] [1.18s] Exiting LLM run with output:\n",
      "\u001b[0m{\n",
      "  \"generations\": [\n",
      "    [\n",
      "      {\n",
      "        \"text\": \"\",\n",
      "        \"generation_info\": {\n",
      "          \"finish_reason\": \"function_call\"\n",
      "        },\n",
      "        \"message\": {\n",
      "          \"lc\": 1,\n",
      "          \"type\": \"constructor\",\n",
      "          \"id\": [\n",
      "            \"langchain\",\n",
      "            \"schema\",\n",
      "            \"messages\",\n",
      "            \"AIMessage\"\n",
      "          ],\n",
      "          \"kwargs\": {\n",
      "            \"content\": \"\",\n",
      "            \"additional_kwargs\": {\n",
      "              \"function_call\": {\n",
      "                \"name\": \"python_repl\",\n",
      "                \"arguments\": \"{\\n  \\\"query\\\": \\\"set(source_df_1.columns) - set(template_df.columns)\\\"\\n}\"\n",
      "              }\n",
      "            }\n",
      "          }\n",
      "        }\n",
      "      }\n",
      "    ]\n",
      "  ],\n",
      "  \"llm_output\": {\n",
      "    \"token_usage\": {\n",
      "      \"prompt_tokens\": 1784,\n",
      "      \"completion_tokens\": 27,\n",
      "      \"total_tokens\": 1811\n",
      "    },\n",
      "    \"model_name\": \"gpt-3.5-turbo-0613\"\n",
      "  },\n",
      "  \"run\": null\n",
      "}\n",
      "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl] Entering Tool run with input:\n",
      "\u001b[0m\"{'query': 'set(source_df_1.columns) - set(template_df.columns)'}\"\n",
      "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl] [0ms] Exiting Tool run with output:\n",
      "\u001b[0m\"{'case_id', 'firstname', 'court_fee', 'case_type', 'lastname', 'case_date', 'jurisdiction'}\"\n",
      "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] Entering LLM run with input:\n",
      "\u001b[0m{\n",
      "  \"prompts\": [\n",
      "    \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\n</source_df_1>\\n\\n<source_df_2>\\n|    | Date_of_Case   |   Fee | FullName     | CaseNumber   | CaseKind   | Location   |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n|  0 | 2023/05/12     |   100 | Miguel Kim   | CASE-8206    | Civil      | BOST       |\\n|  1 | 2023/04/20     |   150 | John Lee     | CASE-4328    | Criminl    | HOUST      |\\n|  2 | 2023/02/10     |   200 | Dmitri Smith | CASE-1915    | Criminal   | CHIC       |\\n|  3 | 2023/03/16     |   100 | Dmitri Patel | CASE-4283    | Criminal   | BOSTO      |\\n|  4 | 2023/06/15     |   200 | Jane Ivanov  | CASE-7732    | Family     | HOUST      |\\n</source_df_2>\\n\\n<template_df>\\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n|  0 | 2023-05-12 | Miguel Kim   | Civil      | CASE-6761 |   100 | Boston         |\\n|  1 | 2023-04-20 | John Lee     | Criminl    | CASE-6089 |   150 | Houston        |\\n|  2 | 2023-02-10 | Dmitri Smith | Criminal   | CASE-9565 |   200 | Chicago        |\\n|  3 | 2023-03-16 | Dmitri Patel | Criminal   | CASE-6222 |   100 | Boston         |\\n|  4 | 2023-06-15 | Jane Ivanov  | Family     | CASE-2702 |   200 | Houston        |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. 
You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n  \\\"query\\\": \\\"import pandas as pd\\\\\\\\n\\\\\\\\nsource_df_1 = pd.DataFrame({\\\\'case_date\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'lastname\\\\': [\\\\'Kim\\\\', \\\\'Lee\\\\', \\\\'Smith\\\\', \\\\'Patel\\\\', \\\\'Ivanov\\\\'], \\\\'firstname\\\\': [\\\\'Miguel\\\\', \\\\'John\\\\', \\\\'Dmitri\\\\', \\\\'Dmitri\\\\', \\\\'Jane\\\\'], \\\\'case_type\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'case_id\\\\': [\\\\'CR-1095\\\\', \\\\'CR-8597\\\\', \\\\'CR-6833\\\\', \\\\'CR-2899\\\\', \\\\'CR-5997\\\\'], \\\\'court_fee\\\\': [100, 150, 200, 100, 200], \\\\'jurisdiction\\\\': [\\\\'BOSTON\\\\', \\\\'houston\\\\', \\\\'chicago\\\\', \\\\'BOSTON\\\\', \\\\'houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_2 = pd.DataFrame({\\\\'Date_of_Case\\\\': [\\\\'2023/05/12\\\\', \\\\'2023/04/20\\\\', \\\\'2023/02/10\\\\', \\\\'2023/03/16\\\\', \\\\'2023/06/15\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseNumber\\\\': [\\\\'CASE-8206\\\\', \\\\'CASE-4328\\\\', \\\\'CASE-1915\\\\', \\\\'CASE-4283\\\\', \\\\'CASE-7732\\\\'], \\\\'CaseKind\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'Location\\\\': [\\\\'BOST\\\\', \\\\'HOUST\\\\', \\\\'CHIC\\\\', \\\\'BOSTO\\\\', \\\\'HOUST\\\\']})\\\\\\\\n\\\\\\\\ntemplate_df = pd.DataFrame({\\\\'CaseDate\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseType\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'CaseID\\\\': [\\\\'CASE-6761\\\\', \\\\'CASE-6089\\\\', \\\\'CASE-9565\\\\', \\\\'CASE-6222\\\\', \\\\'CASE-2702\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'Jurisdiction\\\\': [\\\\'Boston\\\\', \\\\'Houston\\\\', \\\\'Chicago\\\\', \\\\'Boston\\\\', \\\\'Houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_1.head().to_markdown()\\\"\\\\n}'}\\nFunction: |    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n  \\\"query\\\": \\\"set(source_df_1.columns) - set(template_df.columns)\\\"\\\\n}'}\\nFunction: {'case_id', 'firstname', 'court_fee', 'case_type', 'lastname', 'case_date', 
'jurisdiction'}\\nHuman: What are the key differences between the dataframe schemas?\"\n",
      "  ]\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] [8.40s] Exiting LLM run with output:\n",
      "\u001b[0m{\n",
      "  \"generations\": [\n",
      "    [\n",
      "      {\n",
      "        \"text\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\",\n",
      "        \"generation_info\": {\n",
      "          \"finish_reason\": \"stop\"\n",
      "        },\n",
      "        \"message\": {\n",
      "          \"lc\": 1,\n",
      "          \"type\": \"constructor\",\n",
      "          \"id\": [\n",
      "            \"langchain\",\n",
      "            \"schema\",\n",
      "            \"messages\",\n",
      "            \"AIMessage\"\n",
      "          ],\n",
      "          \"kwargs\": {\n",
      "            \"content\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\",\n",
      "            \"additional_kwargs\": {}\n",
      "          }\n",
      "        }\n",
      "      }\n",
      "    ]\n",
      "  ],\n",
      "  \"llm_output\": {\n",
      "    \"token_usage\": {\n",
      "      \"prompt_tokens\": 1846,\n",
      "      \"completion_tokens\": 271,\n",
      "      \"total_tokens\": 2117\n",
      "    },\n",
      "    \"model_name\": \"gpt-3.5-turbo-0613\"\n",
      "  },\n",
      "  \"run\": null\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor] [26.19s] Exiting Chain run with output:\n",
      "\u001b[0m{\n",
      "  \"output\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "question = \"What are the key differences between the dataframe schemas?\"\n",
    "res = agent_executor.run(input=question, chat_history=memory.chat_memory.messages)\n",
    "memory.chat_memory.add_user_message(question)\n",
    "memory.chat_memory.add_ai_message(res)"
   ]
  },
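  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Not part of the original run: a minimal sketch of the source_df_1 -> template_df\n",
    "# mapping that the agent's analysis points toward. The helper name\n",
    "# `map_source_df_1_to_template` is hypothetical, and the CaseID format difference\n",
    "# (CR-xxxx vs CASE-xxxx) is flagged but not resolved here.\n",
    "def map_source_df_1_to_template(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    return pd.DataFrame({\n",
    "        \"CaseDate\": df[\"case_date\"],\n",
    "        \"FullName\": df[\"firstname\"] + \" \" + df[\"lastname\"],\n",
    "        \"CaseType\": df[\"case_type\"],\n",
    "        \"CaseID\": df[\"case_id\"],  # prefix differs from the template's CASE- IDs\n",
    "        \"Fee\": df[\"court_fee\"],\n",
    "        \"Jurisdiction\": df[\"jurisdiction\"].str.title(),\n",
    "    })\n",
    "\n",
    "map_source_df_1_to_template(entries_a_df).head(NUM_ROWS_IN_HEAD)"
   ]
  },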
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_differences_between_dataframes\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}