{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import langchain\n",
    "from langchain.agents import OpenAIFunctionsAgent, AgentExecutor\n",
    "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain.tools import PythonAstREPLTool\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from pydantic import BaseModel, Field\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "langchain.debug = True\n",
    "data_dir_path = os.path.join(os.getcwd())\n",
    "pd.set_option('display.max_rows', 20)\n",
    "pd.set_option('display.max_columns', 20)\n",
    "\n",
    "NUM_ROWS_IN_HEAD = 5\n",
    "\n",
    "# {dataframe_heads_str}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT_TEMPLATE = \"\"\"You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\n",
    "The ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\n",
    "\n",
    "This is the result of running `df.head().to_markdown()` on each of the dataframes:\n",
    "\n",
    "{dataframe_heads_str}\n",
    "You can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\n",
    "\n",
    "Please work step by step through this process. You can make intermediate queries, validate your logic, and then move on to the next step.\n",
    "\n",
    "Be precise, analytical, thorough.\n",
    "\n",
    "Here is a history of the conversation with the user so far:\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PythonInputs(BaseModel):\n",
    "    query: str = Field(description=\"code snippet to run\")\n",
    "\n",
    "format_df_for_prompt = lambda df: f'<df>\\n{df.head(NUM_ROWS_IN_HEAD).to_markdown()}\\n</df>'\n",
    "\n",
    "entries_a_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))\n",
    "entries_b_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))\n",
    "template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))\n",
    "\n",
    "df_name_to_df_map = {\"source_df_1\": entries_a_df, \"source_df_2\": entries_b_df, \"template_df\": template_df}\n",
    "\n",
    "dataframe_heads_str_list: str = []\n",
    "for df_name, df in df_name_to_df_map.items():\n",
    "    dataframe_heads_str_list.append(f'<{df_name}>\\n{df.head(NUM_ROWS_IN_HEAD).to_markdown()}\\n</{df_name}>')\n",
    "\n",
    "prompt_template = PROMPT_TEMPLATE.format(dataframe_heads_str=\"\\n\\n\".join(dataframe_heads_str_list))\n",
    "\n",
    "prompt = ChatPromptTemplate.from_messages([\n",
    "    (\"system\", prompt_template),\n",
    "    MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
    "    (\"human\", \"{input}\")\n",
    "])\n",
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n",
    "\n",
    "repl = PythonAstREPLTool(locals=df_name_to_df_map, name=\"python_repl\",\n",
    "                            description=\"Runs code and returns the output of the final line\",\n",
    "                            args_schema=PythonInputs)\n",
    "tools = [repl]\n",
    "agent = OpenAIFunctionsAgent(llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\"), prompt=prompt, tools=tools, memory=memory, handle_parsing_errors=True)\n",
    "agent_executor = AgentExecutor(agent=agent, tools=tools, max_iterations=5, early_stopping_method=\"generate\", handle_parsing_errors=True)"
   ]
  },
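  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check, not part of the original run: confirm the REPL tool can see the\n",
    "# dataframes passed in via `locals`. This assumes the standard LangChain tool\n",
    "# interface, where `repl.run(...)` executes the snippet and returns the value of\n",
    "# the final expression as a string. Column names only, so it stays within the\n",
    "# prompt's 5-row guideline.\n",
    "for df_name in df_name_to_df_map:\n",
    "    print(df_name, repl.run(f\"list({df_name}.columns)\"))"
   ]
  },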
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor] Entering Chain run with input:\n",
      "\u001b[0m{\n",
      "  \"input\": \"What are the key differences between the dataframe schemas?\",\n",
      "  \"chat_history\": []\n",
      "}\n",
      "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:\n",
      "\u001b[0m{\n",
      "  \"prompts\": [\n",
      "    \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\n</source_df_1>\\n\\n<source_df_2>\\n|    | Date_of_Case   |   Fee | FullName     | CaseNumber   | CaseKind   | Location   |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n|  0 | 2023/05/12     |   100 | Miguel Kim   | CASE-8206    | Civil      | BOST       |\\n|  1 | 2023/04/20     |   150 | John Lee     | CASE-4328    | Criminl    | HOUST      |\\n|  2 | 2023/02/10     |   200 | Dmitri Smith | CASE-1915    | Criminal   | CHIC       |\\n|  3 | 2023/03/16     |   100 | Dmitri Patel | CASE-4283    | Criminal   | BOSTO      |\\n|  4 | 2023/06/15     |   200 | Jane Ivanov  | CASE-7732    | Family     | HOUST      |\\n</source_df_2>\\n\\n<template_df>\\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n|  0 | 2023-05-12 | Miguel Kim   | Civil      | CASE-6761 |   100 | Boston         |\\n|  1 | 2023-04-20 | John Lee     | Criminl    | CASE-6089 |   150 | Houston        |\\n|  2 | 2023-02-10 | Dmitri Smith | Criminal   | CASE-9565 |   200 | Chicago        |\\n|  3 | 2023-03-16 | Dmitri Patel | Criminal   | CASE-6222 |   100 | Boston         |\\n|  4 | 2023-06-15 | Jane Ivanov  | Family     | CASE-2702 |   200 | Houston        |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nHuman: What are the key differences between the dataframe schemas?\"\n",
      "  ]\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [16.60s] Exiting LLM run with output:\n",
      "\u001b[0m{\n",
      "  \"generations\": [\n",
      "    [\n",
      "      {\n",
      "        \"text\": \"\",\n",
      "        \"generation_info\": {\n",
      "          \"finish_reason\": \"function_call\"\n",
      "        },\n",
      "        \"message\": {\n",
      "          \"lc\": 1,\n",
      "          \"type\": \"constructor\",\n",
      "          \"id\": [\n",
      "            \"langchain\",\n",
      "            \"schema\",\n",
      "            \"messages\",\n",
      "            \"AIMessage\"\n",
      "          ],\n",
      "          \"kwargs\": {\n",
      "            \"content\": \"\",\n",
      "            \"additional_kwargs\": {\n",
      "              \"function_call\": {\n",
      "                \"name\": \"python_repl\",\n",
      "                \"arguments\": \"{\\n  \\\"query\\\": \\\"import pandas as pd\\\\n\\\\nsource_df_1 = pd.DataFrame({'case_date': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'lastname': ['Kim', 'Lee', 'Smith', 'Patel', 'Ivanov'], 'firstname': ['Miguel', 'John', 'Dmitri', 'Dmitri', 'Jane'], 'case_type': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'case_id': ['CR-1095', 'CR-8597', 'CR-6833', 'CR-2899', 'CR-5997'], 'court_fee': [100, 150, 200, 100, 200], 'jurisdiction': ['BOSTON', 'houston', 'chicago', 'BOSTON', 'houston']})\\\\n\\\\nsource_df_2 = pd.DataFrame({'Date_of_Case': ['2023/05/12', '2023/04/20', '2023/02/10', '2023/03/16', '2023/06/15'], 'Fee': [100, 150, 200, 100, 200], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseNumber': ['CASE-8206', 'CASE-4328', 'CASE-1915', 'CASE-4283', 'CASE-7732'], 'CaseKind': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'Location': ['BOST', 'HOUST', 'CHIC', 'BOSTO', 'HOUST']})\\\\n\\\\ntemplate_df = pd.DataFrame({'CaseDate': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseType': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'CaseID': ['CASE-6761', 'CASE-6089', 'CASE-9565', 'CASE-6222', 'CASE-2702'], 'Fee': [100, 150, 200, 100, 200], 'Jurisdiction': ['Boston', 'Houston', 'Chicago', 'Boston', 'Houston']})\\\\n\\\\nsource_df_1.head().to_markdown()\\\"\\n}\"\n",
      "              }\n",
      "            }\n",
      "          }\n",
      "        }\n",
      "      }\n",
      "    ]\n",
      "  ],\n",
      "  \"llm_output\": {\n",
      "    \"token_usage\": {\n",
      "      \"prompt_tokens\": 932,\n",
      "      \"completion_tokens\": 599,\n",
      "      \"total_tokens\": 1531\n",
      "    },\n",
      "    \"model_name\": \"gpt-3.5-turbo-0613\"\n",
      "  },\n",
      "  \"run\": null\n",
      "}\n",
      "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl] Entering Tool run with input:\n",
      "\u001b[0m\"{'query': \"import pandas as pd\\n\\nsource_df_1 = pd.DataFrame({'case_date': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'lastname': ['Kim', 'Lee', 'Smith', 'Patel', 'Ivanov'], 'firstname': ['Miguel', 'John', 'Dmitri', 'Dmitri', 'Jane'], 'case_type': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'case_id': ['CR-1095', 'CR-8597', 'CR-6833', 'CR-2899', 'CR-5997'], 'court_fee': [100, 150, 200, 100, 200], 'jurisdiction': ['BOSTON', 'houston', 'chicago', 'BOSTON', 'houston']})\\n\\nsource_df_2 = pd.DataFrame({'Date_of_Case': ['2023/05/12', '2023/04/20', '2023/02/10', '2023/03/16', '2023/06/15'], 'Fee': [100, 150, 200, 100, 200], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseNumber': ['CASE-8206', 'CASE-4328', 'CASE-1915', 'CASE-4283', 'CASE-7732'], 'CaseKind': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'Location': ['BOST', 'HOUST', 'CHIC', 'BOSTO', 'HOUST']})\\n\\ntemplate_df = pd.DataFrame({'CaseDate': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseType': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'CaseID': ['CASE-6761', 'CASE-6089', 'CASE-9565', 'CASE-6222', 'CASE-2702'], 'Fee': [100, 150, 200, 100, 200], 'Jurisdiction': ['Boston', 'Houston', 'Chicago', 'Boston', 'Houston']})\\n\\nsource_df_1.head().to_markdown()\"}\"\n",
      "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl] [7ms] Exiting Tool run with output:\n",
      "\u001b[0m\"|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\n",
      "|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n",
      "|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\n",
      "|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\n",
      "|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\n",
      "|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\n",
      "|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\"\n",
      "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] Entering LLM run with input:\n",
      "\u001b[0m{\n",
      "  \"prompts\": [\n",
      "    \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\n</source_df_1>\\n\\n<source_df_2>\\n|    | Date_of_Case   |   Fee | FullName     | CaseNumber   | CaseKind   | Location   |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n|  0 | 2023/05/12     |   100 | Miguel Kim   | CASE-8206    | Civil      | BOST       |\\n|  1 | 2023/04/20     |   150 | John Lee     | CASE-4328    | Criminl    | HOUST      |\\n|  2 | 2023/02/10     |   200 | Dmitri Smith | CASE-1915    | Criminal   | CHIC       |\\n|  3 | 2023/03/16     |   100 | Dmitri Patel | CASE-4283    | Criminal   | BOSTO      |\\n|  4 | 2023/06/15     |   200 | Jane Ivanov  | CASE-7732    | Family     | HOUST      |\\n</source_df_2>\\n\\n<template_df>\\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n|  0 | 2023-05-12 | Miguel Kim   | Civil      | CASE-6761 |   100 | Boston         |\\n|  1 | 2023-04-20 | John Lee     | Criminl    | CASE-6089 |   150 | Houston        |\\n|  2 | 2023-02-10 | Dmitri Smith | Criminal   | CASE-9565 |   200 | Chicago        |\\n|  3 | 2023-03-16 | Dmitri Patel | Criminal   | CASE-6222 |   100 | Boston         |\\n|  4 | 2023-06-15 | Jane Ivanov  | Family     | CASE-2702 |   200 | Houston        |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. 
You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n  \\\"query\\\": \\\"import pandas as pd\\\\\\\\n\\\\\\\\nsource_df_1 = pd.DataFrame({\\\\'case_date\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'lastname\\\\': [\\\\'Kim\\\\', \\\\'Lee\\\\', \\\\'Smith\\\\', \\\\'Patel\\\\', \\\\'Ivanov\\\\'], \\\\'firstname\\\\': [\\\\'Miguel\\\\', \\\\'John\\\\', \\\\'Dmitri\\\\', \\\\'Dmitri\\\\', \\\\'Jane\\\\'], \\\\'case_type\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'case_id\\\\': [\\\\'CR-1095\\\\', \\\\'CR-8597\\\\', \\\\'CR-6833\\\\', \\\\'CR-2899\\\\', \\\\'CR-5997\\\\'], \\\\'court_fee\\\\': [100, 150, 200, 100, 200], \\\\'jurisdiction\\\\': [\\\\'BOSTON\\\\', \\\\'houston\\\\', \\\\'chicago\\\\', \\\\'BOSTON\\\\', \\\\'houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_2 = pd.DataFrame({\\\\'Date_of_Case\\\\': [\\\\'2023/05/12\\\\', \\\\'2023/04/20\\\\', \\\\'2023/02/10\\\\', \\\\'2023/03/16\\\\', \\\\'2023/06/15\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseNumber\\\\': [\\\\'CASE-8206\\\\', \\\\'CASE-4328\\\\', \\\\'CASE-1915\\\\', \\\\'CASE-4283\\\\', \\\\'CASE-7732\\\\'], \\\\'CaseKind\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'Location\\\\': [\\\\'BOST\\\\', \\\\'HOUST\\\\', \\\\'CHIC\\\\', \\\\'BOSTO\\\\', \\\\'HOUST\\\\']})\\\\\\\\n\\\\\\\\ntemplate_df = pd.DataFrame({\\\\'CaseDate\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseType\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'CaseID\\\\': [\\\\'CASE-6761\\\\', \\\\'CASE-6089\\\\', \\\\'CASE-9565\\\\', \\\\'CASE-6222\\\\', \\\\'CASE-2702\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'Jurisdiction\\\\': [\\\\'Boston\\\\', \\\\'Houston\\\\', \\\\'Chicago\\\\', \\\\'Boston\\\\', \\\\'Houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_1.head().to_markdown()\\\"\\\\n}'}\\nFunction: |    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\nHuman: What are the key differences between the dataframe schemas?\"\n",
      "  ]\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] [1.18s] Exiting LLM run with output:\n",
      "\u001b[0m{\n",
      "  \"generations\": [\n",
      "    [\n",
      "      {\n",
      "        \"text\": \"\",\n",
      "        \"generation_info\": {\n",
      "          \"finish_reason\": \"function_call\"\n",
      "        },\n",
      "        \"message\": {\n",
      "          \"lc\": 1,\n",
      "          \"type\": \"constructor\",\n",
      "          \"id\": [\n",
      "            \"langchain\",\n",
      "            \"schema\",\n",
      "            \"messages\",\n",
      "            \"AIMessage\"\n",
      "          ],\n",
      "          \"kwargs\": {\n",
      "            \"content\": \"\",\n",
      "            \"additional_kwargs\": {\n",
      "              \"function_call\": {\n",
      "                \"name\": \"python_repl\",\n",
      "                \"arguments\": \"{\\n  \\\"query\\\": \\\"set(source_df_1.columns) - set(template_df.columns)\\\"\\n}\"\n",
      "              }\n",
      "            }\n",
      "          }\n",
      "        }\n",
      "      }\n",
      "    ]\n",
      "  ],\n",
      "  \"llm_output\": {\n",
      "    \"token_usage\": {\n",
      "      \"prompt_tokens\": 1784,\n",
      "      \"completion_tokens\": 27,\n",
      "      \"total_tokens\": 1811\n",
      "    },\n",
      "    \"model_name\": \"gpt-3.5-turbo-0613\"\n",
      "  },\n",
      "  \"run\": null\n",
      "}\n",
      "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl] Entering Tool run with input:\n",
      "\u001b[0m\"{'query': 'set(source_df_1.columns) - set(template_df.columns)'}\"\n",
      "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl] [0ms] Exiting Tool run with output:\n",
      "\u001b[0m\"{'case_id', 'firstname', 'court_fee', 'case_type', 'lastname', 'case_date', 'jurisdiction'}\"\n",
      "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] Entering LLM run with input:\n",
      "\u001b[0m{\n",
      "  \"prompts\": [\n",
      "    \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\n</source_df_1>\\n\\n<source_df_2>\\n|    | Date_of_Case   |   Fee | FullName     | CaseNumber   | CaseKind   | Location   |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n|  0 | 2023/05/12     |   100 | Miguel Kim   | CASE-8206    | Civil      | BOST       |\\n|  1 | 2023/04/20     |   150 | John Lee     | CASE-4328    | Criminl    | HOUST      |\\n|  2 | 2023/02/10     |   200 | Dmitri Smith | CASE-1915    | Criminal   | CHIC       |\\n|  3 | 2023/03/16     |   100 | Dmitri Patel | CASE-4283    | Criminal   | BOSTO      |\\n|  4 | 2023/06/15     |   200 | Jane Ivanov  | CASE-7732    | Family     | HOUST      |\\n</source_df_2>\\n\\n<template_df>\\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n|  0 | 2023-05-12 | Miguel Kim   | Civil      | CASE-6761 |   100 | Boston         |\\n|  1 | 2023-04-20 | John Lee     | Criminl    | CASE-6089 |   150 | Houston        |\\n|  2 | 2023-02-10 | Dmitri Smith | Criminal   | CASE-9565 |   200 | Chicago        |\\n|  3 | 2023-03-16 | Dmitri Patel | Criminal   | CASE-6222 |   100 | Boston         |\\n|  4 | 2023-06-15 | Jane Ivanov  | Family     | CASE-2702 |   200 | Houston        |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. 
You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n  \\\"query\\\": \\\"import pandas as pd\\\\\\\\n\\\\\\\\nsource_df_1 = pd.DataFrame({\\\\'case_date\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'lastname\\\\': [\\\\'Kim\\\\', \\\\'Lee\\\\', \\\\'Smith\\\\', \\\\'Patel\\\\', \\\\'Ivanov\\\\'], \\\\'firstname\\\\': [\\\\'Miguel\\\\', \\\\'John\\\\', \\\\'Dmitri\\\\', \\\\'Dmitri\\\\', \\\\'Jane\\\\'], \\\\'case_type\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'case_id\\\\': [\\\\'CR-1095\\\\', \\\\'CR-8597\\\\', \\\\'CR-6833\\\\', \\\\'CR-2899\\\\', \\\\'CR-5997\\\\'], \\\\'court_fee\\\\': [100, 150, 200, 100, 200], \\\\'jurisdiction\\\\': [\\\\'BOSTON\\\\', \\\\'houston\\\\', \\\\'chicago\\\\', \\\\'BOSTON\\\\', \\\\'houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_2 = pd.DataFrame({\\\\'Date_of_Case\\\\': [\\\\'2023/05/12\\\\', \\\\'2023/04/20\\\\', \\\\'2023/02/10\\\\', \\\\'2023/03/16\\\\', \\\\'2023/06/15\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseNumber\\\\': [\\\\'CASE-8206\\\\', \\\\'CASE-4328\\\\', \\\\'CASE-1915\\\\', \\\\'CASE-4283\\\\', \\\\'CASE-7732\\\\'], \\\\'CaseKind\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'Location\\\\': [\\\\'BOST\\\\', \\\\'HOUST\\\\', \\\\'CHIC\\\\', \\\\'BOSTO\\\\', \\\\'HOUST\\\\']})\\\\\\\\n\\\\\\\\ntemplate_df = pd.DataFrame({\\\\'CaseDate\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseType\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'CaseID\\\\': [\\\\'CASE-6761\\\\', \\\\'CASE-6089\\\\', \\\\'CASE-9565\\\\', \\\\'CASE-6222\\\\', \\\\'CASE-2702\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'Jurisdiction\\\\': [\\\\'Boston\\\\', \\\\'Houston\\\\', \\\\'Chicago\\\\', \\\\'Boston\\\\', \\\\'Houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_1.head().to_markdown()\\\"\\\\n}'}\\nFunction: |    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n  \\\"query\\\": \\\"set(source_df_1.columns) - set(template_df.columns)\\\"\\\\n}'}\\nFunction: {'case_id', 'firstname', 'court_fee', 'case_type', 'lastname', 'case_date', 
'jurisdiction'}\\nHuman: What are the key differences between the dataframe schemas?\"\n",
      "  ]\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] [8.40s] Exiting LLM run with output:\n",
      "\u001b[0m{\n",
      "  \"generations\": [\n",
      "    [\n",
      "      {\n",
      "        \"text\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\",\n",
      "        \"generation_info\": {\n",
      "          \"finish_reason\": \"stop\"\n",
      "        },\n",
      "        \"message\": {\n",
      "          \"lc\": 1,\n",
      "          \"type\": \"constructor\",\n",
      "          \"id\": [\n",
      "            \"langchain\",\n",
      "            \"schema\",\n",
      "            \"messages\",\n",
      "            \"AIMessage\"\n",
      "          ],\n",
      "          \"kwargs\": {\n",
      "            \"content\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\",\n",
      "            \"additional_kwargs\": {}\n",
      "          }\n",
      "        }\n",
      "      }\n",
      "    ]\n",
      "  ],\n",
      "  \"llm_output\": {\n",
      "    \"token_usage\": {\n",
      "      \"prompt_tokens\": 1846,\n",
      "      \"completion_tokens\": 271,\n",
      "      \"total_tokens\": 2117\n",
      "    },\n",
      "    \"model_name\": \"gpt-3.5-turbo-0613\"\n",
      "  },\n",
      "  \"run\": null\n",
      "}\n",
      "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor] [26.19s] Exiting Chain run with output:\n",
      "\u001b[0m{\n",
      "  \"output\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "question = \"What are the key differences between the dataframe schemas?\"\n",
    "res = agent_executor.run(input=question, chat_history=memory.chat_memory.messages)\n",
    "memory.chat_memory.add_user_message(question)\n",
    "memory.chat_memory.add_ai_message(res)"
   ]
  },
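  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Not part of the original run: a minimal sketch of the source_df_1 -> template_df\n",
    "# mapping that the agent's analysis points toward. The helper name\n",
    "# `map_source_df_1_to_template` is hypothetical, and the CaseID format difference\n",
    "# (CR-xxxx vs CASE-xxxx) is flagged but not resolved here.\n",
    "def map_source_df_1_to_template(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    return pd.DataFrame({\n",
    "        \"CaseDate\": df[\"case_date\"],\n",
    "        \"FullName\": df[\"firstname\"] + \" \" + df[\"lastname\"],\n",
    "        \"CaseType\": df[\"case_type\"],\n",
    "        \"CaseID\": df[\"case_id\"],  # prefix differs from the template's CASE- IDs\n",
    "        \"Fee\": df[\"court_fee\"],\n",
    "        \"Jurisdiction\": df[\"jurisdiction\"].str.title(),\n",
    "    })\n",
    "\n",
    "map_source_df_1_to_template(entries_a_df).head(NUM_ROWS_IN_HEAD)"
   ]
  },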
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_differences_between_dataframes\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}