{
"cells": [
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import langchain\n",
"from langchain.agents import OpenAIFunctionsAgent, AgentExecutor\n",
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
"from langchain.tools import PythonAstREPLTool\n",
"from langchain.chat_models import ChatOpenAI\n",
"from pydantic import BaseModel, Field\n",
"from langchain.memory import ConversationBufferMemory\n",
"from dotenv import load_dotenv\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"langchain.debug = True\n",
"data_dir_path = os.path.join(os.getcwd())\n",
"pd.set_option('display.max_rows', 20)\n",
"pd.set_option('display.max_columns', 20)\n",
"\n",
"NUM_ROWS_IN_HEAD = 5\n",
"\n",
"# {dataframe_heads_str}"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"PROMPT_TEMPLATE = \"\"\"You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\n",
"The ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\n",
"\n",
"This is the result of running `df.head().to_markdown()` on each of the dataframes:\n",
"\n",
"{dataframe_heads_str}\n",
"You can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\n",
"\n",
"Please work step by step through this process. You can make intermediate queries, validate your logic, and then move on to the next step.\n",
"\n",
"Be precise, analytical, thorough.\n",
"\n",
"Here is a history of the conversation with the user so far:\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"class PythonInputs(BaseModel):\n",
" query: str = Field(description=\"code snippet to run\")\n",
"\n",
"format_df_for_prompt = lambda df: f'<df>\\n{df.head(NUM_ROWS_IN_HEAD).to_markdown()}\\n</df>'\n",
"\n",
"entries_a_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))\n",
"entries_b_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))\n",
"template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))\n",
"\n",
"df_name_to_df_map = {\"source_df_1\": entries_a_df, \"source_df_2\": entries_b_df, \"template_df\": template_df}\n",
"\n",
"dataframe_heads_str_list: str = []\n",
"for df_name, df in df_name_to_df_map.items():\n",
" dataframe_heads_str_list.append(f'<{df_name}>\\n{df.head(NUM_ROWS_IN_HEAD).to_markdown()}\\n</{df_name}>')\n",
"\n",
"prompt_template = PROMPT_TEMPLATE.format(dataframe_heads_str=\"\\n\\n\".join(dataframe_heads_str_list))\n",
"\n",
"prompt = ChatPromptTemplate.from_messages([\n",
" (\"system\", prompt_template),\n",
" MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
" (\"human\", \"{input}\")\n",
"])\n",
"memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n",
"\n",
"repl = PythonAstREPLTool(locals=df_name_to_df_map, name=\"python_repl\",\n",
" description=\"Runs code and returns the output of the final line\",\n",
" args_schema=PythonInputs)\n",
"tools = [repl]\n",
"agent = OpenAIFunctionsAgent(llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\"), prompt=prompt, tools=tools, memory=memory, handle_parsing_errors=True)\n",
"agent_executor = AgentExecutor(agent=agent, tools=tools, max_iterations=5, early_stopping_method=\"generate\", handle_parsing_errors=True)"
]
},
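{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (a minimal sketch, not part of the original run). Because the\n",
"# PythonAstREPLTool above was created with locals=df_name_to_df_map, code generated by\n",
"# the agent can refer to source_df_1, source_df_2 and template_df by name. Invoking the\n",
"# tool by hand is a quick way to confirm that wiring before calling the agent.\n",
"print(repl.run('sorted(template_df.columns)'))\n",
"print(repl.run('(source_df_1.shape, source_df_2.shape, template_df.shape)'))"
]
},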
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor] Entering Chain run with input:\n",
"\u001b[0m{\n",
" \"input\": \"What are the key differences between the dataframe schemas?\",\n",
" \"chat_history\": []\n",
"}\n",
"\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:\n",
"\u001b[0m{\n",
" \"prompts\": [\n",
" \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n</source_df_1>\\n\\n<source_df_2>\\n| | Date_of_Case | Fee | FullName | CaseNumber | CaseKind | Location |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n| 0 | 2023/05/12 | 100 | Miguel Kim | CASE-8206 | Civil | BOST |\\n| 1 | 2023/04/20 | 150 | John Lee | CASE-4328 | Criminl | HOUST |\\n| 2 | 2023/02/10 | 200 | Dmitri Smith | CASE-1915 | Criminal | CHIC |\\n| 3 | 2023/03/16 | 100 | Dmitri Patel | CASE-4283 | Criminal | BOSTO |\\n| 4 | 2023/06/15 | 200 | Jane Ivanov | CASE-7732 | Family | HOUST |\\n</source_df_2>\\n\\n<template_df>\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nHuman: What are the key differences between the dataframe schemas?\"\n",
" ]\n",
"}\n",
"\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [16.60s] Exiting LLM run with output:\n",
"\u001b[0m{\n",
" \"generations\": [\n",
" [\n",
" {\n",
" \"text\": \"\",\n",
" \"generation_info\": {\n",
" \"finish_reason\": \"function_call\"\n",
" },\n",
" \"message\": {\n",
" \"lc\": 1,\n",
" \"type\": \"constructor\",\n",
" \"id\": [\n",
" \"langchain\",\n",
" \"schema\",\n",
" \"messages\",\n",
" \"AIMessage\"\n",
" ],\n",
" \"kwargs\": {\n",
" \"content\": \"\",\n",
" \"additional_kwargs\": {\n",
" \"function_call\": {\n",
" \"name\": \"python_repl\",\n",
" \"arguments\": \"{\\n \\\"query\\\": \\\"import pandas as pd\\\\n\\\\nsource_df_1 = pd.DataFrame({'case_date': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'lastname': ['Kim', 'Lee', 'Smith', 'Patel', 'Ivanov'], 'firstname': ['Miguel', 'John', 'Dmitri', 'Dmitri', 'Jane'], 'case_type': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'case_id': ['CR-1095', 'CR-8597', 'CR-6833', 'CR-2899', 'CR-5997'], 'court_fee': [100, 150, 200, 100, 200], 'jurisdiction': ['BOSTON', 'houston', 'chicago', 'BOSTON', 'houston']})\\\\n\\\\nsource_df_2 = pd.DataFrame({'Date_of_Case': ['2023/05/12', '2023/04/20', '2023/02/10', '2023/03/16', '2023/06/15'], 'Fee': [100, 150, 200, 100, 200], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseNumber': ['CASE-8206', 'CASE-4328', 'CASE-1915', 'CASE-4283', 'CASE-7732'], 'CaseKind': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'Location': ['BOST', 'HOUST', 'CHIC', 'BOSTO', 'HOUST']})\\\\n\\\\ntemplate_df = pd.DataFrame({'CaseDate': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseType': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'CaseID': ['CASE-6761', 'CASE-6089', 'CASE-9565', 'CASE-6222', 'CASE-2702'], 'Fee': [100, 150, 200, 100, 200], 'Jurisdiction': ['Boston', 'Houston', 'Chicago', 'Boston', 'Houston']})\\\\n\\\\nsource_df_1.head().to_markdown()\\\"\\n}\"\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
" ]\n",
" ],\n",
" \"llm_output\": {\n",
" \"token_usage\": {\n",
" \"prompt_tokens\": 932,\n",
" \"completion_tokens\": 599,\n",
" \"total_tokens\": 1531\n",
" },\n",
" \"model_name\": \"gpt-3.5-turbo-0613\"\n",
" },\n",
" \"run\": null\n",
"}\n",
"\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl] Entering Tool run with input:\n",
"\u001b[0m\"{'query': \"import pandas as pd\\n\\nsource_df_1 = pd.DataFrame({'case_date': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'lastname': ['Kim', 'Lee', 'Smith', 'Patel', 'Ivanov'], 'firstname': ['Miguel', 'John', 'Dmitri', 'Dmitri', 'Jane'], 'case_type': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'case_id': ['CR-1095', 'CR-8597', 'CR-6833', 'CR-2899', 'CR-5997'], 'court_fee': [100, 150, 200, 100, 200], 'jurisdiction': ['BOSTON', 'houston', 'chicago', 'BOSTON', 'houston']})\\n\\nsource_df_2 = pd.DataFrame({'Date_of_Case': ['2023/05/12', '2023/04/20', '2023/02/10', '2023/03/16', '2023/06/15'], 'Fee': [100, 150, 200, 100, 200], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseNumber': ['CASE-8206', 'CASE-4328', 'CASE-1915', 'CASE-4283', 'CASE-7732'], 'CaseKind': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'Location': ['BOST', 'HOUST', 'CHIC', 'BOSTO', 'HOUST']})\\n\\ntemplate_df = pd.DataFrame({'CaseDate': ['2023-05-12', '2023-04-20', '2023-02-10', '2023-03-16', '2023-06-15'], 'FullName': ['Miguel Kim', 'John Lee', 'Dmitri Smith', 'Dmitri Patel', 'Jane Ivanov'], 'CaseType': ['Civil', 'Criminl', 'Criminal', 'Criminal', 'Family'], 'CaseID': ['CASE-6761', 'CASE-6089', 'CASE-9565', 'CASE-6222', 'CASE-2702'], 'Fee': [100, 150, 200, 100, 200], 'Jurisdiction': ['Boston', 'Houston', 'Chicago', 'Boston', 'Houston']})\\n\\nsource_df_1.head().to_markdown()\"}\"\n",
"\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 3:tool:python_repl] [7ms] Exiting Tool run with output:\n",
"\u001b[0m\"| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\n",
"|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n",
"| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\n",
"| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\n",
"| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\n",
"| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\n",
"| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\"\n",
"\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] Entering LLM run with input:\n",
"\u001b[0m{\n",
" \"prompts\": [\n",
" \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n</source_df_1>\\n\\n<source_df_2>\\n| | Date_of_Case | Fee | FullName | CaseNumber | CaseKind | Location |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n| 0 | 2023/05/12 | 100 | Miguel Kim | CASE-8206 | Civil | BOST |\\n| 1 | 2023/04/20 | 150 | John Lee | CASE-4328 | Criminl | HOUST |\\n| 2 | 2023/02/10 | 200 | Dmitri Smith | CASE-1915 | Criminal | CHIC |\\n| 3 | 2023/03/16 | 100 | Dmitri Patel | CASE-4283 | Criminal | BOSTO |\\n| 4 | 2023/06/15 | 200 | Jane Ivanov | CASE-7732 | Family | HOUST |\\n</source_df_2>\\n\\n<template_df>\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. 
You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n \\\"query\\\": \\\"import pandas as pd\\\\\\\\n\\\\\\\\nsource_df_1 = pd.DataFrame({\\\\'case_date\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'lastname\\\\': [\\\\'Kim\\\\', \\\\'Lee\\\\', \\\\'Smith\\\\', \\\\'Patel\\\\', \\\\'Ivanov\\\\'], \\\\'firstname\\\\': [\\\\'Miguel\\\\', \\\\'John\\\\', \\\\'Dmitri\\\\', \\\\'Dmitri\\\\', \\\\'Jane\\\\'], \\\\'case_type\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'case_id\\\\': [\\\\'CR-1095\\\\', \\\\'CR-8597\\\\', \\\\'CR-6833\\\\', \\\\'CR-2899\\\\', \\\\'CR-5997\\\\'], \\\\'court_fee\\\\': [100, 150, 200, 100, 200], \\\\'jurisdiction\\\\': [\\\\'BOSTON\\\\', \\\\'houston\\\\', \\\\'chicago\\\\', \\\\'BOSTON\\\\', \\\\'houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_2 = pd.DataFrame({\\\\'Date_of_Case\\\\': [\\\\'2023/05/12\\\\', \\\\'2023/04/20\\\\', \\\\'2023/02/10\\\\', \\\\'2023/03/16\\\\', \\\\'2023/06/15\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseNumber\\\\': [\\\\'CASE-8206\\\\', \\\\'CASE-4328\\\\', \\\\'CASE-1915\\\\', \\\\'CASE-4283\\\\', \\\\'CASE-7732\\\\'], \\\\'CaseKind\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'Location\\\\': [\\\\'BOST\\\\', \\\\'HOUST\\\\', \\\\'CHIC\\\\', \\\\'BOSTO\\\\', \\\\'HOUST\\\\']})\\\\\\\\n\\\\\\\\ntemplate_df = pd.DataFrame({\\\\'CaseDate\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseType\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'CaseID\\\\': [\\\\'CASE-6761\\\\', \\\\'CASE-6089\\\\', \\\\'CASE-9565\\\\', \\\\'CASE-6222\\\\', \\\\'CASE-2702\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'Jurisdiction\\\\': [\\\\'Boston\\\\', \\\\'Houston\\\\', \\\\'Chicago\\\\', \\\\'Boston\\\\', \\\\'Houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_1.head().to_markdown()\\\"\\\\n}'}\\nFunction: | | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\nHuman: What are the key differences between the dataframe schemas?\"\n",
" ]\n",
"}\n",
"\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:llm:ChatOpenAI] [1.18s] Exiting LLM run with output:\n",
"\u001b[0m{\n",
" \"generations\": [\n",
" [\n",
" {\n",
" \"text\": \"\",\n",
" \"generation_info\": {\n",
" \"finish_reason\": \"function_call\"\n",
" },\n",
" \"message\": {\n",
" \"lc\": 1,\n",
" \"type\": \"constructor\",\n",
" \"id\": [\n",
" \"langchain\",\n",
" \"schema\",\n",
" \"messages\",\n",
" \"AIMessage\"\n",
" ],\n",
" \"kwargs\": {\n",
" \"content\": \"\",\n",
" \"additional_kwargs\": {\n",
" \"function_call\": {\n",
" \"name\": \"python_repl\",\n",
" \"arguments\": \"{\\n \\\"query\\\": \\\"set(source_df_1.columns) - set(template_df.columns)\\\"\\n}\"\n",
" }\n",
" }\n",
" }\n",
" }\n",
" }\n",
" ]\n",
" ],\n",
" \"llm_output\": {\n",
" \"token_usage\": {\n",
" \"prompt_tokens\": 1784,\n",
" \"completion_tokens\": 27,\n",
" \"total_tokens\": 1811\n",
" },\n",
" \"model_name\": \"gpt-3.5-turbo-0613\"\n",
" },\n",
" \"run\": null\n",
"}\n",
"\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl] Entering Tool run with input:\n",
"\u001b[0m\"{'query': 'set(source_df_1.columns) - set(template_df.columns)'}\"\n",
"\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:tool:python_repl] [0ms] Exiting Tool run with output:\n",
"\u001b[0m\"{'case_id', 'firstname', 'court_fee', 'case_type', 'lastname', 'case_date', 'jurisdiction'}\"\n",
"\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] Entering LLM run with input:\n",
"\u001b[0m{\n",
" \"prompts\": [\n",
" \"System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\\n\\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\\n\\n<source_df_1>\\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\n</source_df_1>\\n\\n<source_df_2>\\n| | Date_of_Case | Fee | FullName | CaseNumber | CaseKind | Location |\\n|---:|:---------------|------:|:-------------|:-------------|:-----------|:-----------|\\n| 0 | 2023/05/12 | 100 | Miguel Kim | CASE-8206 | Civil | BOST |\\n| 1 | 2023/04/20 | 150 | John Lee | CASE-4328 | Criminl | HOUST |\\n| 2 | 2023/02/10 | 200 | Dmitri Smith | CASE-1915 | Criminal | CHIC |\\n| 3 | 2023/03/16 | 100 | Dmitri Patel | CASE-4283 | Criminal | BOSTO |\\n| 4 | 2023/06/15 | 200 | Jane Ivanov | CASE-7732 | Family | HOUST |\\n</source_df_2>\\n\\n<template_df>\\n| | CaseDate | FullName | CaseType | CaseID | Fee | Jurisdiction |\\n|---:|:-----------|:-------------|:-----------|:----------|------:|:---------------|\\n| 0 | 2023-05-12 | Miguel Kim | Civil | CASE-6761 | 100 | Boston |\\n| 1 | 2023-04-20 | John Lee | Criminl | CASE-6089 | 150 | Houston |\\n| 2 | 2023-02-10 | Dmitri Smith | Criminal | CASE-9565 | 200 | Chicago |\\n| 3 | 2023-03-16 | Dmitri Patel | Criminal | CASE-6222 | 100 | Boston |\\n| 4 | 2023-06-15 | Jane Ivanov | Family | CASE-2702 | 200 | Houston |\\n</template_df>\\nYou can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.\\n\\nPlease work step by step through this process. 
You can make intermediate queries, validate your logic, and then move on to the next step.\\n\\nBe precise, analytical, thorough.\\n\\nHere is a history of the conversation with the user so far:\\n\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n \\\"query\\\": \\\"import pandas as pd\\\\\\\\n\\\\\\\\nsource_df_1 = pd.DataFrame({\\\\'case_date\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'lastname\\\\': [\\\\'Kim\\\\', \\\\'Lee\\\\', \\\\'Smith\\\\', \\\\'Patel\\\\', \\\\'Ivanov\\\\'], \\\\'firstname\\\\': [\\\\'Miguel\\\\', \\\\'John\\\\', \\\\'Dmitri\\\\', \\\\'Dmitri\\\\', \\\\'Jane\\\\'], \\\\'case_type\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'case_id\\\\': [\\\\'CR-1095\\\\', \\\\'CR-8597\\\\', \\\\'CR-6833\\\\', \\\\'CR-2899\\\\', \\\\'CR-5997\\\\'], \\\\'court_fee\\\\': [100, 150, 200, 100, 200], \\\\'jurisdiction\\\\': [\\\\'BOSTON\\\\', \\\\'houston\\\\', \\\\'chicago\\\\', \\\\'BOSTON\\\\', \\\\'houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_2 = pd.DataFrame({\\\\'Date_of_Case\\\\': [\\\\'2023/05/12\\\\', \\\\'2023/04/20\\\\', \\\\'2023/02/10\\\\', \\\\'2023/03/16\\\\', \\\\'2023/06/15\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseNumber\\\\': [\\\\'CASE-8206\\\\', \\\\'CASE-4328\\\\', \\\\'CASE-1915\\\\', \\\\'CASE-4283\\\\', \\\\'CASE-7732\\\\'], \\\\'CaseKind\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'Location\\\\': [\\\\'BOST\\\\', \\\\'HOUST\\\\', \\\\'CHIC\\\\', \\\\'BOSTO\\\\', \\\\'HOUST\\\\']})\\\\\\\\n\\\\\\\\ntemplate_df = pd.DataFrame({\\\\'CaseDate\\\\': [\\\\'2023-05-12\\\\', \\\\'2023-04-20\\\\', \\\\'2023-02-10\\\\', \\\\'2023-03-16\\\\', \\\\'2023-06-15\\\\'], \\\\'FullName\\\\': [\\\\'Miguel Kim\\\\', \\\\'John Lee\\\\', \\\\'Dmitri Smith\\\\', \\\\'Dmitri Patel\\\\', \\\\'Jane Ivanov\\\\'], \\\\'CaseType\\\\': [\\\\'Civil\\\\', \\\\'Criminl\\\\', \\\\'Criminal\\\\', \\\\'Criminal\\\\', \\\\'Family\\\\'], \\\\'CaseID\\\\': [\\\\'CASE-6761\\\\', \\\\'CASE-6089\\\\', \\\\'CASE-9565\\\\', \\\\'CASE-6222\\\\', \\\\'CASE-2702\\\\'], \\\\'Fee\\\\': [100, 150, 200, 100, 200], \\\\'Jurisdiction\\\\': [\\\\'Boston\\\\', \\\\'Houston\\\\', \\\\'Chicago\\\\', \\\\'Boston\\\\', \\\\'Houston\\\\']})\\\\\\\\n\\\\\\\\nsource_df_1.head().to_markdown()\\\"\\\\n}'}\\nFunction: | | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\\n| 1 | 2023-04-20 | Lee | John | Criminl | CR-8597 | 150 | houston |\\n| 2 | 2023-02-10 | Smith | Dmitri | Criminal | CR-6833 | 200 | chicago |\\n| 3 | 2023-03-16 | Patel | Dmitri | Criminal | CR-2899 | 100 | BOSTON |\\n| 4 | 2023-06-15 | Ivanov | Jane | Family | CR-5997 | 200 | houston |\\nAI: {'name': 'python_repl', 'arguments': '{\\\\n \\\"query\\\": \\\"set(source_df_1.columns) - set(template_df.columns)\\\"\\\\n}'}\\nFunction: {'case_id', 'firstname', 'court_fee', 'case_type', 'lastname', 'case_date', 'jurisdiction'}\\nHuman: What are the key differences between the dataframe schemas?\"\n",
" ]\n",
"}\n",
"\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 6:llm:ChatOpenAI] [8.40s] Exiting LLM run with output:\n",
"\u001b[0m{\n",
" \"generations\": [\n",
" [\n",
" {\n",
" \"text\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\",\n",
" \"generation_info\": {\n",
" \"finish_reason\": \"stop\"\n",
" },\n",
" \"message\": {\n",
" \"lc\": 1,\n",
" \"type\": \"constructor\",\n",
" \"id\": [\n",
" \"langchain\",\n",
" \"schema\",\n",
" \"messages\",\n",
" \"AIMessage\"\n",
" ],\n",
" \"kwargs\": {\n",
" \"content\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\",\n",
" \"additional_kwargs\": {}\n",
" }\n",
" }\n",
" }\n",
" ]\n",
" ],\n",
" \"llm_output\": {\n",
" \"token_usage\": {\n",
" \"prompt_tokens\": 1846,\n",
" \"completion_tokens\": 271,\n",
" \"total_tokens\": 2117\n",
" },\n",
" \"model_name\": \"gpt-3.5-turbo-0613\"\n",
" },\n",
" \"run\": null\n",
"}\n",
"\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor] [26.19s] Exiting Chain run with output:\n",
"\u001b[0m{\n",
" \"output\": \"The key differences between the dataframe schemas are as follows:\\n\\n1. The column names in `source_df_1` are different from the column names in `template_df`. The column names in `source_df_1` are: `case_date`, `lastname`, `firstname`, `case_type`, `case_id`, `court_fee`, and `jurisdiction`. The corresponding column names in `template_df` are: `CaseDate`, `FullName`, `CaseType`, `CaseID`, `Fee`, and `Jurisdiction`.\\n\\n2. The order of the columns is different between `source_df_1` and `template_df`.\\n\\n3. The values in the `case_date` column of `source_df_1` are in the format 'YYYY-MM-DD', while the values in the `CaseDate` column of `template_df` are in the format 'YYYY-MM-DD'.\\n\\n4. The values in the `court_fee` column of `source_df_1` are integers, while the values in the `Fee` column of `template_df` are also integers.\\n\\n5. The values in the `jurisdiction` column of `source_df_1` are in uppercase, while the values in the `Jurisdiction` column of `template_df` are in title case.\\n\\nThese are the key differences between the dataframe schemas.\"\n",
"}\n"
]
}
],
"source": [
"question = \"What are the key differences between the dataframe schemas?\"\n",
"res = agent_executor.run(input=question, chat_history=memory.chat_memory.messages)\n",
"memory.chat_memory.add_user_message(question)\n",
"memory.chat_memory.add_ai_message(res)"
]
},
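{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative follow-up turn (a sketch, not part of the original run): reuse the manually\n",
"# maintained memory so the agent sees the earlier exchange, mirroring the cell above.\n",
"follow_up = \"Propose a column-by-column mapping from each source dataframe to template_df.\"\n",
"res = agent_executor.run(input=follow_up, chat_history=memory.chat_memory.messages)\n",
"memory.chat_memory.add_user_message(follow_up)\n",
"memory.chat_memory.add_ai_message(res)"
]
},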
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_differences_between_dataframes\n"
]
}
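{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch, written by hand (not produced by the agent): one possible shape for the\n",
"# source-to-template mapping the system prompt asks for. Column names are taken from the\n",
"# df.head() samples above; the function names and the Location handling are assumptions.\n",
"def map_source_df_1_to_template(df: pd.DataFrame) -> pd.DataFrame:\n",
"    out = pd.DataFrame()\n",
"    out['CaseDate'] = pd.to_datetime(df['case_date']).dt.strftime('%Y-%m-%d')\n",
"    out['FullName'] = df['firstname'].str.strip() + ' ' + df['lastname'].str.strip()\n",
"    out['CaseType'] = df['case_type']\n",
"    out['CaseID'] = df['case_id']  # source uses a 'CR-' prefix, the template uses 'CASE-'\n",
"    out['Fee'] = df['court_fee']\n",
"    out['Jurisdiction'] = df['jurisdiction'].str.title()\n",
"    return out\n",
"\n",
"def map_source_df_2_to_template(df: pd.DataFrame) -> pd.DataFrame:\n",
"    out = pd.DataFrame()\n",
"    out['CaseDate'] = pd.to_datetime(df['Date_of_Case'], format='%Y/%m/%d').dt.strftime('%Y-%m-%d')\n",
"    out['FullName'] = df['FullName']\n",
"    out['CaseType'] = df['CaseKind']\n",
"    out['CaseID'] = df['CaseNumber']\n",
"    out['Fee'] = df['Fee']\n",
"    # Location holds truncated city names ('BOST', 'HOUST', ...); a faithful mapping would\n",
"    # need a lookup table, so this is only a best-effort passthrough.\n",
"    out['Jurisdiction'] = df['Location']\n",
"    return out\n",
"\n",
"map_source_df_1_to_template(entries_a_df).head()"
]
}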
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}