In [31]:
import os
import pandas as pd
import langchain
from langchain.agents import OpenAIFunctionsAgent, AgentExecutor
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import PythonAstREPLTool
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv
# Load variables from a local `.env` file into the process environment (python-dotenv).
# NOTE(review): presumably this is where the OpenAI API key used by ChatOpenAI comes from — confirm.
load_dotenv()

In [32]:
# --- Notebook-wide configuration -------------------------------------------

# Number of leading rows of each dataframe that get rendered into the prompt.
NUM_ROWS_IN_HEAD = 5

# CSV inputs are resolved relative to the kernel's current working directory.
data_dir_path = os.getcwd()

# Cap what pandas displays at 20 rows and 20 columns.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)

# Global LangChain debug mode: produces the [chain/start] / [llm/start] traces
# that appear in the output of the cell that runs the agent.
langchain.debug = True

# {dataframe_heads_str}

In [43]:
# System prompt for the schema-mapping agent.
# `{dataframe_heads_str}` is the only placeholder; it is filled with `str.format`
# using the tagged markdown heads of every dataframe. The closing line leads into
# the conversation history, so nothing should be appended after it in this string.
PROMPT_TEMPLATE = """You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.
The ultimate goal is to generate a mapping from the source dataframes to the target dataframe.

This is the result of running `df.head().to_markdown()` on each of the dataframes:

{dataframe_heads_str}
You can use these samples to draw conclusions about the structure of the data. Do not get more than 5 rows at a time.

Please work step by step through this process. You can make intermediate queries, validate your logic, and then move on to the next step.

Be precise, analytical, thorough.

Here is a history of the conversation with the user so far:
"""

In [44]:
# Argument schema handed to the `python_repl` tool: a single string field
# carrying the code the tool should execute.
# NOTE(review): documented with `#` comments on purpose — a class docstring on a
# pydantic model may be emitted in the generated tool schema; confirm before adding one.
class PythonInputs(BaseModel):
 # Python source to run; the description text is part of the tool's argument schema.
 query: str = Field(description="code snippet to run")

def format_df_for_prompt(df):
    """Render the first ``NUM_ROWS_IN_HEAD`` rows of ``df`` as a markdown table.

    Args:
        df: DataFrame to sample.

    Returns:
        The markdown table wrapped in one leading and one trailing newline.
    """
    return f'\n{df.head(NUM_ROWS_IN_HEAD).to_markdown()}\n'


# Load the two source extracts and the target template from the data directory.
entries_a_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))
entries_b_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))
template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))

# Names under which the dataframes are tagged in the prompt and exposed to the REPL tool.
df_name_to_df_map = {"source_df_1": entries_a_df, "source_df_2": entries_b_df, "template_df": template_df}

# One "<name>" tag followed by that dataframe's head table, per dataframe.
dataframe_heads_str_list = [
    f'<{df_name}>{format_df_for_prompt(df)}'
    for df_name, df in df_name_to_df_map.items()
]

prompt_template = PROMPT_TEMPLATE.format(dataframe_heads_str="\n\n".join(dataframe_heads_str_list))

# ChatPromptTemplate parses message strings as templates again, so any literal
# brace that arrived via cell values must be doubled; otherwise it is treated
# as a template variable (or fails to parse).
system_template = prompt_template.replace("{", "{{").replace("}", "}}")

prompt = ChatPromptTemplate.from_messages([
    ("system", system_template),
    # Prior turns. The system text ends by announcing the history, so it goes
    # right here. Callers must pass `chat_history` when invoking the executor.
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
    # The scratchpad holds this turn's function calls and their results, so it
    # has to follow the human message it is answering.
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# Stores the transcript; it is not attached to the executor, so the history is
# passed in and appended explicitly by whoever runs the agent.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# SECURITY: this tool executes model-generated Python inside the notebook
# kernel with direct access to the dataframes. Use trusted data/sandboxing.
repl = PythonAstREPLTool(
    locals=df_name_to_df_map,
    name="python_repl",
    description="Runs code and returns the output of the final line",
    args_schema=PythonInputs,
)
tools = [repl]

# `memory` and `handle_parsing_errors` are not fields of OpenAIFunctionsAgent
# (they were silently dropped), so they are no longer passed to it.
agent = OpenAIFunctionsAgent(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
    prompt=prompt,
    tools=tools,
)
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    max_iterations=5,
    early_stopping_method="generate",
    handle_parsing_errors=True,
)

In [45]:
# Ask the agent a single question, handing it the transcript recorded so far,
# then append this exchange (user turn first, then the answer) to the transcript.
question = "What are the key differences between the dataframe schemas?"
chat_log = memory.chat_memory
res = agent_executor.run(input=question, chat_history=chat_log.messages)
chat_log.add_user_message(question)
chat_log.add_ai_message(res)

[chain/start] [1:chain:AgentExecutor] Entering Chain run with input:
{
 "input": "What are the key differences between the dataframe schemas?",
 "chat_history": []
}
[llm/start] [1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
{
 "prompts": [
 "System: You are DataMapperGPT. Your job is to work with a human, who is a data engineer, to compare multiple source dataframes and map their structures to the schema of the target dataframe.\nThe ultimate goal is to generate a mapping from the source dataframes to the target dataframe.\n\nThis is the result of running `df.head().to_markdown()` on each of the dataframes:\n\n\n| | case_date | lastname | firstname | case_type | case_id | court_fee | jurisdiction |\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n| 0 | 2023-05-12 | Kim | Miguel | Civil | CR-1095 | 100 | BOSTON |\n| 1 | 2023-04-20 | Lee | John | Criminl |

In [None]:
# TODO: implement `get_differences_between_dataframes`.
# This cell previously held only the bare name, which is not defined in the
# visible notebook and would raise NameError on Restart & Run All.
