andymbryant committed on
Commit
eb94e92
·
1 Parent(s): faeec87

black formatting

Browse files
Files changed (6) hide show
  1. app.py +89 -22
  2. src/core.py +42 -19
  3. src/notebooks/output.py +1 -1
  4. src/prompt.py +4 -4
  5. src/types.py +24 -7
  6. transformation_code.py +4 -4
app.py CHANGED
@@ -1,81 +1,148 @@
1
  import gradio as gr
2
- from src.core import get_table_mapping, transform_source, process_csv_text, generate_mapping_code
 
 
 
 
 
3
 
4
  MAX_ROWS = 10
5
 
 
6
  def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
7
  return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
8
 
 
9
  # TODO: use tempfile
10
  def export_csv(df, filename):
11
  df.to_csv(filename, index=False)
12
  return gr.File.update(value=filename, visible=True)
13
 
 
14
  # TODO: use tempfile
15
  def export_text(val, filename):
16
  with open(filename, "w") as f:
17
  f.write(val)
18
  return gr.File.update(value=filename, visible=True)
19
 
 
20
  with gr.Blocks() as demo:
21
- gr.Markdown("# LLM Data Mapper\nThis is a LacThis is a demo of the LangChain platform. It is a tool for generating python code from natural language prompts. This demo is a simple ETL pipeline, where you upload a source CSV and a template CSV, and then generate python code to transform the source CSV into the template CSV. This is a simple example, but the platform can be used for much more complex tasks, such as generating python code from a natural language specification document.")
 
 
22
  # STEP 1
23
- generate_step_markdown(1, "Upload a Template CSV and a Source CSV.", "The schema will be extracted from the template file and the source file will be transformed to match the schema.")
 
 
 
 
24
  with gr.Row():
25
  with gr.Column():
26
- upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
 
 
 
 
 
27
  template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
28
- upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
 
 
29
  with gr.Column():
30
- upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
 
 
 
 
 
31
  source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
32
- upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
33
-
 
 
34
  # STEP 2
35
- generate_step_markdown(2, "Generate mapping from Source to Template.", "Once generated, you can edit the values directly in the table below and they will be incorporated into the mapping logic.")
 
 
 
 
36
  with gr.Row():
37
  generate_mapping_btn = gr.Button(value="Generate Mapping", variant="primary")
38
  with gr.Row():
39
  table_mapping_df = gr.DataFrame(max_rows=MAX_ROWS, interactive=True)
40
- generate_mapping_btn.click(fn=get_table_mapping, inputs=[source_df, template_df], outputs=[table_mapping_df])
41
-
 
 
 
 
42
  with gr.Row():
43
  save_mapping_btn = gr.Button(value="Save Mapping", variant="secondary")
44
  with gr.Row():
45
  csv = gr.File(interactive=False, visible=False)
46
- save_mapping_btn.click(lambda df: export_csv(df, "source_template_mapping.csv"), table_mapping_df, csv)
 
 
 
 
47
  mapping_file = gr.File(label="Downloaded File", visible=False)
48
  mapping_file.change(lambda x: x, mapping_file, table_mapping_df)
49
-
50
  # STEP 3
51
- generate_step_markdown(3, "Generate python code to transform Source to Template, using the generated mapping.", "Once generated, you can edit the code directly in the code block below and it will be incorporated into the transformation logic. And this is re-runnable! Update the mapping logic above to try it out.")
 
 
 
 
52
  with gr.Row():
53
- generate_code_btn = gr.Button(value="Generate Code from Mapping", variant="primary")
 
 
54
  with gr.Row():
55
  code_block = gr.Code(language="python")
56
- generate_code_btn.click(fn=generate_mapping_code, inputs=[table_mapping_df], outputs=[code_block])
 
 
57
 
58
  with gr.Row():
59
  save_code_btn = gr.Button(value="Save Code", variant="secondary")
60
  with gr.Row():
61
  text = gr.File(interactive=False, visible=False)
62
- save_code_btn.click(lambda txt: export_text(txt, "transformation_code.py"), code_block, text)
 
 
63
  code_file = gr.File(label="Downloaded File", visible=False)
64
  code_file.change(lambda x: x, code_file, code_block)
65
 
66
  # STEP 4
67
- generate_step_markdown(4, "Transform the Source CSV into the Template CSV using the generated code.", "And this is re-runnable! Update the logic above to try it out.")
 
 
 
 
68
  with gr.Row():
69
  transform_btn = gr.Button(value="Transform Source", variant="primary")
70
  with gr.Row():
71
- source_df_transformed = gr.Dataframe(label="Source (transformed)", max_rows=MAX_ROWS)
72
- transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
 
 
 
 
 
 
73
 
74
  with gr.Row():
75
- save_transformed_source_btn = gr.Button(value="Save Transformed Source", variant="secondary")
 
 
76
  with gr.Row():
77
  csv = gr.File(interactive=False, visible=False)
78
- save_transformed_source_btn.click(lambda df: export_csv(df, "transformed_source.csv"), source_df_transformed, csv)
 
 
 
 
79
  transform_file = gr.File(label="Downloaded File", visible=False)
80
  transform_file.change(lambda x: x, transform_file, source_df_transformed)
81
 
 
1
  import gradio as gr
2
+ from src.core import (
3
+ get_table_mapping,
4
+ transform_source,
5
+ process_csv_text,
6
+ generate_mapping_code,
7
+ )
8
 
9
  MAX_ROWS = 10
10
 
11
+
12
  def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
13
  return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
14
 
15
+
16
  # TODO: use tempfile
17
  def export_csv(df, filename):
18
  df.to_csv(filename, index=False)
19
  return gr.File.update(value=filename, visible=True)
20
 
21
+
22
  # TODO: use tempfile
23
  def export_text(val, filename):
24
  with open(filename, "w") as f:
25
  f.write(val)
26
  return gr.File.update(value=filename, visible=True)
27
 
28
+
29
  with gr.Blocks() as demo:
30
+ gr.Markdown(
31
+ "# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
32
+ )
33
  # STEP 1
34
+ generate_step_markdown(
35
+ 1,
36
+ "Upload a Template CSV and a Source CSV.",
37
+ "The schema will be extracted from the template file and the source file will be transformed to match the schema.",
38
+ )
39
  with gr.Row():
40
  with gr.Column():
41
+ upload_template_btn = gr.UploadButton(
42
+ label="Upload Template File",
43
+ file_types=[".csv"],
44
+ live=True,
45
+ file_count="single",
46
+ )
47
  template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
48
+ upload_template_btn.upload(
49
+ fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
50
+ )
51
  with gr.Column():
52
+ upload_source_button = gr.UploadButton(
53
+ label="Upload Source File",
54
+ file_types=[".csv"],
55
+ live=True,
56
+ file_count="single",
57
+ )
58
  source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
59
+ upload_source_button.upload(
60
+ fn=process_csv_text, inputs=upload_source_button, outputs=source_df
61
+ )
62
+
63
  # STEP 2
64
+ generate_step_markdown(
65
+ 2,
66
+ "Generate mapping from Source to Template.",
67
+ "Once generated, you can edit the values directly in the table below and they will be incorporated into the mapping logic.",
68
+ )
69
  with gr.Row():
70
  generate_mapping_btn = gr.Button(value="Generate Mapping", variant="primary")
71
  with gr.Row():
72
  table_mapping_df = gr.DataFrame(max_rows=MAX_ROWS, interactive=True)
73
+ generate_mapping_btn.click(
74
+ fn=get_table_mapping,
75
+ inputs=[source_df, template_df],
76
+ outputs=[table_mapping_df],
77
+ )
78
+
79
  with gr.Row():
80
  save_mapping_btn = gr.Button(value="Save Mapping", variant="secondary")
81
  with gr.Row():
82
  csv = gr.File(interactive=False, visible=False)
83
+ save_mapping_btn.click(
84
+ lambda df: export_csv(df, "source_template_mapping.csv"),
85
+ table_mapping_df,
86
+ csv,
87
+ )
88
  mapping_file = gr.File(label="Downloaded File", visible=False)
89
  mapping_file.change(lambda x: x, mapping_file, table_mapping_df)
90
+
91
  # STEP 3
92
+ generate_step_markdown(
93
+ 3,
94
+ "Generate python code to transform Source to Template, using the generated mapping.",
95
+ "Once generated, you can edit the code directly in the code block below and it will be incorporated into the transformation logic. And this is re-runnable! Update the mapping logic above to try it out.",
96
+ )
97
  with gr.Row():
98
+ generate_code_btn = gr.Button(
99
+ value="Generate Code from Mapping", variant="primary"
100
+ )
101
  with gr.Row():
102
  code_block = gr.Code(language="python")
103
+ generate_code_btn.click(
104
+ fn=generate_mapping_code, inputs=[table_mapping_df], outputs=[code_block]
105
+ )
106
 
107
  with gr.Row():
108
  save_code_btn = gr.Button(value="Save Code", variant="secondary")
109
  with gr.Row():
110
  text = gr.File(interactive=False, visible=False)
111
+ save_code_btn.click(
112
+ lambda txt: export_text(txt, "transformation_code.py"), code_block, text
113
+ )
114
  code_file = gr.File(label="Downloaded File", visible=False)
115
  code_file.change(lambda x: x, code_file, code_block)
116
 
117
  # STEP 4
118
+ generate_step_markdown(
119
+ 4,
120
+ "Transform the Source CSV into the Template CSV using the generated code.",
121
+ "And this is re-runnable! Update the logic above to try it out.",
122
+ )
123
  with gr.Row():
124
  transform_btn = gr.Button(value="Transform Source", variant="primary")
125
  with gr.Row():
126
+ source_df_transformed = gr.Dataframe(
127
+ label="Source (transformed)", max_rows=MAX_ROWS
128
+ )
129
+ transform_btn.click(
130
+ transform_source,
131
+ inputs=[source_df, code_block],
132
+ outputs=[source_df_transformed],
133
+ )
134
 
135
  with gr.Row():
136
+ save_transformed_source_btn = gr.Button(
137
+ value="Save Transformed Source", variant="secondary"
138
+ )
139
  with gr.Row():
140
  csv = gr.File(interactive=False, visible=False)
141
+ save_transformed_source_btn.click(
142
+ lambda df: export_csv(df, "transformed_source.csv"),
143
+ source_df_transformed,
144
+ csv,
145
+ )
146
  transform_file = gr.File(label="Downloaded File", visible=False)
147
  transform_file.change(lambda x: x, transform_file, source_df_transformed)
148
 
src/core.py CHANGED
@@ -10,55 +10,78 @@ from langchain.schema.output_parser import StrOutputParser
10
  from langchain.chat_models import ChatOpenAI
11
  from src.types import TableMapping
12
  from src.vars import NUM_ROWS_TO_RETURN
13
- from src.prompt import DATA_SCIENTIST_PROMPT_STR, SPEC_WRITER_PROMPT_STR, ENGINEER_PROMPT_STR
 
 
 
 
14
 
15
  load_dotenv()
16
 
17
- DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
18
- SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
19
 
20
  # TODO: consider different models for different prompts, e.g. natural language prompt might be better with higher temperature
21
  BASE_MODEL = ChatOpenAI(
22
- model_name='gpt-4',
23
  temperature=0,
24
  )
25
 
 
26
  def _get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
27
- return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
 
28
 
29
  def get_table_mapping(source_df, template_df):
30
- '''Use PydanticOutputParser to parse the output of the Data Scientist prompt into a TableMapping object.'''
31
  table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
32
  analyst_prompt = ChatPromptTemplate.from_template(
33
- template=DATA_SCIENTIST_PROMPT_STR,
34
- partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
 
 
35
  )
36
  mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
37
- table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": _get_data_str_from_df_for_prompt(source_df), "target_csv_str": _get_data_str_from_df_for_prompt(template_df)})
38
- return pd.DataFrame(table_mapping.dict()['table_mappings'])
 
 
 
 
 
 
39
 
40
  def _sanitize_python_output(text: str):
41
- '''Remove markdown from python code, as prompt returns it.'''
42
  _, after = text.split("```python")
43
  return after.split("```")[0]
44
 
 
45
  def generate_mapping_code(table_mapping_df) -> str:
46
- '''Chain two prompts together to generate python code from a table mapping: 1. technical spec writer, 2. python engineer'''
47
  writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
48
  engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
49
-
50
  writer_chain = writer_prompt | BASE_MODEL | StrOutputParser()
51
- engineer_chain = {"spec_str": writer_chain} | engineer_prompt | BASE_MODEL | StrOutputParser() | _sanitize_python_output
 
 
 
 
 
 
52
  return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
53
 
 
54
  def process_csv_text(temp_file):
55
- '''Process a CSV file into a dataframe, either from a string or a file.'''
56
  if isinstance(temp_file, str):
57
- df = pd.read_csv(io.StringIO(temp_file))
58
  else:
59
- df = pd.read_csv(temp_file.name)
60
  return df
61
 
 
62
  def transform_source(source_df, code_text: str):
63
- '''Use PythonAstREPLTool to transform a source dataframe using python code.'''
64
- return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)
 
10
  from langchain.chat_models import ChatOpenAI
11
  from src.types import TableMapping
12
  from src.vars import NUM_ROWS_TO_RETURN
13
+ from src.prompt import (
14
+ DATA_SCIENTIST_PROMPT_STR,
15
+ SPEC_WRITER_PROMPT_STR,
16
+ ENGINEER_PROMPT_STR,
17
+ )
18
 
19
  load_dotenv()
20
 
21
+ DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), "data")
22
+ SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, "synthetic")
23
 
24
  # TODO: consider different models for different prompts, e.g. natural language prompt might be better with higher temperature
25
  BASE_MODEL = ChatOpenAI(
26
+ model_name="gpt-4",
27
  temperature=0,
28
  )
29
 
30
+
31
  def _get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
32
+ return f"<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>"
33
+
34
 
35
  def get_table_mapping(source_df, template_df):
36
+ """Use PydanticOutputParser to parse the output of the Data Scientist prompt into a TableMapping object."""
37
  table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
38
  analyst_prompt = ChatPromptTemplate.from_template(
39
+ template=DATA_SCIENTIST_PROMPT_STR,
40
+ partial_variables={
41
+ "format_instructions": table_mapping_parser.get_format_instructions()
42
+ },
43
  )
44
  mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
45
+ table_mapping: TableMapping = mapping_chain.invoke(
46
+ {
47
+ "source_1_csv_str": _get_data_str_from_df_for_prompt(source_df),
48
+ "target_csv_str": _get_data_str_from_df_for_prompt(template_df),
49
+ }
50
+ )
51
+ return pd.DataFrame(table_mapping.dict()["table_mappings"])
52
+
53
 
54
  def _sanitize_python_output(text: str):
55
+ """Remove markdown from python code, as prompt returns it."""
56
  _, after = text.split("```python")
57
  return after.split("```")[0]
58
 
59
+
60
  def generate_mapping_code(table_mapping_df) -> str:
61
+ """Chain two prompts together to generate python code from a table mapping: 1. technical spec writer, 2. python engineer"""
62
  writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
63
  engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
64
+
65
  writer_chain = writer_prompt | BASE_MODEL | StrOutputParser()
66
+ engineer_chain = (
67
+ {"spec_str": writer_chain}
68
+ | engineer_prompt
69
+ | BASE_MODEL
70
+ | StrOutputParser()
71
+ | _sanitize_python_output
72
+ )
73
  return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
74
 
75
+
76
  def process_csv_text(temp_file):
77
+ """Process a CSV file into a dataframe, either from a string or a file."""
78
  if isinstance(temp_file, str):
79
+ df = pd.read_csv(io.StringIO(temp_file))
80
  else:
81
+ df = pd.read_csv(temp_file.name)
82
  return df
83
 
84
+
85
  def transform_source(source_df, code_text: str):
86
+ """Use PythonAstREPLTool to transform a source dataframe using python code."""
87
+ return PythonAstREPLTool(locals={"source_df": source_df}).run(code_text)
src/notebooks/output.py CHANGED
@@ -1 +1 @@
1
- # check this out
 
1
+ # check this out
src/prompt.py CHANGED
@@ -1,4 +1,4 @@
1
- DATA_SCIENTIST_PROMPT_STR = '''
2
  You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.
3
 
4
  Head of `source_csv`:
@@ -35,10 +35,10 @@ Remember:
35
  - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation
36
 
37
  Your response:
38
- '''
39
 
40
 
41
- SPEC_WRITER_PROMPT_STR = '''
42
  You are a product manager and technical writer for a software firm. Your task is to draft a specification document for a software engineer to design a component within an ETL pipeline, converting `source_df` to `target_df` using the provided mapping:
43
 
44
  {table_mapping}
@@ -63,7 +63,7 @@ This will be your only communication to the engineer. Ensure it's:
63
  - **Precise**: Be unambiguous and exact.
64
 
65
  Your response:
66
- '''
67
 
68
 
69
  ENGINEER_PROMPT_STR = '''
 
1
+ DATA_SCIENTIST_PROMPT_STR = """
2
  You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.
3
 
4
  Head of `source_csv`:
 
35
  - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation
36
 
37
  Your response:
38
+ """
39
 
40
 
41
+ SPEC_WRITER_PROMPT_STR = """
42
  You are a product manager and technical writer for a software firm. Your task is to draft a specification document for a software engineer to design a component within an ETL pipeline, converting `source_df` to `target_df` using the provided mapping:
43
 
44
  {table_mapping}
 
63
  - **Precise**: Be unambiguous and exact.
64
 
65
  Your response:
66
+ """
67
 
68
 
69
  ENGINEER_PROMPT_STR = '''
src/types.py CHANGED
@@ -1,12 +1,29 @@
1
  from pydantic import BaseModel, Field
2
 
 
3
  class TableMappingEntry(BaseModel):
4
- '''A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations.'''
5
- source_column_name: str = Field(..., description="Name of the column in the source table.")
6
- target_column_name: str = Field(..., description="Name of the column in the target table, to which the source column maps.")
7
- value_transformations: str = Field(..., description="Transformations needed make the source values match the target values. If unncecessary, write 'NO_TRANSFORM'.")
8
- explanation: str = Field(..., description="One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.")
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  class TableMapping(BaseModel):
11
- '''A list of table mappings collectively describe how a source table should be transformed to match the schema of a target table.'''
12
- table_mappings: list[TableMappingEntry] = Field(..., description="A list of table mappings.")
 
 
 
 
1
  from pydantic import BaseModel, Field
2
 
3
+
4
  class TableMappingEntry(BaseModel):
5
+ """A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations."""
6
+
7
+ source_column_name: str = Field(
8
+ ..., description="Name of the column in the source table."
9
+ )
10
+ target_column_name: str = Field(
11
+ ...,
12
+ description="Name of the column in the target table, to which the source column maps.",
13
+ )
14
+ value_transformations: str = Field(
15
+ ...,
16
+ description="Transformations needed make the source values match the target values. If unncecessary, write 'NO_TRANSFORM'.",
17
+ )
18
+ explanation: str = Field(
19
+ ...,
20
+ description="One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.",
21
+ )
22
+
23
 
24
  class TableMapping(BaseModel):
25
+ """A list of table mappings collectively describe how a source table should be transformed to match the schema of a target table."""
26
+
27
+ table_mappings: list[TableMappingEntry] = Field(
28
+ ..., description="A list of table mappings."
29
+ )
transformation_code.py CHANGED
@@ -4,13 +4,13 @@ import pandas as pd
4
  target_df = pd.DataFrame()
5
 
6
  # Copy the 'case_date' column from source_df to the 'CaseDate' column in target_df without any transformation
7
- target_df['CaseDate'] = source_df['case_date']
8
 
9
  # Concatenate the 'firstname' and 'lastname' columns from source_df (with a space in between) and store the result in the 'FullName' column in target_df
10
- target_df['FullName'] = source_df['firstname'] + " " + source_df['lastname']
11
 
12
  # Copy the 'case_type' column from source_df to the 'CaseType' column in target_df without any transformation
13
- target_df['CaseType'] = source_df['case_type']
14
 
15
  # Return the target_df as the output of the script
16
- target_df
 
4
  target_df = pd.DataFrame()
5
 
6
  # Copy the 'case_date' column from source_df to the 'CaseDate' column in target_df without any transformation
7
+ target_df["CaseDate"] = source_df["case_date"]
8
 
9
  # Concatenate the 'firstname' and 'lastname' columns from source_df (with a space in between) and store the result in the 'FullName' column in target_df
10
+ target_df["FullName"] = source_df["firstname"] + " " + source_df["lastname"]
11
 
12
  # Copy the 'case_type' column from source_df to the 'CaseType' column in target_df without any transformation
13
+ target_df["CaseType"] = source_df["case_type"]
14
 
15
  # Return the target_df as the output of the script
16
+ target_df