Spaces:

andymbryant
/

data-mapper

Runtime error

App Files Files Community

andymbryant committed on Aug 18, 2023

Commit

eb94e92

1 Parent(s): faeec87

black formatting

Browse files

Files changed (6) hide show

app.py +89 -22
src/core.py +42 -19
src/notebooks/output.py +1 -1
src/prompt.py +4 -4
src/types.py +24 -7
transformation_code.py +4 -4

app.py CHANGED Viewed

@@ -1,81 +1,148 @@
 import gradio as gr
-from src.core import get_table_mapping, transform_source, process_csv_text, generate_mapping_code
 MAX_ROWS = 10
 def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
     return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
 # TODO: use tempfile
 def export_csv(df, filename):
     df.to_csv(filename, index=False)
     return gr.File.update(value=filename, visible=True)
 # TODO: use tempfile
 def export_text(val, filename):
     with open(filename, "w") as f:
         f.write(val)
     return gr.File.update(value=filename, visible=True)
 with gr.Blocks() as demo:
-    gr.Markdown("# LLM Data Mapper\nThis is a LacThis is a demo of the LangChain platform. It is a tool for generating python code from natural language prompts. This demo is a simple ETL pipeline, where you upload a source CSV and a template CSV, and then generate python code to transform the source CSV into the template CSV. This is a simple example, but the platform can be used for much more complex tasks, such as generating python code from a natural language specification document.")
     # STEP 1
-    generate_step_markdown(1, "Upload a Template CSV and a Source CSV.", "The schema will be extracted from the template file and the source file will be transformed to match the schema.")
     with gr.Row():
         with gr.Column():
-            upload_template_btn = gr.UploadButton(label="Upload Template File", file_types = ['.csv'], live=True, file_count = "single")
             template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
-            upload_template_btn.upload(fn=process_csv_text, inputs=upload_template_btn, outputs=template_df)
         with gr.Column():
-            upload_source_button = gr.UploadButton(label="Upload Source File", file_types = ['.csv'], live=True, file_count = "single")
             source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
-            upload_source_button.upload(fn=process_csv_text, inputs=upload_source_button, outputs=source_df)
     # STEP 2
-    generate_step_markdown(2, "Generate mapping from Source to Template.", "Once generated, you can edit the values directly in the table below and they will be incorporated into the mapping logic.")
     with gr.Row():
         generate_mapping_btn = gr.Button(value="Generate Mapping", variant="primary")
     with gr.Row():
         table_mapping_df = gr.DataFrame(max_rows=MAX_ROWS, interactive=True)
-        generate_mapping_btn.click(fn=get_table_mapping, inputs=[source_df, template_df], outputs=[table_mapping_df])
     with gr.Row():
         save_mapping_btn = gr.Button(value="Save Mapping", variant="secondary")
     with gr.Row():
         csv = gr.File(interactive=False, visible=False)
-        save_mapping_btn.click(lambda df: export_csv(df, "source_template_mapping.csv"), table_mapping_df, csv)
         mapping_file = gr.File(label="Downloaded File", visible=False)
         mapping_file.change(lambda x: x, mapping_file, table_mapping_df)
     # STEP 3
-    generate_step_markdown(3, "Generate python code to transform Source to Template, using the generated mapping.", "Once generated, you can edit the code directly in the code block below and it will be incorporated into the transformation logic. And this is re-runnable! Update the mapping logic above to try it out.")
     with gr.Row():
-        generate_code_btn = gr.Button(value="Generate Code from Mapping", variant="primary")
     with gr.Row():
         code_block = gr.Code(language="python")
-        generate_code_btn.click(fn=generate_mapping_code, inputs=[table_mapping_df], outputs=[code_block])
     with gr.Row():
         save_code_btn = gr.Button(value="Save Code", variant="secondary")
     with gr.Row():
         text = gr.File(interactive=False, visible=False)
-        save_code_btn.click(lambda txt: export_text(txt, "transformation_code.py"), code_block, text)
         code_file = gr.File(label="Downloaded File", visible=False)
         code_file.change(lambda x: x, code_file, code_block)
     # STEP 4
-    generate_step_markdown(4, "Transform the Source CSV into the Template CSV using the generated code.", "And this is re-runnable! Update the logic above to try it out.")
     with gr.Row():
         transform_btn = gr.Button(value="Transform Source", variant="primary")
     with gr.Row():
-        source_df_transformed = gr.Dataframe(label="Source (transformed)", max_rows=MAX_ROWS)
-        transform_btn.click(transform_source, inputs=[source_df, code_block], outputs=[source_df_transformed])
     with gr.Row():
-        save_transformed_source_btn = gr.Button(value="Save Transformed Source", variant="secondary")
     with gr.Row():
         csv = gr.File(interactive=False, visible=False)
-        save_transformed_source_btn.click(lambda df: export_csv(df, "transformed_source.csv"), source_df_transformed, csv)
         transform_file = gr.File(label="Downloaded File", visible=False)
         transform_file.change(lambda x: x, transform_file, source_df_transformed)

 import gradio as gr
+from src.core import (
+    get_table_mapping,
+    transform_source,
+    process_csv_text,
+    generate_mapping_code,
+)
 MAX_ROWS = 10
 def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
     """Build the Markdown heading shown above one step of the workflow.

     Args:
         step_number: Number displayed in the "Step N" title.
         subtitle: Short heading rendered beneath the title.
         description: Optional explanatory text placed under the subtitle.

     Returns:
         The ``gr.Markdown`` component created from the formatted text.
     """
     # Without this guard an omitted description is interpolated as the
     # literal text "None" and shown to the user.
     details = "" if description is None else description
     return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{details}")
 # TODO: use tempfile
 def export_csv(df, filename):
     """Save ``df`` as a CSV file and return an update that reveals it.

     The dataframe is written to ``filename`` without its index column. The
     returned ``gr.File.update`` sets that path as the value and makes the
     component visible.
     """
     df.to_csv(filename, index=False)
     revealed_file = gr.File.update(value=filename, visible=True)
     return revealed_file
 # TODO: use tempfile
 def export_text(val, filename):
     """Write ``val`` to ``filename`` and return an update that reveals the file.

     Args:
         val: Text to store (the app passes the contents of the code block).
         filename: Path of the file to create or overwrite.

     Returns:
         A ``gr.File.update`` with ``filename`` as its value and ``visible=True``.
     """
     # Pin the encoding: the text may contain non-ASCII characters, and the
     # platform default encoding could fail on them or corrupt them.
     with open(filename, "w", encoding="utf-8") as f:
         f.write(val)
     return gr.File.update(value=filename, visible=True)
 with gr.Blocks() as demo:
+    gr.Markdown(
+        "# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
+    )
     # STEP 1
+    generate_step_markdown(
+        1,
+        "Upload a Template CSV and a Source CSV.",
+        "The schema will be extracted from the template file and the source file will be transformed to match the schema.",
+    )
     with gr.Row():
         with gr.Column():
+            upload_template_btn = gr.UploadButton(
+                label="Upload Template File",
+                file_types=[".csv"],
+                live=True,
+                file_count="single",
+            )
             template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
+            upload_template_btn.upload(
+                fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
+            )
         with gr.Column():
+            upload_source_button = gr.UploadButton(
+                label="Upload Source File",
+                file_types=[".csv"],
+                live=True,
+                file_count="single",
+            )
             source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
+            upload_source_button.upload(
+                fn=process_csv_text, inputs=upload_source_button, outputs=source_df
+            )
     # STEP 2
+    generate_step_markdown(
+        2,
+        "Generate mapping from Source to Template.",
+        "Once generated, you can edit the values directly in the table below and they will be incorporated into the mapping logic.",
+    )
     with gr.Row():
         generate_mapping_btn = gr.Button(value="Generate Mapping", variant="primary")
     with gr.Row():
         table_mapping_df = gr.DataFrame(max_rows=MAX_ROWS, interactive=True)
+        generate_mapping_btn.click(
+            fn=get_table_mapping,
+            inputs=[source_df, template_df],
+            outputs=[table_mapping_df],
+        )
     with gr.Row():
         save_mapping_btn = gr.Button(value="Save Mapping", variant="secondary")
     with gr.Row():
         csv = gr.File(interactive=False, visible=False)
+        save_mapping_btn.click(
+            lambda df: export_csv(df, "source_template_mapping.csv"),
+            table_mapping_df,
+            csv,
+        )
         mapping_file = gr.File(label="Downloaded File", visible=False)
         mapping_file.change(lambda x: x, mapping_file, table_mapping_df)
     # STEP 3
+    generate_step_markdown(
+        3,
+        "Generate python code to transform Source to Template, using the generated mapping.",
+        "Once generated, you can edit the code directly in the code block below and it will be incorporated into the transformation logic. And this is re-runnable! Update the mapping logic above to try it out.",
+    )
     with gr.Row():
+        generate_code_btn = gr.Button(
+            value="Generate Code from Mapping", variant="primary"
+        )
     with gr.Row():
         code_block = gr.Code(language="python")
+        generate_code_btn.click(
+            fn=generate_mapping_code, inputs=[table_mapping_df], outputs=[code_block]
+        )
     with gr.Row():
         save_code_btn = gr.Button(value="Save Code", variant="secondary")
     with gr.Row():
         text = gr.File(interactive=False, visible=False)
+        save_code_btn.click(
+            lambda txt: export_text(txt, "transformation_code.py"), code_block, text
+        )
         code_file = gr.File(label="Downloaded File", visible=False)
         code_file.change(lambda x: x, code_file, code_block)
     # STEP 4
+    generate_step_markdown(
+        4,
+        "Transform the Source CSV into the Template CSV using the generated code.",
+        "And this is re-runnable! Update the logic above to try it out.",
+    )
     with gr.Row():
         transform_btn = gr.Button(value="Transform Source", variant="primary")
     with gr.Row():
+        source_df_transformed = gr.Dataframe(
+            label="Source (transformed)", max_rows=MAX_ROWS
+        )
+        transform_btn.click(
+            transform_source,
+            inputs=[source_df, code_block],
+            outputs=[source_df_transformed],
+        )
     with gr.Row():
+        save_transformed_source_btn = gr.Button(
+            value="Save Transformed Source", variant="secondary"
+        )
     with gr.Row():
         csv = gr.File(interactive=False, visible=False)
+        save_transformed_source_btn.click(
+            lambda df: export_csv(df, "transformed_source.csv"),
+            source_df_transformed,
+            csv,
+        )
         transform_file = gr.File(label="Downloaded File", visible=False)
         transform_file.change(lambda x: x, transform_file, source_df_transformed)

src/core.py CHANGED Viewed

@@ -10,55 +10,78 @@ from langchain.schema.output_parser import StrOutputParser
 from langchain.chat_models import ChatOpenAI
 from src.types import TableMapping
 from src.vars import NUM_ROWS_TO_RETURN
-from src.prompt import DATA_SCIENTIST_PROMPT_STR, SPEC_WRITER_PROMPT_STR, ENGINEER_PROMPT_STR
 load_dotenv()
-DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), 'data')
-SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, 'synthetic')
 # TODO: consider different models for different prompts, e.g. natural language prompt might be better with higher temperature
 BASE_MODEL = ChatOpenAI(
-    model_name='gpt-4',
     temperature=0,
 )
 def _get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
-    return f'<df>\n{df.head(num_rows_to_return).to_markdown()}\n</df>'
 def get_table_mapping(source_df, template_df):
-    '''Use PydanticOutputParser to parse the output of the Data Scientist prompt into a TableMapping object.'''
     table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
     analyst_prompt = ChatPromptTemplate.from_template(
-        template=DATA_SCIENTIST_PROMPT_STR,
-        partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
     )
     mapping_chain = analyst_prompt | BASE_MODEL | table_mapping_parser
-    table_mapping: TableMapping = mapping_chain.invoke({"source_1_csv_str": _get_data_str_from_df_for_prompt(source_df), "target_csv_str": _get_data_str_from_df_for_prompt(template_df)})
-    return pd.DataFrame(table_mapping.dict()['table_mappings'])
 def _sanitize_python_output(text: str):
-    '''Remove markdown from python code, as prompt returns it.'''
     _, after = text.split("```python")
     return after.split("```")[0]
 def generate_mapping_code(table_mapping_df) -> str:
-    '''Chain two prompts together to generate python code from a table mapping: 1. technical spec writer, 2. python engineer'''
     writer_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
     engineer_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
     writer_chain = writer_prompt | BASE_MODEL | StrOutputParser()
-    engineer_chain = {"spec_str": writer_chain} | engineer_prompt | BASE_MODEL | StrOutputParser() | _sanitize_python_output
     return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
 def process_csv_text(temp_file):
-    '''Process a CSV file into a dataframe, either from a string or a file.'''
     if isinstance(temp_file, str):
-      df = pd.read_csv(io.StringIO(temp_file))
     else:
-      df = pd.read_csv(temp_file.name)
     return df
 def transform_source(source_df, code_text: str):
-    '''Use PythonAstREPLTool to transform a source dataframe using python code.'''
-    return PythonAstREPLTool(locals={'source_df': source_df}).run(code_text)

 from langchain.chat_models import ChatOpenAI
 from src.types import TableMapping
 from src.vars import NUM_ROWS_TO_RETURN
+from src.prompt import (
+    DATA_SCIENTIST_PROMPT_STR,
+    SPEC_WRITER_PROMPT_STR,
+    ENGINEER_PROMPT_STR,
+)
 load_dotenv()
+DATA_DIR_PATH = os.path.join(os.path.dirname(__file__), "data")
+SYNTHETIC_DATA_DIR_PATH = os.path.join(DATA_DIR_PATH, "synthetic")
 # TODO: consider different models for different prompts, e.g. natural language prompt might be better with higher temperature
 BASE_MODEL = ChatOpenAI(
+    model_name="gpt-4",
     temperature=0,
 )
 def _get_data_str_from_df_for_prompt(df, num_rows_to_return=NUM_ROWS_TO_RETURN):
     """Render the first rows of ``df`` as a markdown table wrapped in ``<df>`` tags."""
     preview_markdown = df.head(num_rows_to_return).to_markdown()
     return f"<df>\n{preview_markdown}\n</df>"
 def get_table_mapping(source_df, template_df):
     """Ask the model for a source-to-template column mapping, returned as a DataFrame.

     Previews of both dataframes are sent through the Data Scientist prompt. The
     reply is parsed into a ``TableMapping`` by ``PydanticOutputParser``, and its
     ``table_mappings`` entries become the rows of the returned DataFrame.
     """
     parser = PydanticOutputParser(pydantic_object=TableMapping)
     format_instructions = parser.get_format_instructions()
     prompt = ChatPromptTemplate.from_template(
         template=DATA_SCIENTIST_PROMPT_STR,
         partial_variables={"format_instructions": format_instructions},
     )
     chain = prompt | BASE_MODEL | parser
     prompt_inputs = {
         "source_1_csv_str": _get_data_str_from_df_for_prompt(source_df),
         "target_csv_str": _get_data_str_from_df_for_prompt(template_df),
     }
     parsed: TableMapping = chain.invoke(prompt_inputs)
     mapping_rows = parsed.dict()["table_mappings"]
     return pd.DataFrame(mapping_rows)
 def _sanitize_python_output(text: str):
+    """Remove markdown from python code, as prompt returns it."""
     _, after = text.split("```python")
     return after.split("```")[0]
 def generate_mapping_code(table_mapping_df) -> str:
     """Turn a table mapping into python code via two chained prompts.

     The spec-writer prompt drafts a specification from the mapping. The
     engineer prompt receives it as ``spec_str`` and writes the code, which is
     then passed through ``_sanitize_python_output``.
     """
     spec_prompt = ChatPromptTemplate.from_template(SPEC_WRITER_PROMPT_STR)
     code_prompt = ChatPromptTemplate.from_template(ENGINEER_PROMPT_STR)
     spec_chain = spec_prompt | BASE_MODEL | StrOutputParser()
     code_chain = {"spec_str": spec_chain} | code_prompt | BASE_MODEL | StrOutputParser()
     pipeline = code_chain | _sanitize_python_output
     mapping_text = str(table_mapping_df.to_dict())
     return pipeline.invoke({"table_mapping": mapping_text})
 def process_csv_text(temp_file):
     """Load CSV content into a dataframe.

     ``temp_file`` is either the CSV text itself (a ``str``) or an object whose
     ``name`` attribute is the path of the CSV file to read.
     """
     if not isinstance(temp_file, str):
         return pd.read_csv(temp_file.name)
     return pd.read_csv(io.StringIO(temp_file))
 def transform_source(source_df, code_text: str):
     """Run ``code_text`` in a PythonAstREPLTool with ``source_df`` in scope.

     Returns whatever the tool's ``run`` call returns for that code.

     NOTE(review): this executes whatever code is in ``code_text``, which the
     app takes from the editable code block. Confirm that is acceptable for
     the environment the app is hosted in.
     """
     repl_tool = PythonAstREPLTool(locals={"source_df": source_df})
     return repl_tool.run(code_text)

src/notebooks/output.py CHANGED Viewed

	@@ -1 +1 @@
1	- # check this out


1	+ # check this out

src/prompt.py CHANGED Viewed

@@ -1,4 +1,4 @@
-DATA_SCIENTIST_PROMPT_STR = '''
 You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.
 Head of `source_csv`:
@@ -35,10 +35,10 @@ Remember:
 - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation
 Your response:
-'''
-SPEC_WRITER_PROMPT_STR = '''
 You are a product manager and technical writer for a software firm. Your task is to draft a specification document for a software engineer to design a component within an ETL pipeline, converting `source_df` to `target_df` using the provided mapping:
 {table_mapping}
@@ -63,7 +63,7 @@ This will be your only communication to the engineer. Ensure it's:
 - **Precise**: Be unambiguous and exact.
 Your response:
-'''
 ENGINEER_PROMPT_STR = '''

+DATA_SCIENTIST_PROMPT_STR = """
 You are a Data Scientist, who specializes in generating schema mappings for use by Software Engineers in ETL pipelines.
 Head of `source_csv`:
 - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation
 Your response:
+"""
+SPEC_WRITER_PROMPT_STR = """
 You are a product manager and technical writer for a software firm. Your task is to draft a specification document for a software engineer to design a component within an ETL pipeline, converting `source_df` to `target_df` using the provided mapping:
 {table_mapping}
 - **Precise**: Be unambiguous and exact.
 Your response:
+"""
 ENGINEER_PROMPT_STR = '''

src/types.py CHANGED Viewed

@@ -1,12 +1,29 @@
 from pydantic import BaseModel, Field
 class TableMappingEntry(BaseModel):
-    '''A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations.'''
-    source_column_name: str = Field(..., description="Name of the column in the source table.")
-    target_column_name: str = Field(..., description="Name of the column in the target table, to which the source column maps.")
-    value_transformations: str = Field(..., description="Transformations needed make the source values match the target values. If unncecessary, write 'NO_TRANSFORM'.")
-    explanation: str = Field(..., description="One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.")
 class TableMapping(BaseModel):
-    '''A list of table mappings collectively describe how a source table should be transformed to match the schema of a target table.'''
-    table_mappings: list[TableMappingEntry] = Field(..., description="A list of table mappings.")

 from pydantic import BaseModel, Field
 class TableMappingEntry(BaseModel):
     """A single row in a table mapping. Describes how a single column in a source table maps to a single column in a target table, including any necessary transformations, and their explanations."""
     # All four fields are required (`...` means no default). The description
     # strings are unchanged except for two typo fixes in
     # `value_transformations` ("needed make", "unncecessary").
     source_column_name: str = Field(
         ..., description="Name of the column in the source table."
     )
     target_column_name: str = Field(
         ...,
         description="Name of the column in the target table, to which the source column maps.",
     )
     value_transformations: str = Field(
         ...,
         description="Transformations needed to make the source values match the target values. If unnecessary, write 'NO_TRANSFORM'.",
     )
     explanation: str = Field(
         ...,
         description="One-sentence explanation of this row (source-target mapping/transformation). Include any information that might be relevant to a software engineer building an ETL pipeline with this document.",
     )
 class TableMapping(BaseModel):
     """A list of table mappings collectively describe how a source table should be transformed to match the schema of a target table."""
     # Required field holding one TableMappingEntry per mapped column.
     table_mappings: list[TableMappingEntry] = Field(..., description="A list of table mappings.")

transformation_code.py CHANGED Viewed

@@ -4,13 +4,13 @@ import pandas as pd
 target_df = pd.DataFrame()
 # Copy the 'case_date' column from source_df to the 'CaseDate' column in target_df without any transformation
-target_df['CaseDate'] = source_df['case_date']
 # Concatenate the 'firstname' and 'lastname' columns from source_df (with a space in between) and store the result in the 'FullName' column in target_df
-target_df['FullName'] = source_df['firstname'] + " " + source_df['lastname']
 # Copy the 'case_type' column from source_df to the 'CaseType' column in target_df without any transformation
-target_df['CaseType'] = source_df['case_type']
 # Return the target_df as the output of the script
-target_df

 target_df = pd.DataFrame()
 # Copy the 'case_date' column from source_df to the 'CaseDate' column in target_df without any transformation
+target_df["CaseDate"] = source_df["case_date"]
 # Concatenate the 'firstname' and 'lastname' columns from source_df (with a space in between) and store the result in the 'FullName' column in target_df
+target_df["FullName"] = source_df["firstname"] + " " + source_df["lastname"]
 # Copy the 'case_type' column from source_df to the 'CaseType' column in target_df without any transformation
+target_df["CaseType"] = source_df["case_type"]
 # Return the target_df as the output of the script
+target_df