andymbryant committed on
Commit
88e8643
·
1 Parent(s): 8183509

added example data

Browse files
.gitignore CHANGED
@@ -2,3 +2,6 @@ venv/
2
  __pycache__/
3
  *.pyc
4
  .env
 
 
 
 
2
  __pycache__/
3
  *.pyc
4
  .env
5
+ *.json
6
+ wandb/
7
+ *.log
app.py CHANGED
@@ -8,10 +8,15 @@ from src.core import (
8
 
9
  MAX_ROWS = 10
10
 
11
-
12
  def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
13
  return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
14
 
 
 
 
 
 
15
 
16
  # TODO: use tempfile
17
  def export_csv(df, filename):
@@ -20,12 +25,13 @@ def export_csv(df, filename):
20
 
21
 
22
  # TODO: use tempfile
23
- def export_text(val, filename):
24
  with open(filename, "w") as f:
25
- f.write(val)
26
  return gr.File.update(value=filename, visible=True)
27
 
28
 
 
29
  with gr.Blocks() as demo:
30
  gr.Markdown(
31
  "# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
@@ -42,21 +48,33 @@ with gr.Blocks() as demo:
42
  label="Upload Template File",
43
  file_types=[".csv"],
44
  file_count="single",
 
 
 
 
 
45
  )
46
  template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
47
  upload_template_btn.upload(
48
  fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
49
  )
 
50
  with gr.Column():
51
  upload_source_button = gr.UploadButton(
52
  label="Upload Source File",
53
  file_types=[".csv"],
54
  file_count="single",
 
 
 
 
 
55
  )
56
  source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
57
  upload_source_button.upload(
58
  fn=process_csv_text, inputs=upload_source_button, outputs=source_df
59
  )
 
60
 
61
  # STEP 2
62
  generate_step_markdown(
 
8
 
9
  MAX_ROWS = 10
10
 
11
+ import pandas as pd
12
  def generate_step_markdown(step_number: int, subtitle: str, description: str = None):
13
  return gr.Markdown(f"# Step {step_number}\n\n ### {subtitle}\n{description}")
14
 
15
+ example_df = pd.read_csv('./src/data/synthetic/legal_entries_a.csv')
16
+ def load_example_template(template_df, example_df):
17
+ return template_df.update(example_df)
18
+
19
+
20
 
21
  # TODO: use tempfile
22
  def export_csv(df, filename):
 
25
 
26
 
27
  # TODO: use tempfile
28
+ def export_text(content, filename):
29
  with open(filename, "w") as f:
30
+ f.write(content)
31
  return gr.File.update(value=filename, visible=True)
32
 
33
 
34
+
35
  with gr.Blocks() as demo:
36
  gr.Markdown(
37
  "# LLM Data Mapper\n\nThis tool will help you map a source CSV to a template CSV, and then generate python code to transform the source CSV into the template CSV. You can edit all of the values, re-run the processes, and download files along the way."
 
48
  label="Upload Template File",
49
  file_types=[".csv"],
50
  file_count="single",
51
+ variant="primary",
52
+ )
53
+ load_template_btn = gr.Button(
54
+ value="Load Example Template File",
55
+ variant="secondary",
56
  )
57
  template_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
58
  upload_template_btn.upload(
59
  fn=process_csv_text, inputs=upload_template_btn, outputs=template_df
60
  )
61
+ load_template_btn.click(lambda _: pd.read_csv('./src/data/actual/template.csv'), upload_template_btn, template_df)
62
  with gr.Column():
63
  upload_source_button = gr.UploadButton(
64
  label="Upload Source File",
65
  file_types=[".csv"],
66
  file_count="single",
67
+ variant="primary",
68
+ )
69
+ load_source_button = gr.Button(
70
+ value="Load Example Source File",
71
+ variant="secondary",
72
  )
73
  source_df = gr.Dataframe(max_rows=MAX_ROWS, interactive=False)
74
  upload_source_button.upload(
75
  fn=process_csv_text, inputs=upload_source_button, outputs=source_df
76
  )
77
+ load_source_button.click(lambda _: pd.read_csv('./src/data/actual/table_A.csv'), upload_source_button, source_df)
78
 
79
  # STEP 2
80
  generate_step_markdown(
src/core.py CHANGED
@@ -76,12 +76,12 @@ def generate_mapping_code(table_mapping_df) -> str:
76
  return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
77
 
78
 
79
- def process_csv_text(temp_file):
80
- """Process a CSV file into a dataframe, either from a string or a file."""
81
- if isinstance(temp_file, str):
82
- df = pd.read_csv(io.StringIO(temp_file))
83
  else:
84
- df = pd.read_csv(temp_file.name)
85
  return df
86
 
87
 
 
76
  return engineer_chain.invoke({"table_mapping": str(table_mapping_df.to_dict())})
77
 
78
 
79
+ def process_csv_text(value):
80
+ """Process a CSV file into a dataframe, either from a string path or a file."""
81
+ if isinstance(value, str):
82
+ df = pd.read_csv(value)
83
  else:
84
+ df = pd.read_csv(value.name)
85
  return df
86
 
87
 
src/data/actual/table_A.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date_of_Policy,FullName,Insurance_Plan,Policy_No,Monthly_Premium,Department,JobTitle,Policy_Start,Full_Name,Insurance_Type,Policy_Num,Monthly_Cost
2
+ 05/01/2023,John Doe,Gold Plan,AB-12345,150.00,IT,Software Engineer,05/01/2023,John Doe,Gold,AB-12345,150.00
3
+ 05/02/2023,Jane Smith,Silver Plan,CD-67890,100.00,HR,HR Manager,05/02/2023,Jane Smith,Silver,CD-67890,100.00
4
+ 05/03/2023,Michael Brown,Bronze Plan,EF-10111,50.00,Marketing,Marketing Coordinator,05/03/2023,Michael Brown,Bronze,EF-10111,50.00
5
+ 05/04/2023,Alice Johnson,Gold Plan,GH-12121,150.00,Finance,Financial Analyst,05/04/2023,Alice Johnson,Gold,GH-12121,150.00
6
+ 05/05/2023,Bob Wilson,Silver Plan,IJ-13131,100.00,Sales,Sales Executive,05/05/2023,Bob Wilson,Silver,IJ-13131,100.00
7
+ 05/06/2023,Carol Martinez,Bronze Plan,KL-14141,50.00,Operations,Operations Manager,05/06/2023,Carol Martinez,Bronze,KL-14141,50.00
8
+ 05/07/2023,David Anderson,Gold Plan,MN-15151,150.00,Legal,Attorney,05/07/2023,David Anderson,Gold,MN-15151,150.00
9
+ 05/08/2023,Eva Thomas,Silver Plan,OP-16161,100.00,Product,Product Manager,05/08/2023,Eva Thomas,Silver,OP-16161,100.00
10
+ 05/09/2023,Frank Jackson,Bronze Plan,QR-17171,50.00,Engineering,Engineer,05/09/2023,Frank Jackson,Bronze,QR-17171,50.00
11
+ 05/10/2023,Grace White,Gold Plan,ST-18181,150.00,Design,Graphic Designer,05/10/2023,Grace White,Gold,ST-18181,150.00
src/data/actual/table_B.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PolicyDate,Name,PlanType,Policy_ID,PremiumAmount,Hobby,MaritalStatus,StartDate,Employee_Name,Plan_Name,PolicyID,Cost
2
+ 2023-05-01,Doe, John,GoldPackage,AB12345,Photography,Married,2023-05-01,John Doe,Gold,AB12345,150
3
+ 2023-05-02,Smith, Jane,SilverPackage,CD67890,Reading,Single,2023-05-02,Jane Smith,Silver,CD67890,100
4
+ 2023-05-03,Brown, Michael,BronzePackage,EF10111,Traveling,Divorced,2023-05-03,Michael Brown,Bronze,EF10111,50
5
+ 2023-05-04,Johnson, Alice,GoldPackage,GH12121,Cooking,Married,2023-05-04,Alice Johnson,Gold,GH12121,150
6
+ 2023-05-05,Wilson, Bob,SilverPackage,IJ13131,Hiking,Single,2023-05-05,Bob Wilson,Silver,IJ13131,100
7
+ 2023-05-06,Martinez, Carol,BronzePackage,KL14141,Swimming,Divorced,2023-05-06,Carol Martinez,Bronze,KL14141,50
8
+ 2023-05-07,Anderson, David,GoldPackage,MN15151,Gaming,Married,2023-05-07,David Anderson,Gold,MN15151,150
9
+ 2023-05-08,Thomas, Eva,SilverPackage,OP16161,Painting,Single,2023-05-08,Eva Thomas,Silver,OP16161,100
10
+ 2023-05-09,Jackson, Frank,BronzePackage,QR17171,Writing,Divorced,2023-05-09,Frank Jackson,Bronze,QR17171,50
11
+ 2023-05-10,White, Grace,GoldPackage,ST18181,Dancing,Married,2023-05-10,Grace White,Gold,ST18181,150
src/data/actual/template.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date,EmployeeName,Plan,PolicyNumber,Premium
2
+ 01-05-2023,John Doe,Gold,AB12345,150
3
+ 02-05-2023,Jane Smith,Silver,CD67890,100
4
+ 03-05-2023,Michael Brown,Bronze,EF10111,50
5
+ 04-05-2023,Alice Johnson,Gold,GH12121,150
6
+ 05-05-2023,Bob Wilson,Silver,IJ13131,100
7
+ 06-05-2023,Carol Martinez,Bronze,KL14141,50
8
+ 07-05-2023,David Anderson,Gold,MN15151,150
9
+ 08-05-2023,Eva Thomas,Silver,OP16161,100
10
+ 09-05-2023,Frank Jackson,Bronze,QR17171,50
11
+ 10-05-2023,Grace White,Gold,ST18181,150
src/prompt.py CHANGED
@@ -12,10 +12,10 @@ Head of `target_csv`:
12
  Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.
13
 
14
  For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.
15
- You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.
 
16
 
17
- Remember:
18
- 1. Which column in `target_csv` it maps to. You should consider the semantic meaning of the columns, not just the character similarity.
19
 
20
  Example mappings:
21
  - 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.
 
12
  Your job is to generate a thorough, precise summary of how `source_csv` should be transformed to adhere exactly to the `target_csv` schema.
13
 
14
  For each column in the `source_csv`, you must communicate which column in the `target_csv` it maps to, and how the values in the `source_csv` column should be transformed to match those in the `target_csv`.
15
+ You should consider the semantic meaning of the columns, not just the character similarity or positioning.
16
+ If there is no corresponding column in the `target_csv`, you should write 'DROP_COLUMN' in the `target_csv` column.
17
 
18
+ You can assume the rows are aligned: that is, the first row in `source_csv` corresponds to the first row in `target_csv`, and so on.
 
19
 
20
  Example mappings:
21
  - 'MunICipality' in `source_csv` should map to 'City' in `target_csv`.
src/types.py CHANGED
@@ -9,7 +9,7 @@ class TableMappingEntry(BaseModel):
9
  )
10
  target_column_name: str = Field(
11
  ...,
12
- description="Name of the column in the target table, to which the source column maps.",
13
  )
14
  value_transformations: str = Field(
15
  ...,
 
9
  )
10
  target_column_name: str = Field(
11
  ...,
12
+ description="Name of the column in the target table, to which the source column maps. If there is no target_column, write 'DROP_COLUMN'.",
13
  )
14
  value_transformations: str = Field(
15
  ...,
wandb/run-20230818_001703-qce5cvts/files/requirements.txt ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.8.5
3
+ aiosignal==1.3.1
4
+ altair==5.0.1
5
+ anyio==3.7.1
6
+ appdirs==1.4.4
7
+ appnope==0.1.3
8
+ asttokens==2.2.1
9
+ async-timeout==4.0.3
10
+ attrs==23.1.0
11
+ backcall==0.2.0
12
+ black==23.7.0
13
+ certifi==2023.7.22
14
+ charset-normalizer==3.2.0
15
+ click==8.1.6
16
+ comm==0.1.4
17
+ contourpy==1.1.0
18
+ cycler==0.11.0
19
+ dataclasses-json==0.5.14
20
+ debugpy==1.6.7.post1
21
+ decorator==5.1.1
22
+ docker-pycreds==0.4.0
23
+ exceptiongroup==1.1.3
24
+ executing==1.2.0
25
+ fastapi==0.101.1
26
+ ffmpy==0.3.1
27
+ filelock==3.12.2
28
+ fonttools==4.42.0
29
+ frozenlist==1.4.0
30
+ fsspec==2023.6.0
31
+ gitdb==4.0.10
32
+ gitpython==3.1.32
33
+ gradio-client==0.4.0
34
+ gradio==3.40.1
35
+ greenlet==2.0.2
36
+ h11==0.14.0
37
+ httpcore==0.17.3
38
+ httpx==0.24.1
39
+ huggingface-hub==0.16.4
40
+ idna==3.4
41
+ importlib-metadata==6.8.0
42
+ importlib-resources==6.0.1
43
+ ipykernel==6.25.1
44
+ ipython==8.14.0
45
+ jedi==0.19.0
46
+ jinja2==3.1.2
47
+ jsonschema-specifications==2023.7.1
48
+ jsonschema==4.19.0
49
+ jupyter-client==8.3.0
50
+ jupyter-core==5.3.1
51
+ kiwisolver==1.4.4
52
+ langchain==0.0.264
53
+ langsmith==0.0.22
54
+ linkify-it-py==2.0.2
55
+ markdown-it-py==2.2.0
56
+ markupsafe==2.1.3
57
+ marshmallow==3.20.1
58
+ matplotlib-inline==0.1.6
59
+ matplotlib==3.7.2
60
+ mdit-py-plugins==0.3.3
61
+ mdurl==0.1.2
62
+ multidict==6.0.4
63
+ mypy-extensions==1.0.0
64
+ nest-asyncio==1.5.7
65
+ numexpr==2.8.5
66
+ numpy==1.25.2
67
+ openai==0.27.8
68
+ openapi-schema-pydantic==1.2.4
69
+ orjson==3.9.4
70
+ packaging==23.1
71
+ pandas==2.0.3
72
+ parso==0.8.3
73
+ pathspec==0.11.2
74
+ pathtools==0.1.2
75
+ pexpect==4.8.0
76
+ pickleshare==0.7.5
77
+ pillow==10.0.0
78
+ pip==23.2.1
79
+ platformdirs==3.10.0
80
+ prompt-toolkit==3.0.39
81
+ protobuf==4.24.0
82
+ psutil==5.9.5
83
+ ptyprocess==0.7.0
84
+ pure-eval==0.2.2
85
+ pydantic==1.10.12
86
+ pydub==0.25.1
87
+ pygments==2.16.1
88
+ pyparsing==3.0.9
89
+ python-dateutil==2.8.2
90
+ python-dotenv==1.0.0
91
+ python-multipart==0.0.6
92
+ pytz==2023.3
93
+ pyyaml==6.0.1
94
+ pyzmq==25.1.1
95
+ referencing==0.30.2
96
+ requests==2.31.0
97
+ rpds-py==0.9.2
98
+ semantic-version==2.10.0
99
+ sentry-sdk==1.29.2
100
+ setproctitle==1.3.2
101
+ setuptools==56.0.0
102
+ six==1.16.0
103
+ smmap==5.0.0
104
+ sniffio==1.3.0
105
+ sqlalchemy==2.0.19
106
+ stack-data==0.6.2
107
+ starlette==0.27.0
108
+ tabulate==0.9.0
109
+ tenacity==8.2.3
110
+ tomli==2.0.1
111
+ toolz==0.12.0
112
+ tornado==6.3.3
113
+ tqdm==4.66.1
114
+ traitlets==5.9.0
115
+ typing-extensions==4.7.1
116
+ typing-inspect==0.9.0
117
+ tzdata==2023.3
118
+ uc-micro-py==1.0.2
119
+ urllib3==2.0.4
120
+ uvicorn==0.23.2
121
+ wandb==0.15.8
122
+ wcwidth==0.2.6
123
+ websockets==11.0.3
124
+ yarl==1.9.2
125
+ zipp==3.16.2