Spaces:
Runtime error
Runtime error
File size: 4,833 Bytes
baecba5 6d47c20 e332268 390d8ba baecba5 6d47c20 bfa58d3 ffd7c99 cb31cda baecba5 6d47c20 c60eaaf 24dcd2b f68b438 6d47c20 c60eaaf 6d47c20 ff99bb4 390d8ba 8eae5c4 ff99bb4 6d47c20 390d8ba 8668409 6d47c20 cb31cda 390d8ba cb31cda a2c5ca3 ce2caee a2c5ca3 6d47c20 390d8ba 24dcd2b 6d47c20 8eae5c4 390d8ba 8eae5c4 390d8ba 6d47c20 afe376b 6d47c20 ff99bb4 390d8ba ff99bb4 6d47c20 ff99bb4 6d47c20 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import os
from pathlib import Path

import gradio as gr
import pandas as pd
from realtabformer import REaLTabFormer
from scipy.io import arff
# Module-level single-table model; the generation callbacks in this file
# re-fit it on every request. Training is kept short on purpose: 25 epochs
# instead of the library default of 200.
rtf_model = REaLTabFormer(
    model_type="tabular",
    epochs=25,
    gradient_accumulation_steps=4,
)
def generate_data(file, num_samples):
    """Fit the shared tabular model on an uploaded dataset and sample from it.

    Args:
        file: The uploaded dataset. Either an object exposing its path as
            ``.name`` (as used by the file-upload input) or a plain path
            string. The path must end in ``.csv`` or ``.arff``.
        num_samples: Number of synthetic rows to generate.

    Returns:
        A ``(samples, csv_path)`` tuple: the synthetic rows as a DataFrame
        and the path of the CSV file they were written to.

    Raises:
        ValueError: If no file was provided or its extension is neither
            ``.csv`` nor ``.arff``.
    """
    if file is None:
        raise ValueError("Please upload a CSV or ARFF file first.")

    path = getattr(file, "name", file)
    suffix = Path(path).suffix.lower()

    if suffix == ".arff":
        # Close the handle deterministically instead of leaking it.
        with open(path, "rt") as handle:
            records, _meta = arff.loadarff(handle)
        # NOTE(review): scipy returns nominal ARFF attributes as bytes;
        # confirm the model handles that or decode them before fitting.
        df = pd.DataFrame(records)
    elif suffix == ".csv":
        df = pd.read_csv(path)
    else:
        raise ValueError(
            f"Unsupported file type {suffix!r}; expected '.csv' or '.arff'."
        )

    rtf_model.fit(df, num_bootstrap=10)  # Default is 500

    # Generate synthetic data. Slider values may arrive as floats.
    samples = rtf_model.sample(n_samples=int(num_samples))

    # DataFrame.to_csv(path) returns None, so write the file first and
    # hand the path itself to the file output.
    output_path = "samples.csv"
    samples.to_csv(output_path)
    return samples, output_path
def generate_relational_data(parent_file, child_file, join_on):
    """Train parent/child models on two related CSVs and sample from both.

    The shared tabular model is fitted on the parent table (without the join
    column) and saved to ``rtf_parent/``; a relational model is then fitted
    on the child table conditioned on the parent, and both are sampled.

    Args:
        parent_file: Parent CSV, as an object exposing its path as ``.name``
            or as a plain path string.
        child_file: Child CSV, in the same form as ``parent_file``.
        join_on: Name of the key column present in both tables.

    Returns:
        A 5-tuple ``(parent_samples, child_samples, row_update,
        parent_csv_path, child_csv_path)`` where ``row_update`` is a
        ``gr.update(visible=True)`` for the child output row.

    Raises:
        ValueError: If either file is missing or ``join_on`` is not a column
            of both tables.
    """
    if parent_file is None or child_file is None:
        raise ValueError("Both a parent and a child CSV file are required.")

    parent_df = pd.read_csv(getattr(parent_file, "name", parent_file))
    child_df = pd.read_csv(getattr(child_file, "name", child_file))

    # Make sure the join column exists in both tables. This is user input,
    # so raise explicitly rather than assert (asserts vanish under -O).
    missing_in = [
        label
        for label, frame in (("parent", parent_df), ("child", child_df))
        if join_on not in frame.columns
    ]
    if missing_in:
        raise ValueError(
            f"Join column {join_on!r} not found in the "
            f"{' and '.join(missing_in)} dataset."
        )

    rtf_model.fit(parent_df.drop(join_on, axis=1), num_bootstrap=100)
    pdir = Path("rtf_parent/")
    rtf_model.save(pdir)

    # Use the most recently saved parent model directory.
    parent_model_path = max(
        (p for p in pdir.glob("id*") if p.is_dir()),
        key=os.path.getmtime,
    )

    child_model = REaLTabFormer(
        model_type="relational",
        parent_realtabformer_path=parent_model_path,
        epochs=25,
        output_max_length=None,
        train_size=0.8,
    )
    child_model.fit(
        df=child_df,
        in_df=parent_df,
        join_on=join_on,
        num_bootstrap=10,
    )

    # Generate parent samples (fixed at 5 rows).
    parent_samples = rtf_model.sample(5)

    # Create the unique ids based on the index.
    parent_samples.index.name = join_on
    parent_samples = parent_samples.reset_index()

    # Generate the relational observations.
    child_samples = child_model.sample(
        input_unique_ids=parent_samples[join_on],
        input_df=parent_samples.drop(join_on, axis=1),
        gen_batch=5,
    )

    # DataFrame.to_csv(path) returns None, so write the files first and
    # return their paths for the file outputs.
    parent_csv_path = "parent_samples.csv"
    child_csv_path = "child_samples.csv"
    parent_samples.to_csv(parent_csv_path)
    child_samples.to_csv(child_csv_path)

    return (
        parent_samples,
        child_samples,
        gr.update(visible=True),
        parent_csv_path,
        child_csv_path,
    )
# Gradio UI: two input tabs (single-table and relational) sharing one set of
# output widgets, plus a second output row that only the relational flow
# makes visible.
with gr.Blocks() as demo:
    gr.Markdown("""
## REaLTabFormer: Generating Realistic Relational and Tabular Data using Transformers
""")
    gr.HTML('''
<p style="margin-bottom: 10px; font-size: 94%">
This is an unofficial demo for REaLTabFormer, an approach that can be used to generate synthetic data from single tabular data using GPT. The demo is based on the <a href='https://github.com/avsolatorio/REaLTabFormer' style='text-decoration: underline;' target='_blank'> Github </a> implementation provided by the authors.
</p>
''')
    gr.HTML('''
<p align="center"><img src="https://github.com/avsolatorio/RealTabFormer/raw/main/img/REalTabFormer_Final_EQ.png" style="width:40%"/></p>
''')

    with gr.Column():
        # Inputs for the single-table flow.
        with gr.Tab("Upload Data as File: Tabular Data"):
            data_input_u = gr.File(
                label='Upload Data File (Currently supports CSV and ARFF)',
                file_types=[".csv", ".arff"],
            )
            num_samples = gr.Slider(
                label="Number of Samples",
                minimum=5,
                maximum=100,
                value=5,
                step=10,
            )
            generate_data_btn = gr.Button('Generate Synthetic Data')

        # Inputs for the parent/child relational flow.
        with gr.Tab("Upload Data as File: Relational Data"):
            data_input_parent = gr.File(
                label='Upload Data File for Parent Dataset',
                file_types=[".csv"],
            )
            data_input_child = gr.File(
                label='Upload Data File for Child Dataset',
                file_types=[".csv"],
            )
            join_on = gr.Textbox(label='Column name to join on')
            generate_data_btn_relational = gr.Button('Generate Synthetic Data')

        # Primary outputs: synthetic table (or parent table) and its CSV.
        with gr.Row():
            #data_sample = gr.Dataframe(label = "Original Data")
            data_output = gr.Dataframe(label="Synthetic Data")
            data_output_file = gr.File(label="Synthetic Data File")

        # Child outputs stay hidden until the relational callback returns
        # an update that makes this row visible.
        with gr.Row(visible=False) as child_sample:
            data_output_child = gr.Dataframe(
                label="Synthetic Data for Child Dataset"
            )
            data_output_file_child = gr.File(
                label="Synthetic Data File for Child Dataset"
            )

    generate_data_btn.click(
        generate_data,
        inputs=[data_input_u, num_samples],
        outputs=[data_output, data_output_file],
    )
    # Output order must match the 5-tuple returned by the callback.
    generate_data_btn_relational.click(
        generate_relational_data,
        inputs=[data_input_parent, data_input_child, join_on],
        outputs=[
            data_output,
            data_output_child,
            child_sample,
            data_output_file,
            data_output_file_child,
        ],
    )
    examples = gr.Examples(
        examples=[['diabetes.arff', 5], ["titanic.csv", 15]],
        inputs=[data_input_u, num_samples],
        outputs=[data_output, data_output_file],
        cache_examples=True,
        fn=generate_data,
    )

demo.launch()