96abhishekarora's picture
Update app.py
5f4aff6 verified
raw
history blame
3.51 kB
import streamlit as st
import pandas as pd
import linktransformer as lt
# Function to convert DataFrame to CSV for download
def convert_df_to_csv(df):
    """Serialize ``df`` to CSV and return the result as UTF-8 encoded bytes.

    ``DataFrame.to_csv`` is called with its default arguments, so the index
    is written out as the leading column.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
# Page header: the app title followed by a one-line description of the demo.
PAGE_TITLE = 'Merge Dataframes using LinkTransformer'
PAGE_INTRO = 'LinkTransformer supports several AI-powered data wrangling operations - here is an example that allows you to use LLMs to merge data.'
st.title(PAGE_TITLE)
st.write(PAGE_INTRO)
# Function to load DataFrame
def load_dataframe(upload):
    """Read an uploaded file into a DataFrame.

    Args:
        upload: A file-like object exposing a ``name`` attribute (such as the
            value returned by ``st.file_uploader``), or ``None`` when nothing
            has been uploaded.

    Returns:
        The parsed DataFrame. An empty DataFrame is returned when ``upload``
        is ``None`` or when a CSV upload contains nothing to parse.
    """
    if upload is None:
        return pd.DataFrame()

    # Compare the extension case-insensitively so that e.g. "DATA.CSV" is not
    # misrouted to the Excel reader.
    if upload.name.lower().endswith('csv'):
        try:
            return pd.read_csv(upload)
        except pd.errors.EmptyDataError:
            # A file with no content has no header row to parse. Return an
            # empty frame so callers can report it via ``df.empty`` instead
            # of crashing.
            return pd.DataFrame()

    # Anything that is not a CSV is handed to the Excel reader.
    return pd.read_excel(upload)
# Upload widgets for the two tables to be linked (CSV only).
df1_upload = st.file_uploader("Upload DataFrame 1 (CSV)", type=['csv'], key='df1_upload')
df2_upload = st.file_uploader("Upload DataFrame 2 (CSV)", type=['csv'], key='df2_upload')

# Parse the uploads. load_dataframe returns an empty DataFrame (never None)
# when nothing has been uploaded yet.
df1 = load_dataframe(df1_upload)
df2 = load_dataframe(df2_upload)

# Show a preview only once a file has actually been uploaded. Testing the
# DataFrame itself against None is always true, which rendered an empty
# preview table before the user had uploaded anything.
if df1_upload is not None:
    st.write("DataFrame 1 Preview:")
    st.dataframe(df1.head())
if df2_upload is not None:
    st.write("DataFrame 2 Preview:")
    st.dataframe(df2.head())
# Model selection: the user may type any Hugging Face model path; the field
# is pre-filled with a default.
DEFAULT_MODEL_PATH = "all-MiniLM-L6-v2"
model_path = st.text_input(
    "Model path (HuggingFace)",
    value=DEFAULT_MODEL_PATH,
)
st.write(
    "We have trained several record linkage models! Just copy the Hugging Face model path from the [model zoo](https://linktransformer.github.io/)."
)
# Further guidance on choosing a model: https://linktransformer.github.io/
# Merge workflow: runs only once both files are uploaded and both parsed
# frames contain data.
if df1_upload is not None and df2_upload is not None:
    if not df1.empty and not df2.empty:
        # Let the user pick the match columns on each side. A non-empty frame
        # has at least one column, so the one-element slice pre-selects the
        # first column; it is passed as a list to make the default explicit.
        columns_df1 = df1.columns.tolist()
        columns_df2 = df2.columns.tolist()
        selected_columns_df1 = st.multiselect(
            "Select columns from DataFrame 1 to match on:",
            columns_df1,
            default=columns_df1[:1],
        )
        selected_columns_df2 = st.multiselect(
            "Select columns from DataFrame 2 to match on:",
            columns_df2,
            default=columns_df2[:1],
        )

        # Perform merge.
        # NOTE(review): widgets nested under st.button are only rendered on
        # the rerun triggered by that click; clicking the download button
        # presumably reruns the script and hides the preview again. Consider
        # keeping the merged frame in st.session_state - confirm against the
        # Streamlit button-behaviour docs.
        if st.button("Merge DataFrames"):
            if not selected_columns_df1 or not selected_columns_df2:
                # The user can clear a multiselect entirely; refuse to call
                # the merge with nothing to match on.
                st.warning("Please select at least one column from each DataFrame to match on.")
            else:
                model = lt.LinkTransformer(model_path)
                df_lm_matched = lt.merge(
                    df1,
                    df2,
                    merge_type='1:m',
                    on=None,
                    model=model,
                    left_on=selected_columns_df1,
                    right_on=selected_columns_df2,
                )
                st.write("Merged DataFrame Preview:")
                st.dataframe(df_lm_matched.head())

                # Download button for the merged DataFrame.
                csv = convert_df_to_csv(df_lm_matched)
                st.download_button(
                    label="Download as CSV",
                    data=csv,
                    file_name='merged_dataframe.csv',
                    mime='text/csv',
                )
    else:
        st.write("It appears that your dataframes are empty. Please upload valid dataframes.")
else:
    # The app only offers file uploads (there is no path input), so the
    # prompt asks for uploads only.
    st.write("Please upload both DataFrames.")
# Footer: usage caveat, link to the project website, and citation request,
# written out in that order.
for footer_note in (
    "Note that this space only supports CPU usage and is only recommended on small datasets. If you have access to the GPU, check out our python [package](https://github.com/dell-research-harvard/linktransformer/)!",
    "For more information and advanced usage, please visit the [LinkTransformer website](https://linktransformer.github.io/).",
    "If you use LinkTransformer in your research, please cite the following paper: [LinkTransformer: A Unified Package for Record Linkage with Transformer Language Models](https://arxiv.org/abs/2309.00789)",
):
    st.write(footer_note)