rolaser-demo / app.py
lydianish's picture
Update app.py
c2cb68d verified
raw
history blame
5.07 kB
import os
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.preprocessing import normalize
from rolaser import RoLaserEncoder
@st.cache_resource(show_spinner=False)
def load_models():
    """Build the LASER, RoLASER and c-RoLASER sentence encoders.

    Checkpoint and vocabulary files are looked up under the ``models``
    directory of the paths given by the ``LASER`` and ``ROLASER``
    environment variables. A missing variable raises ``KeyError``.

    Returns:
        A ``(laser_model, rolaser_model, c_rolaser_model)`` tuple of
        ``RoLaserEncoder`` instances.
    """

    def build_encoder(env_var, stem, tokenizer):
        # The environment is read lazily, per file, so a missing variable
        # only surfaces when the corresponding encoder is about to be built.
        return RoLaserEncoder(
            model_path=f"{os.environ[env_var]}/models/{stem}.pt",
            vocab=f"{os.environ[env_var]}/models/{stem}.cvocab",
            tokenizer=tokenizer,
        )

    # (environment variable, file stem, tokenizer name), in return order.
    encoder_specs = (
        ('LASER', 'laser2', 'spm'),
        ('ROLASER', 'rolaser', 'roberta'),
        ('ROLASER', 'c-rolaser', 'char'),
    )
    laser_model, rolaser_model, c_rolaser_model = (
        build_encoder(*spec) for spec in encoder_specs
    )
    return laser_model, rolaser_model, c_rolaser_model
@st.cache_data(show_spinner=False)
def load_sample_data():
    """Return the default sentence pairs used to pre-fill the input form.

    Returns:
        A ``(standard, non_standard)`` tuple of two equally long lists.
        Every standard entry is the same reference sentence; each
        non-standard entry is a different respelling of it.
    """
    reference = 'See you tomorrow.'
    variants = [
        'See you t03orro3.',
        'C. U. tomorrow.',
        'sea you tomorrow.',
        'See yo utomorrow.',
        'See you tmrw.',
        'See you tkmoerow.',
        'Cu 2moro.',
        'See yow tomorrow.',
        'C. Yew tomorrow.',
        'c ya 2morrow.',
    ]
    # One copy of the reference sentence per variant (ten in total).
    standard = [reference for _ in variants]
    return standard, variants
def _paired_distances(model, std_texts, ugc_texts):
    """Return the cosine distance between the i-th standard and i-th non-standard text.

    Both lists are embedded with ``model.encode`` and passed through
    scikit-learn's ``normalize`` before the row-wise distances are computed.
    """
    std_embeddings = normalize(model.encode(std_texts))
    ugc_embeddings = normalize(model.encode(ugc_texts))
    return paired_cosine_distances(std_embeddings, ugc_embeddings)


def _render_input_form(num_pairs, sample_std, sample_ugc):
    """Draw the two-column text form and return ``(std_texts, ugc_texts)``.

    Each row is pre-filled from ``sample_std`` / ``sample_ugc`` at the same
    index, so both sample lists must hold at least ``num_pairs`` entries.
    """
    std_text_inputs = []
    ugc_text_inputs = []
    with st.form('text_input_form'):
        left_header, right_header = st.columns(2)
        left_header.write('Enter standard text here:')
        right_header.write('Enter non-standard text here:')
        for i in range(num_pairs):
            left, right = st.columns(2)
            # Labels are collapsed because the column headers above already
            # describe each side; the keys keep widget state per row.
            with left:
                std_text_inputs.append(
                    st.text_input(
                        'Enter standard text here:',
                        key=f'std{i}',
                        value=sample_std[i],
                        label_visibility='collapsed',
                    )
                )
            with right:
                ugc_text_inputs.append(
                    st.text_input(
                        'Enter non-standard text here:',
                        key=f'ugc{i}',
                        value=sample_ugc[i],
                        label_visibility='collapsed',
                    )
                )
        st.caption('*The models are case-insensitive: all text will be lowercased.*')
        st.form_submit_button('Compute')
    return std_text_inputs, ugc_text_inputs


def main():
    """Render the Streamlit page comparing LASER, RoLASER and c-RoLASER.

    The page collects up to ten standard/non-standard text pairs, computes
    the cosine distance of each pair under every model, and shows the scores
    as a grouped bar chart (per pair) and a box plot (per model).
    """
    sample_std, sample_ugc = load_sample_data()
    laser_model, rolaser_model, c_rolaser_model = load_models()

    st.title('Pairwise Cosine Distance Calculator')
    info = (
        '\n'
        ':bookmark: **Paper:** [Making Sentence Embeddings Robust to User-Generated Content (Nishimwe et al., 2024)](https://arxiv.org/abs/2403.17220)\n'
        ':link: **Github:** [https://github.com/lydianish/RoLASER](https://github.com/lydianish/RoLASER)\n'
    )
    st.markdown(info)

    st.header('Standard and Non-standard Text Input Pairs')
    # Use the middle of three columns so the number picker is centred.
    cols = st.columns(3)
    num_pairs = cols[1].number_input(
        'Number of Text Input Pairs (1-10):', min_value=1, max_value=10, value=5
    )
    std_text_inputs, ugc_text_inputs = _render_input_form(
        num_pairs, sample_std, sample_ugc
    )

    # Display name -> encoder, in the order the results are laid out.
    models = {
        'LASER': laser_model,
        'RoLASER': rolaser_model,
        'c-RoLASER': c_rolaser_model,
    }
    distances = [
        _paired_distances(model, std_text_inputs, ugc_text_inputs)
        for model in models.values()
    ]

    # Long format: one row per (model, pair); rows are grouped by model, so
    # the per-pair columns are tiled once per model.
    outputs = pd.DataFrame({
        'model': np.repeat(list(models), num_pairs),
        'pair': np.tile(np.arange(1, num_pairs + 1), len(models)),
        'ugc': np.tile(ugc_text_inputs, len(models)),
        'std': np.tile(std_text_inputs, len(models)),
        'cos': np.concatenate(distances),
    })

    st.header('Cosine Distance Scores')
    # The caption names the chart that is actually drawn below (a bar chart).
    st.caption('*This bar chart is interactive: Hover on the bars to display values. Click on the legend items to filter models.*')
    fig = px.bar(outputs, x='pair', y='cos', color='model', barmode='group', hover_data=['ugc', 'std'])
    fig.update_xaxes(title_text='Text Input Pair')
    fig.update_yaxes(title_text='Cosine Distance')
    st.plotly_chart(fig, use_container_width=True)

    st.header('Average Cosine Distance Scores')
    # Likewise, this section renders a box plot rather than a data table.
    st.caption('*This box plot is interactive: Hover on the boxes to display values.*')
    fig = px.box(outputs, x='model', y='cos', color='model')
    fig.update_xaxes(title_text='Model')
    fig.update_yaxes(title_text='Cosine Distance')
    st.plotly_chart(fig, use_container_width=True)
# Build the page only when this file is executed as the entry script.
if "__main__" == __name__:
    main()