Spaces:

Annorita
/

tokenizer_comparison

Sleeping

File size: 1,796 Bytes

5ff29be
35996ec
5ff29be
ae7ad9f
ab98424
6893866
35996ec
 
96a0e76
ae7ad9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96a0e76
1459d42
ab98424
1459d42
ab98424
ae7ad9f
 
 
 
 
 
 
 
35996ec
ae7ad9f
 
35996ec
ae7ad9f
 
 
35996ec
ae7ad9f
 
 
1c58aa1
ae7ad9f
 
224f5e0
ae7ad9f

import streamlit as st
from utils import get_res


# ---------------------------------------------------------------------------
# Sidebar: choose the two tokenizers to compare and the sentence to tokenize.
# ---------------------------------------------------------------------------
sidebar = st.sidebar

# Model identifiers listed in the sidebar as suggestions; the first entry is
# also the pre-filled value of both model-name inputs.
model_option = [
    'deepseek-ai/deepseek-coder-1.3b-instruct',
    'MediaTek-Research/Breeze-7B-Instruct-64k-v0_1',
    'microsoft/phi-2',
]
default_model = model_option[0]

sidebar.title('Tokenizers demo')

# Free-text inputs, so any model identifier can be typed in, not only the
# suggestions listed below.
model_name_A = sidebar.text_input('Model Name A', default_model)
model_name_B = sidebar.text_input('Model Name B', default_model)

# Collapsible list of suggested model identifiers, one per line.
with sidebar.expander("Models that you might want"):
    for suggested_model in model_option:
        st.write(suggested_model)

# Sentence that is passed to both tokenizers.
sidebar.subheader('Write the input sentence', divider='grey')
input_data = sidebar.text_input('Input Sentence', 'Hello sunshine!!!')


def _render_tokenizer_column(model_name, input_sentence):
    """Render one model's tokenization result into the active Streamlit container.

    Emits, in order: a sub-header with the model name, the value returned by
    ``get_res`` rendered as markdown with HTML enabled, and the token count
    in an enlarged font.

    Args:
        model_name: Model identifier forwarded to ``get_res``.
        input_sentence: Text forwarded to ``get_res`` for tokenization.
    """
    st.subheader(model_name, divider='grey')
    res, token_num = get_res(model_name=model_name, input_sentence=input_sentence, single_print=False)

    st.subheader('Tokenized result')
    # NOTE(review): `res` is rendered with HTML enabled; presumably get_res
    # builds markup around the tokens -- confirm it escapes the user's input.
    st.markdown(res, unsafe_allow_html=True)

    st.subheader('Number of tokens')
    st.markdown(f'<span style="font-size:1.875em">{token_num}</span>',
                unsafe_allow_html=True)


# Side-by-side comparison: model A on the left, model B on the right, both
# fed the same input sentence.
col1, col2 = st.columns(2)

with col1:
    _render_tokenizer_column(model_name_A, input_data)

with col2:
    _render_tokenizer_column(model_name_B, input_data)