"""Streamlit page comparing Mistral token counts on Flores eng/ukr sentences.

The page lets the user pick a Flores split from the sidebar, draws one
histogram of per-sentence token counts for each language, and shows the
selected split as a table underneath.
"""
from datasets import load_dataset
import pandas as pd
import streamlit as st
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

st.set_page_config(layout="wide")

with st.sidebar:
    subset = st.selectbox('Flores eng_Latn-ukr_Cyrl subset', ('dev', 'devtest'))

# Only ``AutoTokenizer`` is imported, so it is referenced directly; the
# ``transformers`` module name itself is not bound in this file.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
dataset = flores[subset]


def _token_counts(column):
    """Return the number of tokenizer input ids for each sentence in *column*."""
    # One batched tokenizer call instead of one call per row.
    encoded = tokenizer(list(dataset[column]))
    return [len(ids) for ids in encoded['input_ids']]


fig, (axl, axr) = plt.subplots(1, 2, figsize=(10, 5))

axl.hist(_token_counts('sentence_eng_Latn'))
axl.set_title('eng mistral tokens')

axr.hist(_token_counts('sentence_ukr_Cyrl'))
axr.set_title('ukr mistral tokens')

# The figure has to be handed to Streamlit explicitly or it is never rendered.
st.pyplot(fig)

# ``dataset`` is already the selected split, so it is not indexed by
# ``subset`` a second time here.
st.dataframe(pd.DataFrame(dataset))