darkproger committed on
Commit
b2c2c22
1 Parent(s): b1bc515

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -1
app.py CHANGED
@@ -1,12 +1,23 @@
1
  from datasets import load_dataset
2
  import pandas as pd
3
  import streamlit as st
 
 
4
 
# Stretch the page layout across the full browser width.
st.set_page_config(layout="wide")

# Sidebar picker for the Flores split to display.
with st.sidebar:
    subset = st.selectbox(
        'Flores eng_Latn-ukr_Cyrl subset',
        ('dev', 'devtest'),
    )

# Load the English-Ukrainian Flores pair and render the chosen split as a table.
dataset = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
selected_split = dataset[subset]
st.dataframe(pd.DataFrame(selected_split))
12
 
 
1
  from datasets import load_dataset
2
  import pandas as pd
3
  import streamlit as st
4
+ from transformers import AutoTokenizer
5
+ import matplotlib.pyplot as plt
6
 
# Use the whole browser width so the sentence table and the plots fit.
st.set_page_config(layout="wide")

# Let the user pick which Flores split to inspect.
with st.sidebar:
    subset = st.selectbox('Flores eng_Latn-ukr_Cyrl subset', ('dev', 'devtest'))

# AutoTokenizer is imported by name; the `transformers` module itself is not
# in scope, so the class is referenced directly.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
dataset = flores[subset]


def count_tokens(sentences):
    """Return the number of token ids the tokenizer produces per sentence."""
    return [len(tokenizer(sentence)['input_ids']) for sentence in sentences]


# Side-by-side histograms of per-sentence token counts for both languages.
fig, (axl, axr) = plt.subplots(1, 2, figsize=(10, 5))
axl.hist(count_tokens(dataset['sentence_eng_Latn']))
axl.set_title('eng mistral tokens')
axr.hist(count_tokens(dataset['sentence_ukr_Cyrl']))
axr.set_title('ukr mistral tokens')
# A matplotlib figure is only shown once it is handed to Streamlit.
st.pyplot(fig)

# `dataset` is already the selected split, so it must not be indexed by
# `subset` a second time.
st.dataframe(pd.DataFrame(dataset))
23