5w4n committed on
Commit
116574f
1 Parent(s): b1e9d2d

Move the token results out of the sidebar

Browse files
Files changed (1) hide show
  1. app.py +52 -52
app.py CHANGED
@@ -93,59 +93,59 @@ with st.sidebar:
93
  label_visibility="collapsed",
94
  )
95
 
96
- st.header("Example Text")
97
- with st.spinner("Loading example text..."):
98
- reload_example_text_data(selected_language)
99
- st.table(st.session_state.examplesdf)
100
- st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
101
 
102
- tokenizer_to_num_tokens = defaultdict(list)
103
- for _, row in tqdm.tqdm(val_data.iterrows(), total=len(val_data)):
104
- text = row["text"]
105
- for tokenizer_name in selected_tokenizers:
106
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
107
- num_tokens = len(tokenizer(text)["input_ids"])
108
- tokenizer_to_num_tokens[tokenizer_name].append(num_tokens)
109
 
110
- if selected_figure == "Boxplot":
111
- fig = go.Figure()
112
- for tokenizer_name in selected_tokenizers:
113
- fig.add_trace(
114
- go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
115
- )
116
- fig.update_layout(
117
- title=f"Distribution of Number of Tokens for Selected Tokenizers",
118
- xaxis_title="Tokenizer",
119
- yaxis_title="Number of Tokens",
120
  )
121
- st.plotly_chart(fig)
122
- elif selected_figure == "Histogram":
123
- fig = make_subplots(
124
- rows=len(selected_tokenizers), cols=1, subplot_titles=selected_tokenizers
125
- )
126
- for i, tokenizer_name in enumerate(selected_tokenizers):
127
- fig.add_trace(
128
- go.Histogram(
129
- x=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name
130
- ),
131
- row=i + 1,
132
- col=1,
133
- )
134
- fig.update_layout(
135
- height=200 * len(selected_tokenizers),
136
- title_text="Histogram of Number of Tokens",
137
- )
138
- st.plotly_chart(fig)
139
- elif selected_figure == "Scatterplot":
140
- df = pd.DataFrame(tokenizer_to_num_tokens)
141
- fig = px.scatter_matrix(
142
- df,
143
- dimensions=selected_tokenizers,
144
- color_discrete_sequence=px.colors.qualitative.Plotly,
145
- )
146
- fig.update_layout(
147
- title=f"Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
148
- width=800,
149
- height=800,
150
  )
151
- st.plotly_chart(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  label_visibility="collapsed",
94
  )
95
 
96
+ st.header("Example Text")
97
+ with st.spinner("Loading example text..."):
98
+ reload_example_text_data(selected_language)
99
+ st.table(st.session_state.examplesdf)
100
+ st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
101
 
102
+ tokenizer_to_num_tokens = defaultdict(list)
103
+ for _, row in tqdm.tqdm(val_data.iterrows(), total=len(val_data)):
104
+ text = row["text"]
105
+ for tokenizer_name in selected_tokenizers:
106
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
107
+ num_tokens = len(tokenizer(text)["input_ids"])
108
+ tokenizer_to_num_tokens[tokenizer_name].append(num_tokens)
109
 
110
+ if selected_figure == "Boxplot":
111
+ fig = go.Figure()
112
+ for tokenizer_name in selected_tokenizers:
113
+ fig.add_trace(
114
+ go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
 
 
 
 
 
115
  )
116
+ fig.update_layout(
117
+ title=f"Distribution of Number of Tokens for Selected Tokenizers",
118
+ xaxis_title="Tokenizer",
119
+ yaxis_title="Number of Tokens",
120
+ )
121
+ st.plotly_chart(fig)
122
+ elif selected_figure == "Histogram":
123
+ fig = make_subplots(
124
+ rows=len(selected_tokenizers), cols=1, subplot_titles=selected_tokenizers
125
+ )
126
+ for i, tokenizer_name in enumerate(selected_tokenizers):
127
+ fig.add_trace(
128
+ go.Histogram(
129
+ x=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name
130
+ ),
131
+ row=i + 1,
132
+ col=1,
 
 
 
 
 
 
 
 
 
 
 
 
133
  )
134
+ fig.update_layout(
135
+ height=200 * len(selected_tokenizers),
136
+ title_text="Histogram of Number of Tokens",
137
+ )
138
+ st.plotly_chart(fig)
139
+ elif selected_figure == "Scatterplot":
140
+ df = pd.DataFrame(tokenizer_to_num_tokens)
141
+ fig = px.scatter_matrix(
142
+ df,
143
+ dimensions=selected_tokenizers,
144
+ color_discrete_sequence=px.colors.qualitative.Plotly,
145
+ )
146
+ fig.update_layout(
147
+ title=f"Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
148
+ width=800,
149
+ height=800,
150
+ )
151
+ st.plotly_chart(fig)